In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB   
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import spacy

  from numpy.core.umath_tests import inner1d


In [2]:
data = pd.read_csv('Problem1_processed.csv')

data_sample = data
# replace URL with 0 or 1
data_sample['URL'] = data_sample['URL'].replace(np.nan, 0)
data_sample['URL'][data_sample.URL != 0] = 1

# nr of URL and non-URL
print(len(data_sample['URL'][data_sample.URL == 1]),len(data_sample['URL'][data_sample.URL == 0]))

1895 19057


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [3]:
label = data_sample['URL']
text =data_sample['Text']

In [4]:
# Fast Text (read on the link)
def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r',encoding='utf-8-sig')
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    gloveModel['<unk>'] = np.mean(list(gloveModel.values()),axis=0)
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [5]:
embeds = loadGloveModel('glove.6B.50d.txt')

Loading Glove Model
400001  words loaded!


In [6]:
# The embedding, takes along time to run for large documents 

embedding_features = []
nlp = spacy.load('en_core_web_sm')

for document in text:
    # Saving the first 20 words of the document as a sequence
    doc = nlp(document)
    words =[w.text for w in doc]
    
    # Retrieving the vector representation of each word and 
    # appending it to the feature vector 
    feature_vector_list = []
    for word in words:
        if word not in embeds:
            word = '<unk>'
        feature_vector_list.append(embeds[word])
    feature_vector =np.mean(feature_vector_list, axis=0)

    
    # Append the document feature vector to the feature table
    embedding_features.append(feature_vector)
embedding_features=np.array(embedding_features)



In [7]:
feature_vector.shape

(50,)

In [8]:
le = LabelEncoder()
le.fit(label)
label = le.transform(label)
label

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
label.min()

0

In [10]:
embedding_features.shape

(20952, 50)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(embedding_features, label, test_size=0.10)

In [12]:
X_train

array([[ 0.09091502,  0.15895035,  0.0542306 , ..., -0.26448984,
        -0.08347936, -0.07511647],
       [ 0.24771835,  0.03476347,  0.08761458, ...,  0.1065644 ,
         0.01201645,  0.02241294],
       [-0.25807012,  0.06400631,  0.26416082, ..., -0.20257393,
         0.1190221 ,  0.04141369],
       ...,
       [ 0.12413298,  0.04472905,  0.02790423, ..., -0.1328575 ,
         0.02196264,  0.14177058],
       [-0.32619625,  0.39232217, -0.1823737 , ...,  0.02879713,
         0.01330402,  0.61869329],
       [ 0.27090689,  0.02902627,  0.04390984, ...,  0.08822978,
        -0.08499362,  0.02283533]])

In [13]:
X_train.shape

(18856, 50)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
model1=LinearRegression()
model1.fit(X_train, y_train)
model1.score(X_test,y_test)

0.04841002336008948

In [15]:
model1.predict(X_test)

array([0.2362853 , 0.20960671, 0.03394334, ..., 0.15005467, 0.05052993,
       0.0755353 ])

In [16]:
from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression(class_weight='balanced')
model2.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
y_pred=model2.predict(X_test)

In [18]:
model2.score(X_test,y_test)

0.6479007633587787

In [19]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.24229979466119098

In [20]:
from sklearn.tree import DecisionTreeClassifier
model3 = DecisionTreeClassifier(class_weight='balanced')
model3.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
y_pred=model3.predict(X_test)
model3.score(X_test,y_test)

0.8602099236641222

In [22]:
f1_score(y_test, y_pred)

0.2765432098765432

In [23]:
from sklearn.ensemble import RandomForestClassifier
model4 = RandomForestClassifier(max_depth=3,class_weight='balanced',criterion='entropy',n_estimators=500,n_jobs=8)
model4.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=3, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=8, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [24]:
y_pred=model4.predict(X_test)
y_train_pred=model4.predict(X_train)
model4.score(X_test,y_test)

0.6607824427480916

In [25]:
y_pred

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [26]:
f1_score(y_train, y_train_pred)

0.24873341375150784

In [27]:
f1_score(y_test, y_pred)

0.22968580715059583

In [28]:
X_train_mlp=X_train
y_train_mlp=y_train
for i in range(9):
    X_train_mlp=np.append(X_train_mlp,X_train[y_train==1],axis=0)
    y_train_mlp=np.append(y_train_mlp,y_train[y_train==1],axis=0)

In [29]:
indices=np.arange(len(y_train_mlp))
np.random.shuffle(indices)
X_train_mlp_s=X_train_mlp[indices]
y_train_mlp_s=y_train_mlp[indices]

In [30]:
from sklearn.neural_network import MLPClassifier
model5 = MLPClassifier()
model5.fit(X_train_mlp_s, y_train_mlp_s)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [31]:
y_pred=model5.predict(X_test)
y_train_pred=model5.predict(X_train_mlp_s)
model5.score(X_test,y_test)

0.8306297709923665

In [32]:
f1_score(y_train_mlp_s, y_train_pred)

0.9146912704045422

In [33]:
f1_score(y_test, y_pred)

0.35336976320582875

In [34]:
model6 = BernoulliNB(alpha = 0.01)
model6.fit(X_train_mlp_s, y_train_mlp_s)

BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)

In [35]:
y_pred=model6.predict(X_test)
y_train_pred=model6.predict(X_train_mlp_s)
model6.score(X_test,y_test)

0.6650763358778626

In [36]:
f1_score(y_test, y_pred)

0.2022727272727273