In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB   
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

## Data prepcessing

In [4]:
data = pd.read_csv('clean_data.csv')

data_sample = data
# replace URL with 0 or 1
data_sample['URL'] = data_sample['URL'].replace(np.nan, 0)
data_sample['URL'][data_sample.URL != 0] = 1

# nr of URL and non-URL
print(len(data_sample['URL'][data_sample.URL == 1]),len(data_sample['URL'][data_sample.URL == 0]))

6614 49412


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sample['URL'][data_sample.URL != 0] = 1


In [5]:
label = data_sample['URL']
text =data_sample['Text']

## Fast Text + Embedding

In [198]:
# Fast Text (read on the link)
path_fastText = '/Users/vilhelmbureviksandberg/Downloads/wiki-news-300d-1M.vec'

dictionary = open(path_fastText, 'r', encoding='utf-8',
                  newline='\n', errors='ignore')

embeds = {}
for line in dictionary:
    tokens = line.rstrip().split(' ')
    embeds[tokens[0]] = [float(x) for x in tokens[1:]]
    
    if len(embeds) == 100000:
        break

In [199]:
# The embedding, takes along time to run for large documents 
from keras.preprocessing.text import text_to_word_sequence
array_length = 20 * 300
embedding_features = pd.DataFrame()
for document in text:
    # Saving the first 20 words of the document as a sequence
    words = text_to_word_sequence(document)[0:20] 
    
    # Retrieving the vector representation of each word and 
    # appending it to the feature vector 
    feature_vector = []
    for word in words:
        try:
            feature_vector = np.append(feature_vector, 
                                       np.array(embeds[word]))
        except KeyError:
            # In the event that a word is not included in our 
            # dictionary skip that word
            pass
    # If the text has less then 20 words, fill remaining vector with
    # zeros
    zeroes_to_add = array_length - len(feature_vector)
    feature_vector = np.append(feature_vector, 
                               np.zeros(zeroes_to_add)
                               ).reshape((1,-1))
    
    # Append the document feature vector to the feature table
    embedding_features = embedding_features.append( 
                                     pd.DataFrame(feature_vector))



In [6]:
# Converting the labels from strings to binary
le = LabelEncoder()
le.fit(label)
label = le.transform(label)

In [203]:
# Taking 70/30 train test split
X_train, X_test, y_train, y_test = train_test_split(embedding_features, label, test_size=0.33, random_state=0)

In [170]:
#embedding_features

## Training algorithms

In [135]:
def train_models(X_train,y_train,model):
    
    if model == 'naive_bayes':
        param_grid = [{'alpha':[10e-5,10e-3,10e-1,1]}]
        kfold = StratifiedKFold(n_splits=10,shuffle = True, random_state = 1)
        grid = GridSearchCV(BernoulliNB(),param_grid,cv=kfold,scoring='accuracy')
        grid.fit(X_train,y_train)
        grid_values = list(grid.best_params_.values())
        clf = BernoulliNB(alpha = grid_values[0])
        clf = clf.fit(X_train,y_train)
        return clf
    

    if model == 'Linear_SGD_classifier':
        param_grid = [{'alpha':[0.0001,0.001,0.01,0.1,1]}]
        kfold = StratifiedKFold(n_splits=10,shuffle = True, random_state = 1)
        grid = GridSearchCV(SGDClassifier(loss = 'squared_loss'),param_grid,cv=kfold,scoring='accuracy')
        grid.fit(X_train,y_train)
        grid_values = list(grid.best_params_.values())   
        clf = SGDClassifier(loss = 'squared_loss',alpha = grid_values[0])
        clf = clf.fit(X_train,y_train)
        return clf
    
    if model == 'support_vector_machine':
        param_grid = [{'C':[0.001, 0.01, 0.1,1,5,100],'gamma':[0.001,0.01,0.1,1,5,15]}]
        kfold = StratifiedKFold(n_splits=10,shuffle = True, random_state = 1)
        grid = GridSearchCV(SVC(), param_grid=param_grid, cv=kfold,scoring='accuracy')
        grid.fit(X_train,y_train)
        grid_values = list(grid.best_params_.values())   
        clf = SVC(C=grid_values[1],gamma=grid_values[0], probability = True)
        clf = clf.fit(X_train,y_train)
        return clf

In [None]:
# Naive Bayes
embeded_model1 = train_models(X_train,y_train,'naive_bayes')
embeded_prediction1 = embeded_model1.predict(embedding_features)

#SGD
embeded_model2 = train_models(X_train,y_train,'Linear_SGD_classifier')
embeded_prediction2 = embeded_model2.predict(embedding_features)

In [186]:
#results = pd.DataFrame(index = ['Word Embedding'], 
#          columns = ['Precision', 'Recall', 'F1 score', 'support']
#          )
#results.loc['Word Embedding'] = precision_recall_fscore_support(
#        embedding_features,
#          embeded_prediction,
#        average = 'binary')

In [192]:
data['Prediction'] = embeded_prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prediction'] = embeded_prediction


In [194]:
# Save to file 
data.to_csv("Problem1.csv",encoding='utf-8-sig')

## Some extra evaluation metrics 

In [90]:
#Function that create the perfomance table for training models 
def performance_table(X_train,y_train):
    acc_list = []
    clf_nb = train_models(X_train,y_train,'naive_bayes')
    acc_nb = round(clf_nb.score(X_train,y_train) * 100, 2)
    
    clf_sgd = train_models(X_train,y_train,'Linear_SGD_classifier')
    acc_sgd = round(clf_sgd.score(X_train,y_train) * 100, 2)   
    
    clf_svm = train_models(X_train,y_train,'support_vector_machine')
    acc_svm = round(clf_svm.score(X_train,y_train) * 100, 2)
    
    acc_list.extend([acc_nb,acc_random_forest,acc_svm])
    return acc_list