In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive so the pickled
# data sets read later in this notebook (all under /content/gdrive/My Drive/...)
# are reachable from the runtime.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

### **Spam** data sets

In [None]:
## Root folder (on the mounted Google Drive) that holds every cleaned input data set.
## Point the notebook at a different copy of the data by changing this one constant.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'

## Free-text columns that are removed to obtain the purely numeric "vanilla" features.
TEXT_COLUMNS = ['text', 'cleaned_text', 'cleaned_text_full']

## NOTE: pd.read_pickle can execute arbitrary code while unpickling,
## so only load .pkl files that come from a trusted source.

## Full features
X_train_full = pd.read_pickle(DATA_DIR + '/Full Text SPAM/X_train_fSP.pkl')
X_test_full = pd.read_pickle(DATA_DIR + '/Full Text SPAM/X_test_fSP.pkl')

## Vanilla features
X_train_v = X_train_full.drop(TEXT_COLUMNS, axis=1)
X_test_v = X_test_full.drop(TEXT_COLUMNS, axis=1)

## Min-max scale the vanilla features. The scaler is fitted on the training split only
## and then re-used on the test split, so no test-set statistics leak into training.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = pd.read_pickle(DATA_DIR + '/Full Text SPAM/y_train_fSP.pkl')
y_test = pd.read_pickle(DATA_DIR + '/Full Text SPAM/y_test_fSP.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle(DATA_DIR + '/TFIDF SPAM/X_train_tfSP.pkl')
X_test_tfidf = pd.read_pickle(DATA_DIR + '/TFIDF SPAM/X_test_tfSP.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle(DATA_DIR + '/TFIDF Glove SPAM/X_train_tfglSP.pkl')
X_test_tfidf_glove = pd.read_pickle(DATA_DIR + '/TFIDF Glove SPAM/X_test_tfglSP.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle(DATA_DIR + '/TFIDF CC SPAM/X_train_tfccSP.pkl')
X_test_tfidf_cc = pd.read_pickle(DATA_DIR + '/TFIDF CC SPAM/X_test_tfccSP.pkl')

## TF-IDF InferSent with top 15 features - InferSent dataset too large to run with KNN
# X_train_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/X_train_infSP.pkl')
# X_test_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/X_test_infSP.pkl')
# y_train_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/y_train_infSP.pkl')
# y_test_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/y_test_infSP.pkl')

## K-Nearest Neighbours Classifier

### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a fixed set of binary-classification metrics for one data split.

  Prints, in order: the classification report (3 decimal places), the
  confusion matrix, accuracy, ROC AUC, F1 score and the false positive rate.

  Args:
    y_actual: true labels of the split.
    y_pred: predicted labels, aligned with ``y_actual``.
    y_prob: score for the positive class, passed to ``roc_auc_score``
      (callers in this notebook pass column 1 of ``predict_proba``).

  Returns:
    None. All results are written to standard output.

  Note:
    The confusion matrix is unpacked into exactly four cells, so this
    helper only works for two-class problems.
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Build the confusion matrix once and reuse it for both the printout
  # and the false-positive-rate calculation.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # Row-major ravel of a 2x2 matrix gives (tn, fp, fn, tp); only the
  # negative-class row is needed for the false positive rate.
  tn, fp, _, _ = cm.ravel()
  fpr = fp/(fp+tn)
  print("False Positive Rate: " + str(fpr) + "\n")

In [None]:
# Candidate values for every KNeighborsClassifier setting explored by the
# randomised search. Each list below is one axis of the search space.

# k: how many nearest neighbours take part in each prediction.
n_neighbors = [3, 5, 7, 9]
# Vote weighting used in prediction: 'uniform' counts every neighbour equally,
# 'distance' gives closer neighbours more influence.
weights = ['uniform', 'distance']
# Algorithm used to compute the nearest neighbours.
# NOTE(review): this presumably affects runtime rather than the predictions
# themselves - confirm before spending search iterations on it.
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# Power parameter of the Minkowski metric (1 = Manhattan, 2 = Euclidean).
p = [1, 2]
# Parallel jobs for the neighbour search (-1 = all processors). A single
# value, so it is held fixed rather than tuned.
n_jobs = [-1]

# Assemble the search space; keys are the estimator's parameter names.
param_test = dict(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algorithm,
    p=p,
    n_jobs=n_jobs,
)

### Training best model for scaled Vanilla features

In [None]:
clf_1 = KNeighborsClassifier()
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            n_iter = 50,
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))



CPU times: user 4.43 s, sys: 662 ms, total: 5.09 s
Wall time: 8min 15s
Best score reached: 0.8326449100368054 with params: {'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'algorithm': 'brute'} 


In [None]:
# Evaluate the tuned search object `rs_1` on the scaled vanilla features.
# Each %time line reports how long one prediction pass takes.
# Column 1 of predict_proba is used as the positive-class score
# (assumes the classes are ordered [0, 1], as in the printed reports - TODO confirm).
# NOTE(review): the recorded best params use weights='distance'; each training row is
# then presumably its own zero-distance neighbour, so train metrics are optimistic.

# --- training split ---
%time y_train_pred = rs_1.predict(X_train_scaled)
%time y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# --- held-out test split ---
%time y_test_pred = rs_1.predict(X_test_scaled)
%time y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 26.2 s, sys: 864 ms, total: 27 s
Wall time: 18.6 s
CPU times: user 25.8 s, sys: 21 ms, total: 25.9 s
Wall time: 18.1 s
Train Results:
              precision    recall  f1-score   support

           0      0.998     1.000     0.999     12462
           1      1.000     0.997     0.998     10059

    accuracy                          0.999     22521
   macro avg      0.999     0.998     0.999     22521
weighted avg      0.999     0.999     0.999     22521

[[12460     2]
 [   31 10028]]
Accuracy: 0.9985347009457839
AUC_ROC: 0.9999950859659991
f1 score: 0.9983573099706308
False Postive Rate: 0.00016048788316482104

CPU times: user 12.9 s, sys: 16.6 ms, total: 12.9 s
Wall time: 9.14 s
CPU times: user 12.5 s, sys: 25.1 ms, total: 12.5 s
Wall time: 8.74 s
Test Results:
              precision    recall  f1-score   support

           0      0.842     0.879     0.860      6138
           1      0.841     0.796     0.818      4955

    accuracy                          0.842 

### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = KNeighborsClassifier()
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



In [None]:
# Evaluate the tuned search object `rs_2` on the top-15 + TF-IDF features.
# Each %time line reports how long one prediction pass takes.
# Column 1 of predict_proba is used as the positive-class score
# (assumes the classes are ordered [0, 1], as in the printed reports - TODO confirm).
# NOTE(review): predict and predict_proba each run their own pass over the data,
# and the recorded timings show each pass taking minutes on this feature set.

# --- training split ---
%time y_train_pred = rs_2.predict(X_train_tfidf)
%time y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# --- held-out test split ---
%time y_test_pred = rs_2.predict(X_test_tfidf)
%time y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 20min 35s, sys: 1.08 s, total: 20min 36s
Wall time: 10min 36s
CPU times: user 20min 4s, sys: 613 ms, total: 20min 5s
Wall time: 10min 17s
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12462     0]
 [    1 10058]]
Accuracy: 0.9999555969983571
AUC_ROC: 0.999999996011336
f1 score: 0.9999502907988269
False Postive Rate: 0.0

CPU times: user 10min 10s, sys: 292 ms, total: 10min 11s
Wall time: 5min 11s
CPU times: user 10min 9s, sys: 243 ms, total: 10min 9s
Wall time: 5min 10s
Test Results:
              precision    recall  f1-score   support

           0      0.902     0.681     0.777      6138
           1      0.697     0.909     0.789      4955

    accuracy                     

### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = KNeighborsClassifier()
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))



CPU times: user 30.3 s, sys: 3.46 s, total: 33.8 s
Wall time: 1h 48min 31s
Best score reached: 0.9087963443026595 with params: {'weights': 'distance', 'p': 2, 'n_neighbors': 3, 'n_jobs': -1, 'algorithm': 'kd_tree'} 


In [None]:
# Evaluate the tuned search object `rs_3` on the top-15 + TF-IDF-weighted GloVe features.
# Each %time line reports how long one prediction pass takes.
# Column 1 of predict_proba is used as the positive-class score
# (assumes the classes are ordered [0, 1], as in the printed reports - TODO confirm).
# NOTE(review): the recorded best params use weights='distance'; each training row is
# then presumably its own zero-distance neighbour, so train metrics are optimistic.

# --- training split ---
%time y_train_pred = rs_3.predict(X_train_tfidf_glove)
%time y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# --- held-out test split ---
%time y_test_pred = rs_3.predict(X_test_tfidf_glove)
%time y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 5min 19s, sys: 7.96 ms, total: 5min 19s
Wall time: 2min 41s
CPU times: user 5min 18s, sys: 123 ms, total: 5min 18s
Wall time: 2min 41s
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12462     0]
 [    0 10059]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 2min 46s, sys: 82 ms, total: 2min 46s
Wall time: 1min 24s
CPU times: user 2min 48s, sys: 75.8 ms, total: 2min 48s
Wall time: 1min 25s
Test Results:
              precision    recall  f1-score   support

           0      0.892     0.953     0.922      6138
           1      0.936     0.858     0.895      4955

    accuracy                          0.910     11093
   macro avg      0.914     

### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = KNeighborsClassifier()
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))



CPU times: user 17.4 s, sys: 1.76 s, total: 19.2 s
Wall time: 1h 54min 15s
Best score reached: 0.9380136635236553 with params: {'weights': 'distance', 'p': 2, 'n_neighbors': 3, 'n_jobs': -1, 'algorithm': 'kd_tree'} 


In [None]:
# Evaluate the tuned search object `rs_4` on the top-15 + TF-IDF-weighted FastText (cc) features.
# Each %time line reports how long one prediction pass takes.
# Column 1 of predict_proba is used as the positive-class score
# (assumes the classes are ordered [0, 1], as in the printed reports - TODO confirm).
# NOTE(review): the recorded best params use weights='distance'; each training row is
# then presumably its own zero-distance neighbour, so train metrics are optimistic.

# --- training split ---
%time y_train_pred = rs_4.predict(X_train_tfidf_cc)
%time y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# --- held-out test split ---
%time y_test_pred = rs_4.predict(X_test_tfidf_cc)
%time y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 6min 48s, sys: 217 ms, total: 6min 48s
Wall time: 3min 27s
CPU times: user 6min 42s, sys: 181 ms, total: 6min 42s
Wall time: 3min 24s
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12462     0]
 [    0 10059]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 3min 19s, sys: 95.4 ms, total: 3min 19s
Wall time: 1min 41s
CPU times: user 3min 17s, sys: 75.7 ms, total: 3min 17s
Wall time: 1min 39s
Test Results:
              precision    recall  f1-score   support

           0      0.936     0.958     0.947      6138
           1      0.947     0.919     0.933      4955

    accuracy                          0.941     11093
   macro avg      0.941    

### Training best model for Top 15 Features + TF-IDF InferSent (not run: data set too large for KNN)

In [None]:
# DISABLED: the InferSent feature set was too large to run with KNN, so this search
# is kept only for reference. Before re-enabling it, also uncomment the matching
# X_/y_*_tfidf_inf loads in the data-loading cell.
# clf_5 = KNeighborsClassifier()
# rs_5 = RandomizedSearchCV(estimator=clf_5, 
#                       param_distributions = param_test,  
#                       n_iter = 50,
#                       refit = True,
#                       random_state = 1,
#                       n_jobs = -1
#                       )

# %time rs_5.fit(X_train_tfidf_inf, y_train_tfidf_inf)
# print('Best score reached: {} with params: {} '.format(rs_5.best_score_, rs_5.best_params_))

In [None]:
# DISABLED: evaluation of the InferSent search (`rs_5`), which is itself commented out
# because the InferSent feature set was too large to run with KNN.
# Note that it uses its own label files (y_*_tfidf_inf), not y_train / y_test.
# %time y_train_pred = rs_5.predict(X_train_tfidf_inf)
# %time y_train_prob = rs_5.predict_proba(X_train_tfidf_inf)[:, 1]
# print("Train Results:")
# show_results(y_train_tfidf_inf, y_train_pred, y_train_prob)

# %time y_test_pred = rs_5.predict(X_test_tfidf_inf)
# %time y_test_prob = rs_5.predict_proba(X_test_tfidf_inf)[:, 1]
# print("Test Results:")
# show_results(y_test_tfidf_inf, y_test_pred, y_test_prob)