In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Mount Google Drive at /content/gdrive (Colab-only); the pickled inputs read
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

### **Scam** data sets

In [None]:
## Root folder of the cleaned inputs on the mounted Drive.
## Change this single line to point the notebook at another copy of the data.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'
## Text columns dropped from the full frame to obtain the "vanilla" features.
TEXT_COLUMNS = ['text', 'cleaned_text', 'cleaned_text_full']

def read_input(folder, filename):
  """Unpickle `DATA_DIR/<folder>/<filename>` with pandas and return the object.

  NOTE(review): unpickling can execute arbitrary code, so only point
  DATA_DIR at files produced by this project's own preprocessing.
  """
  return pd.read_pickle(DATA_DIR + '/' + folder + '/' + filename)

## Full features
X_train_full = read_input('Full Text SCAM', 'X_train_fSC.pkl')
X_test_full = read_input('Full Text SCAM', 'X_test_fSC.pkl')

## Vanilla features
X_train_v = X_train_full.drop(TEXT_COLUMNS, axis=1)
X_test_v = X_test_full.drop(TEXT_COLUMNS, axis=1)
## The scaler is fitted on the training split only and then applied to the test split.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index=X_train_v.index, columns=X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index=X_test_v.index, columns=X_test_v.columns)

## target label
y_train = read_input('Full Text SCAM', 'y_train_fSC.pkl')
y_test = read_input('Full Text SCAM', 'y_test_fSC.pkl')

## TF-IDF with top 15 features
X_train_tfidf = read_input('TFIDF SCAM', 'X_train_tfSC.pkl')
X_test_tfidf = read_input('TFIDF SCAM', 'X_test_tfSC.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = read_input('TFIDF Glove SCAM', 'X_train_tfglSC.pkl')
X_test_tfidf_glove = read_input('TFIDF Glove SCAM', 'X_test_tfglSC.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = read_input('TFIDF CC SCAM', 'X_train_tfccSC.pkl')
X_test_tfidf_cc = read_input('TFIDF CC SCAM', 'X_test_tfccSC.pkl')

## TF-IDF InferSent with top 15 features
## (this feature set ships with its own label files, loaded alongside it)
X_train_tfidf_inf = read_input('InferSent SCAM', 'X_train_infSC.pkl')
X_test_tfidf_inf = read_input('InferSent SCAM', 'X_test_infSC.pkl')
y_train_tfidf_inf = read_input('InferSent SCAM', 'y_train_infSC.pkl')
y_test_tfidf_inf = read_input('InferSent SCAM', 'y_test_infSC.pkl')

## K-Nearest Neighbours Classifier

### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a fixed set of evaluation metrics for a binary classifier.

  Args:
    y_actual: true labels.
    y_pred: predicted hard labels, compared against `y_actual`.
    y_prob: predicted scores handed to `roc_auc_score` together with
      `y_actual` (callers in this notebook pass `predict_proba(...)[:, 1]`).

  Prints, in order: the classification report (3 decimals), the confusion
  matrix, accuracy, ROC AUC, F1 score and the false positive rate.
  Returns None.

  Raises:
    ValueError: when the confusion matrix does not have exactly four cells,
      because the `tn, fp, fn, tp` unpacking below assumes a 2x2 matrix.
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Compute the confusion matrix once; it is both printed and used for the FPR.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # ravel() flattens the 2x2 matrix row by row into tn, fp, fn, tp.
  tn, fp, fn, tp = cm.ravel()
  # False positive rate = FP / (FP + TN).
  fpr = fp/(fp+tn)
  print("False Positive Rate: " + str(fpr) + "\n")

In [None]:
# Candidate values for the KNeighborsClassifier randomized search.

# Number of neighbours consulted for each prediction.
n_neighbors = [3, 5, 7, 9]
# Weight function used when neighbours vote: 'uniform' counts every neighbour
# equally, 'distance' weights each neighbour by the inverse of its distance.
weights = ['uniform', 'distance']
# Algorithm used to compute the nearest neighbours.
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# Power parameter for the Minkowski metric (1 = Manhattan, 2 = Euclidean).
p = [1, 2]
# Number of parallel jobs for the neighbour search (-1 = all processors).
# Single-valued, so it is a fixed setting rather than something being tuned.
n_jobs = [-1]

# Create the param grid
param_test = {'n_neighbors': n_neighbors,
              'weights': weights,
              'algorithm': algorithm,
              'p': p,
              'n_jobs': n_jobs}

### Training best model for scaled Vanilla features

In [None]:
clf_1 = KNeighborsClassifier()
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            n_iter = 50,
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 1.56 s, sys: 124 ms, total: 1.68 s
Wall time: 16.8 s
Best score reached: 0.9052001910717387 with params: {'weights': 'distance', 'p': 1, 'n_neighbors': 3, 'n_jobs': -1, 'algorithm': 'brute'} 


In [None]:
# Evaluate the fitted rs_1 search on the scaled vanilla features, train split
# first and then test split. Each call is timed with %time.
# `[:, 1]` keeps the second predict_proba column (presumably the probability
# of class 1 - confirm via rs_1.classes_), which show_results uses for ROC AUC.
%time y_train_pred = rs_1.predict(X_train_scaled)
%time y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same metrics on the held-out test split.
%time y_test_pred = rs_1.predict(X_test_scaled)
%time y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 115 ms, sys: 22.2 ms, total: 137 ms
Wall time: 131 ms
CPU times: user 116 ms, sys: 2.99 ms, total: 118 ms
Wall time: 130 ms
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 72.7 ms, sys: 3.48 ms, total: 76.2 ms
Wall time: 128 ms
CPU times: user 57 ms, sys: 748 µs, total: 57.8 ms
Wall time: 116 ms
Test Results:
              precision    recall  f1-score   support

           0      0.908     0.815     0.859       265
           1      0.904     0.955     0.929       484

    accuracy                          0.905       749
   macro avg      0.906     0.885     0.894       749
we

### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = KNeighborsClassifier()
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))

CPU times: user 1.83 s, sys: 109 ms, total: 1.94 s
Wall time: 38.4 s
Best score reached: 0.9697151294076777 with params: {'weights': 'distance', 'p': 2, 'n_neighbors': 9, 'n_jobs': -1, 'algorithm': 'ball_tree'} 


In [None]:
# Evaluate the fitted rs_2 search on the TF-IDF features, train split first
# and then test split. Each call is timed with %time.
# `[:, 1]` keeps the second predict_proba column (presumably the probability
# of class 1 - confirm via rs_2.classes_), which show_results uses for ROC AUC.
%time y_train_pred = rs_2.predict(X_train_tfidf)
%time y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same metrics on the held-out test split.
%time y_test_pred = rs_2.predict(X_test_tfidf)
%time y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 1.3 s, sys: 4.71 ms, total: 1.31 s
Wall time: 711 ms
CPU times: user 1.31 s, sys: 1.25 ms, total: 1.32 s
Wall time: 710 ms
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 653 ms, sys: 1.98 ms, total: 655 ms
Wall time: 407 ms
CPU times: user 647 ms, sys: 0 ns, total: 647 ms
Wall time: 407 ms
Test Results:
              precision    recall  f1-score   support

           0      0.924     0.970     0.947       265
           1      0.983     0.957     0.970       484

    accuracy                          0.961       749
   macro avg      0.954     0.963     0.958       749
weighte

### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = KNeighborsClassifier()
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

CPU times: user 2.02 s, sys: 119 ms, total: 2.13 s
Wall time: 38.7 s
Best score reached: 0.9769584853222165 with params: {'weights': 'distance', 'p': 1, 'n_neighbors': 9, 'n_jobs': -1, 'algorithm': 'auto'} 


In [None]:
# Evaluate the fitted rs_3 search on the TF-IDF weighted GloVe features,
# train split first and then test split. Each call is timed with %time.
# `[:, 1]` keeps the second predict_proba column (presumably the probability
# of class 1 - confirm via rs_3.classes_), which show_results uses for ROC AUC.
%time y_train_pred = rs_3.predict(X_train_tfidf_glove)
%time y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same metrics on the held-out test split.
%time y_test_pred = rs_3.predict(X_test_tfidf_glove)
%time y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 1.9 s, sys: 4.93 ms, total: 1.9 s
Wall time: 1.01 s
CPU times: user 2.47 s, sys: 3.13 ms, total: 2.47 s
Wall time: 1.41 s
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 959 ms, sys: 0 ns, total: 959 ms
Wall time: 609 ms
CPU times: user 992 ms, sys: 1.75 ms, total: 994 ms
Wall time: 608 ms
Test Results:
              precision    recall  f1-score   support

           0      0.970     0.966     0.968       265
           1      0.981     0.983     0.982       484

    accuracy                          0.977       749
   macro avg      0.976     0.975     0.975       749
weighted

### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = KNeighborsClassifier()
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

CPU times: user 1.82 s, sys: 136 ms, total: 1.95 s
Wall time: 38 s
Best score reached: 0.9848553934340802 with params: {'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'algorithm': 'kd_tree'} 


In [None]:
# Evaluate the fitted rs_4 search on the TF-IDF weighted FastText (cc)
# features, train split first and then test split. Each call is timed with %time.
# `[:, 1]` keeps the second predict_proba column (presumably the probability
# of class 1 - confirm via rs_4.classes_), which show_results uses for ROC AUC.
%time y_train_pred = rs_4.predict(X_train_tfidf_cc)
%time y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same metrics on the held-out test split.
%time y_test_pred = rs_4.predict(X_test_tfidf_cc)
%time y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 1.85 s, sys: 2.67 ms, total: 1.85 s
Wall time: 1.01 s
CPU times: user 2.03 s, sys: 9.48 ms, total: 2.03 s
Wall time: 1.13 s
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 907 ms, sys: 2.48 ms, total: 909 ms
Wall time: 507 ms
CPU times: user 908 ms, sys: 5.21 ms, total: 913 ms
Wall time: 506 ms
Test Results:
              precision    recall  f1-score   support

           0      0.977     0.955     0.966       265
           1      0.976     0.988     0.982       484

    accuracy                          0.976       749
   macro avg      0.976     0.971     0.974       749
wei

### Training best model for Top 15 Features + TF-IDF InferSent

In [None]:
clf_5 = KNeighborsClassifier()
rs_5 = RandomizedSearchCV(estimator=clf_5, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_5.fit(X_train_tfidf_inf, y_train_tfidf_inf)
print('Best score reached: {} with params: {} '.format(rs_5.best_score_, rs_5.best_params_))

CPU times: user 3.42 s, sys: 315 ms, total: 3.73 s
Wall time: 6min 6s
Best score reached: 0.7705806388733218 with params: {'weights': 'uniform', 'p': 2, 'n_neighbors': 9, 'n_jobs': -1, 'algorithm': 'brute'} 


In [None]:
# Evaluate the fitted rs_5 search on the InferSent features, train split first
# and then test split. Each call is timed with %time.
# This feature set is scored against its own labels (y_*_tfidf_inf) rather
# than y_train / y_test.
# `[:, 1]` keeps the second predict_proba column (presumably the probability
# of class 1 - confirm via rs_5.classes_), which show_results uses for ROC AUC.
%time y_train_pred = rs_5.predict(X_train_tfidf_inf)
%time y_train_prob = rs_5.predict_proba(X_train_tfidf_inf)[:, 1]
print("Train Results:")
show_results(y_train_tfidf_inf, y_train_pred, y_train_prob)

# Same metrics on the held-out test split.
%time y_test_pred = rs_5.predict(X_test_tfidf_inf)
%time y_test_prob = rs_5.predict_proba(X_test_tfidf_inf)[:, 1]
print("Test Results:")
show_results(y_test_tfidf_inf, y_test_pred, y_test_prob)

CPU times: user 1.26 s, sys: 152 ms, total: 1.41 s
Wall time: 761 ms
CPU times: user 1.18 s, sys: 135 ms, total: 1.32 s
Wall time: 759 ms
Train Results:
              precision    recall  f1-score   support

           0      0.860     0.503     0.635       453
           1      0.808     0.962     0.878       981

    accuracy                          0.817      1434
   macro avg      0.834     0.733     0.757      1434
weighted avg      0.824     0.817     0.801      1434

[[228 225]
 [ 37 944]]
Accuracy: 0.8172942817294282
AUC_ROC: 0.8815530397643527
f1 score: 0.8781395348837209
False Postive Rate: 0.13962264150943396

CPU times: user 625 ms, sys: 132 ms, total: 758 ms
Wall time: 464 ms
CPU times: user 611 ms, sys: 121 ms, total: 732 ms
Wall time: 435 ms
Test Results:
              precision    recall  f1-score   support

           0      0.701     0.336     0.455       223
           1      0.753     0.934     0.834       484

    accuracy                          0.745       707
