In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250

import machine_learning as ml
from preprocessing import separate_features_target

from ast import literal_eval

In [2]:
# Feature CSVs live under <working directory>/data/csv/.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# File names per feature set. In each list, index 0 is the balanced file and
# index 1 is the imbalanced one.
train = dict(
    tfidf_sel=[
        'tfidf_chi2_train_balanced.csv',
        'tfidf_chi2_train_imbalanced.csv',
    ],
    word2vec=[
        'word2vec_train_balanced.csv',
        'word2vec_train_imbalanced.csv',
    ],
)
test = dict(
    tfidf_sel=[
        'tfidf_chi2_test_balanced.csv',
        'tfidf_chi2_test_imbalanced.csv',
    ],
    word2vec=[
        'word2vec_test_balanced.csv',
        'word2vec_test_imbalanced.csv',
    ],
)

# Balanced Dataset

## Train TF-IDF

In [3]:
# Read the balanced TF-IDF train/test CSVs (index 0 of each file list);
# the first CSV column is used as the DataFrame index.
tfidf_train_balanced_complete = pd.read_csv(
    os.path.join(csv_path, train['tfidf_sel'][0]),
    index_col=0,
)
tfidf_test_balanced_complete = pd.read_csv(
    os.path.join(csv_path, test['tfidf_sel'][0]),
    index_col=0,
)

In [4]:
# Split each complete table with separate_features_target; the result is
# indexed below with the keys 'features' and 'target'.
tfidf_train_balanced = separate_features_target(
    tfidf_train_balanced_complete
)
tfidf_test_balanced = separate_features_target(
    tfidf_test_balanced_complete
)

### Logistic Regression

In [5]:
# Train logistic regression on the balanced TF-IDF training split.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `lr_tfidf_balanced` only ever refers to the model and
# the unpacking line can be re-run on its own.
lr_tfidf_balanced_fit = ml.train_logistic_regression(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)
lr_tfidf_balanced = lr_tfidf_balanced_fit['model']
lr_tfidf_balanced_scaler = lr_tfidf_balanced_fit['scaler']

Train accuracy: 0.9974016332590943


### Decision Tree

In [6]:
# Train a decision tree on the balanced TF-IDF training split; the returned
# object is used directly as the model when results are computed.
dt_tfidf_balanced = ml.train_decision_tree(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9561989606533037


### Random Forest

In [7]:
# Train a random forest on the balanced TF-IDF training split; the returned
# object is used directly as the model when results are computed.
rf_tfidf_balanced = ml.train_random_forest(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9450631031922792


### Gradient Boosting Tree

In [8]:
# Train gradient boosting on the balanced TF-IDF training split; the returned
# object is used directly as the model when results are computed.
gb_tfidf_balanced = ml.train_gradient_boost(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9832962138084632


### Naive Bayes

In [9]:
# Train Naive Bayes on the balanced TF-IDF training split.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `nb_tfidf_balanced` only ever refers to the model and
# the unpacking line can be re-run on its own.
nb_tfidf_balanced_fit = ml.train_naive_bayes(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)
nb_tfidf_balanced = nb_tfidf_balanced_fit['model']
nb_tfidf_balanced_scaler = nb_tfidf_balanced_fit['scaler']

Train accuracy: 0.9758723088344469


## Train Word2Vec

In [10]:
# Read the balanced Word2Vec train/test CSVs (index 0 of each file list);
# the first CSV column is used as the DataFrame index.
word2vec_train_balanced_complete = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][0]),
    index_col=0,
)
word2vec_test_balanced_complete = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][0]),
    index_col=0,
)

In [11]:
# Split each complete table with separate_features_target; the result is
# indexed below with the keys 'features' and 'target'.
word2vec_train_balanced = separate_features_target(
    word2vec_train_balanced_complete
)
word2vec_test_balanced = separate_features_target(
    word2vec_test_balanced_complete
)

### Logistic Regression

In [12]:
# Train logistic regression on the balanced Word2Vec training split.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `lr_word2vec_balanced` only ever refers to the model
# and the unpacking line can be re-run on its own.
lr_word2vec_balanced_fit = ml.train_logistic_regression(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)
lr_word2vec_balanced = lr_word2vec_balanced_fit['model']
lr_word2vec_balanced_scaler = lr_word2vec_balanced_fit['scaler']

Train accuracy: 0.9925760950259837


### Decision Tree

In [13]:
# Train a decision tree on the balanced Word2Vec training split; the returned
# object is used directly as the model when results are computed.
dt_word2vec_balanced = ml.train_decision_tree(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.985894580549369


### Random Forest

In [14]:
# Train a random forest on the balanced Word2Vec training split; the returned
# object is used directly as the model when results are computed.
rf_word2vec_balanced = ml.train_random_forest(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9851521900519673


### Gradient Boosting Tree

In [15]:
# Train gradient boosting on the balanced Word2Vec training split; the
# returned object is used directly as the model when results are computed.
gb_word2vec_balanced = ml.train_gradient_boost(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9970304380103935


### Naive Bayes

In [16]:
# Train Naive Bayes on the balanced Word2Vec training split.
# remove_negatives=True is passed only for the Word2Vec fits (the TF-IDF
# Naive Bayes fits omit it) -- presumably because embedding values can be
# negative; TODO confirm against ml.train_naive_bayes.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `nb_word2vec_balanced` only ever refers to the model
# and the unpacking line can be re-run on its own.
nb_word2vec_balanced_fit = ml.train_naive_bayes(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)
nb_word2vec_balanced = nb_word2vec_balanced_fit['model']
nb_word2vec_balanced_scaler = nb_word2vec_balanced_fit['scaler']

Train accuracy: 0.9643652561247216


## Results

### TF-IDF

In [17]:
# Balanced TF-IDF models and their display names, in matching order.
models = [
    lr_tfidf_balanced,
    dt_tfidf_balanced,
    rf_tfidf_balanced,
    gb_tfidf_balanced,
    nb_tfidf_balanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [18]:
# Score every balanced TF-IDF model on the balanced TF-IDF test split.
# The two scalers are positional: logistic-regression scaler first, then the
# Naive Bayes scaler.
results_tfidf = ml.multi_model_results(
    models,
    names,
    tfidf_test_balanced['features'],
    tfidf_test_balanced['target'],
    lr_tfidf_balanced_scaler,
    nb_tfidf_balanced_scaler,
)

In [19]:
# Show the balanced TF-IDF test metrics (one row per model).
results_tfidf

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.959941,0.954674,0.968391,0.961484,0.04908,0.031609,0.978246
Decision Tree,0.936202,0.92011,0.95977,0.939522,0.088957,0.04023,0.938196
Random Forest,0.919881,0.953704,0.887931,0.919643,0.046012,0.112069,0.984817
Gradient Boosting Tree,0.954006,0.946479,0.965517,0.955903,0.058282,0.034483,0.995121
Naive Bayes,0.976261,0.963687,0.991379,0.977337,0.039877,0.008621,0.996201


### Word2Vec

In [20]:
# Balanced Word2Vec models and their display names, in matching order.
models = [
    lr_word2vec_balanced,
    dt_word2vec_balanced,
    rf_word2vec_balanced,
    gb_word2vec_balanced,
    nb_word2vec_balanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [21]:
# Score every balanced Word2Vec model on the balanced Word2Vec test split.
# The two scalers are positional: logistic-regression scaler first, then the
# Naive Bayes scaler.
results_word2vec = ml.multi_model_results(
    models,
    names,
    word2vec_test_balanced['features'],
    word2vec_test_balanced['target'],
    lr_word2vec_balanced_scaler,
    nb_word2vec_balanced_scaler,
)

In [22]:
# Show the balanced Word2Vec test metrics (one row per model).
results_word2vec

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.977745,0.982609,0.974138,0.978355,0.018405,0.025862,0.996324
Decision Tree,0.952522,0.961988,0.945402,0.953623,0.039877,0.054598,0.939466
Random Forest,0.968843,0.965812,0.974138,0.969957,0.03681,0.025862,0.995081
Gradient Boosting Tree,0.977745,0.977077,0.979885,0.978479,0.02454,0.020115,0.996113
Naive Bayes,0.962908,0.965418,0.962644,0.964029,0.03681,0.037356,0.987254


# Imbalanced Dataset

## Train TF-IDF

In [23]:
# Read the imbalanced TF-IDF train/test CSVs (index 1 of each file list);
# the first CSV column is used as the DataFrame index.
tfidf_train_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, train['tfidf_sel'][1]),
    index_col=0,
)
tfidf_test_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, test['tfidf_sel'][1]),
    index_col=0,
)

In [24]:
# Split each complete table with separate_features_target; the result is
# indexed below with the keys 'features' and 'target'.
tfidf_train_imbalanced = separate_features_target(
    tfidf_train_imbalanced_complete
)
tfidf_test_imbalanced = separate_features_target(
    tfidf_test_imbalanced_complete
)

### Logistic Regression

In [25]:
# Train logistic regression on the imbalanced TF-IDF training split.
# NOTE(review): the recorded output for this cell reports that the optimizer
# hit its iteration limit ("TOTAL NO. of ITERATIONS REACHED LIMIT") and
# suggests raising max_iter or scaling the data, so this fit may not have
# converged. Whether ml.train_logistic_regression exposes max_iter is not
# visible from here -- TODO confirm and raise it if possible.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `lr_tfidf_imbalanced` only ever refers to the model
# and the unpacking line can be re-run on its own.
lr_tfidf_imbalanced_fit = ml.train_logistic_regression(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_tfidf_imbalanced = lr_tfidf_imbalanced_fit['model']
lr_tfidf_imbalanced_scaler = lr_tfidf_imbalanced_fit['scaler']

Train accuracy: 0.9933774834437086


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [26]:
# Train a decision tree on the imbalanced TF-IDF training split; the returned
# object is used directly as the model when results are computed.
dt_tfidf_imbalanced = ml.train_decision_tree(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9662116502230031


### Random Forest

In [27]:
# Train a random forest on the imbalanced TF-IDF training split; the returned
# object is used directly as the model when results are computed.
rf_tfidf_imbalanced = ml.train_random_forest(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9507365860251386


### Gradient Boosting Tree

In [28]:
# Train gradient boosting on the imbalanced TF-IDF training split; the
# returned object is used directly as the model when results are computed.
gb_tfidf_imbalanced = ml.train_gradient_boost(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9870252736856332


### Naive Bayes

In [29]:
# Train Naive Bayes on the imbalanced TF-IDF training split.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `nb_tfidf_imbalanced` only ever refers to the model
# and the unpacking line can be re-run on its own.
nb_tfidf_imbalanced_fit = ml.train_naive_bayes(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)
nb_tfidf_imbalanced = nb_tfidf_imbalanced_fit['model']
nb_tfidf_imbalanced_scaler = nb_tfidf_imbalanced_fit['scaler']

Train accuracy: 0.9789836464387079


## Train Word2Vec

In [30]:
# Read the imbalanced Word2Vec train/test CSVs (index 1 of each file list);
# the first CSV column is used as the DataFrame index.
word2vec_train_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][1]),
    index_col=0,
)
word2vec_test_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][1]),
    index_col=0,
)

In [31]:
# Split each complete table with separate_features_target; the result is
# indexed below with the keys 'features' and 'target'.
word2vec_train_imbalanced = separate_features_target(
    word2vec_train_imbalanced_complete
)
word2vec_test_imbalanced = separate_features_target(
    word2vec_test_imbalanced_complete
)

### Logistic Regression

In [32]:
# Train logistic regression on the imbalanced Word2Vec training split.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `lr_word2vec_imbalanced` only ever refers to the
# model and the unpacking line can be re-run on its own.
lr_word2vec_imbalanced_fit = ml.train_logistic_regression(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_word2vec_imbalanced = lr_word2vec_imbalanced_fit['model']
lr_word2vec_imbalanced_scaler = lr_word2vec_imbalanced_fit['scaler']

Train accuracy: 0.9920935261521827


### Decision Tree

In [33]:
# Train a decision tree on the imbalanced Word2Vec training split; the
# returned object is used directly as the model when results are computed.
dt_word2vec_imbalanced = ml.train_decision_tree(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9846600892012434


### Random Forest

In [34]:
# Train a random forest on the imbalanced Word2Vec training split; the
# returned object is used directly as the model when results are computed.
rf_word2vec_imbalanced = ml.train_random_forest(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9856061629949994


### Gradient Boosting Tree

In [35]:
# Train gradient boosting on the imbalanced Word2Vec training split; the
# returned object is used directly as the model when results are computed.
gb_word2vec_imbalanced = ml.train_gradient_boost(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9956750912285444


### Naive Bayes

In [36]:
# Train Naive Bayes on the imbalanced Word2Vec training split.
# remove_negatives=True is passed only for the Word2Vec fits (the TF-IDF
# Naive Bayes fits omit it) -- presumably because embedding values can be
# negative; TODO confirm against ml.train_naive_bayes.
# NOTE(review): in the recorded test results this model has precision, recall
# and F1 of 0.0 with a false negative rate of 1.0, so it appears to predict
# no positives at all on the test set; treat it as a degenerate classifier.
# The helper's result is indexed with 'model' and 'scaler'; keep that result
# under its own name so `nb_word2vec_imbalanced` only ever refers to the
# model and the unpacking line can be re-run on its own.
nb_word2vec_imbalanced_fit = ml.train_naive_bayes(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)
nb_word2vec_imbalanced = nb_word2vec_imbalanced_fit['model']
nb_word2vec_imbalanced_scaler = nb_word2vec_imbalanced_fit['scaler']

Train accuracy: 0.9079605352074605


## Results

### TF-IDF

In [37]:
# Imbalanced TF-IDF models and their display names, in matching order.
models = [
    lr_tfidf_imbalanced,
    dt_tfidf_imbalanced,
    rf_tfidf_imbalanced,
    gb_tfidf_imbalanced,
    nb_tfidf_imbalanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [38]:
# Score every imbalanced TF-IDF model on the imbalanced TF-IDF test split.
# The two scalers are positional: logistic-regression scaler first, then the
# Naive Bayes scaler.
results_tfidf_imbalanced = ml.multi_model_results(
    models,
    names,
    tfidf_test_imbalanced['features'],
    tfidf_test_imbalanced['target'],
    lr_tfidf_imbalanced_scaler,
    nb_tfidf_imbalanced_scaler,
)

In [39]:
# Show the imbalanced TF-IDF test metrics (one row per model).
results_tfidf_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.985405,0.91411,0.919753,0.916923,0.008294,0.080247,0.995144
Decision Tree,0.965405,0.874046,0.70679,0.78157,0.009775,0.29321,0.949155
Random Forest,0.950541,0.993007,0.438272,0.608137,0.000296,0.561728,0.980307
Gradient Boosting Tree,0.980811,0.946996,0.82716,0.883031,0.004443,0.17284,0.989347
Naive Bayes,0.979189,0.959108,0.796296,0.870152,0.003258,0.203704,0.994218


### Word2Vec

In [40]:
# Imbalanced Word2Vec models and their display names, in matching order.
models = [
    lr_word2vec_imbalanced,
    dt_word2vec_imbalanced,
    rf_word2vec_imbalanced,
    gb_word2vec_imbalanced,
    nb_word2vec_imbalanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [41]:
# Score every imbalanced Word2Vec model on the imbalanced Word2Vec test split.
# The two scalers are positional: logistic-regression scaler first, then the
# Naive Bayes scaler.
# NOTE(review): the recorded output of this cell contains a `_warn_prf`
# warning, and the Naive Bayes row of the resulting table has precision and
# recall of 0.0 -- presumably the warning comes from that model predicting no
# positive samples; verify.
results_word2vec_imbalanced = ml.multi_model_results(
    models,
    names,
    word2vec_test_imbalanced['features'],
    word2vec_test_imbalanced['target'],
    lr_word2vec_imbalanced_scaler,
    nb_word2vec_imbalanced_scaler,
)

  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# Show the imbalanced Word2Vec test metrics (one row per model).
results_word2vec_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.99027,0.961538,0.925926,0.943396,0.003555,0.074074,0.997795
Decision Tree,0.978919,0.89172,0.864198,0.877743,0.010071,0.135802,0.974495
Random Forest,0.984324,0.96831,0.848765,0.904605,0.002666,0.151235,0.994936
Gradient Boosting Tree,0.989459,0.952381,0.925926,0.938967,0.004443,0.074074,0.998024
Naive Bayes,0.912432,0.0,0.0,0.0,0.0,1.0,0.989869


# Specific emails

It is possible to see the predictions of each algorithm with Word2Vec features (since those performed the best) for selected emails (some of which were also seen previously).

In [43]:
# Re-select the balanced Word2Vec models (and display names, in matching
# order) for the per-email comparison; `models` was last bound to the
# imbalanced Word2Vec models.
models = [
    lr_word2vec_balanced,
    dt_word2vec_balanced,
    rf_word2vec_balanced,
    gb_word2vec_balanced,
    nb_word2vec_balanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [45]:
# Per-model predictions for three selected emails, looked up in the complete
# balanced Word2Vec test table. The two scalers are positional:
# logistic-regression scaler first, then the Naive Bayes scaler.
ml.results_by_id(
    models,
    names,
    word2vec_test_balanced_complete,
    [5, 17, 1379],
    lr_word2vec_balanced_scaler,
    nb_word2vec_balanced_scaler,
)

Unnamed: 0,Email ID,True Class,Logistic Regression,Decision Tree,Random Forest,Gradient Boosting Tree,Naive Bayes
0,17,True,True,True,True,True,True
1,1379,True,True,False,False,False,False
2,5,False,False,False,False,False,False


We see that the email with `id: 17` was correctly identified as phishing by all algorithms and, similarly, the email with `id: 5` was unanimously and correctly identified as legitimate.<br>

On the other hand, the email with `id: 1379` was misclassified by all of the algorithms except Logistic Regression.
```
Hello,
 I hope you are safe from the Covid 19.
 We are currently back to work and our company hope to place our urgent orders as previously discussed before the lockdown.
Kindly find below our attached order via Wetransfer and confirm availability of all products.


 https://wetransfer.com/downloads



 Kindly send in your best quote and shortest delivery time.
 -


 Greetings!
Maria Pietrygas (Import Manager)
Nautril Holdings
Athens Gr
Tel.08872917845
```
Indeed, while it is a phishing email, it appears to be legitimate (despite the fact that this company does not even exist). Apart from a sense of urgency, none of the other usual phishing markers are present.

# Conclusions

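- As expected, the algorithms performed better on the balanced dataset in terms of recall and F1 score; the higher raw accuracy on the imbalanced dataset is inflated by the majority (legitimate) class.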
- The best performing algorithms were Gradient Boosting and Logistic Regression. Gradient Boosting was a bit more consistent and achieved the best results on the balanced dataset, but Logistic Regression outperformed it on the imbalanced dataset.
- Naive Bayes (despite achieving the best results in balanced TF-IDF) is not very well suited to such classification problems, and is especially bad with imbalanced datasets.
- Word2Vec features outperformed TF-IDF for every algorithm except Naive Bayes, on both the balanced and imbalanced datasets.