In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import os
import re

from matplotlib_venn import venn2
from sklearn import svm
from sklearn.model_selection import GridSearchCV
plt.rcParams['figure.constrained_layout.use'] = True
np.random.seed(seed=0)

from feature_extraction import featureExtractor, encode_text, clean_text
from naive_bayes import naiveBayes
from decision_tree import decisionTree
from ada_boost import adaBoost

## Declare locations of training and test data

In [2]:
# Encoded training set (CSV, loaded later with np.loadtxt) and the JSON
# encoding dictionary that is passed to encode_text() for the test emails.
train_file = "../data/enron_train_encode_f2.csv"
train_encode_dict_file = "../data/enron_train_encode_dict_f2.json"

# Folder holding the raw test emails (email<N>.txt) and their labels.csv
test_folder = "../test"

### Build test set from folder

In [3]:
# read the encoding dict
with open(train_encode_dict_file, 'r') as f:
    train_encode_dict = json.load(f)

# invert the encoding dictionary (value -> key) for visualization
inv_encode_dict = {val: key for key, val in train_encode_dict.items()}

# get all numbers from the folder
# fullmatch (not match) so that only names of the exact form `email<N>.txt`
# are accepted; match() would also accept e.g. `email3.txt.bak`
dir_list = os.listdir(test_folder)
pattern = re.compile(r'email(\d+)\.txt')

email_nums = []
for fi in dir_list:
    match = pattern.fullmatch(fi)
    if match:
        number = match.group(1)
        email_nums.append(int(number))

email_nums = sorted(email_nums)

# The email number is used directly as the row index of X_test, and the
# labels are read positionally, so the numbering must be exactly 0..N-1
# (no gaps, no duplicates). Check it instead of silently producing
# all-zero or misaligned rows.
if email_nums != list(range(len(email_nums))):
    raise ValueError(
        f"test emails in {test_folder!r} must be numbered 0..N-1 "
        f"without gaps or duplicates; found {len(email_nums)} file(s) "
        f"with numbers {email_nums[:10]}{'...' if len(email_nums) > 10 else ''}"
    )

# read and encode all mails (one row per email, one column per dictionary entry)
X_test = np.zeros((len(email_nums), len(train_encode_dict)), dtype=int)

for ni in email_nums:
    with open(os.path.join(test_folder, f'email{ni}.txt')) as f:
        msg = f.read()
    X_test[ni] = encode_text(text=msg, enc_dict=train_encode_dict)

# read labels
# the file 'labels.csv'
# has all the labels in order for each email
y_test = np.loadtxt(os.path.join(test_folder, 'labels.csv'), delimiter=',')

# every email needs exactly one label, otherwise the reported accuracies
# would be computed against misaligned targets
if np.size(y_test) != len(email_nums):
    raise ValueError(
        f"labels.csv holds {np.size(y_test)} label(s) "
        f"but {len(email_nums)} test email(s) were found"
    )

## Naive Bayes

In [4]:
# Build the naive Bayes classifier from the training file with Laplace
# smoothing switched on.
# NOTE(review): run() presumably performs the fit — confirm in naive_bayes.py.
nb = naiveBayes(data_file=train_file, laplace_smoothing=True)
nb.run()

# predict() returns the predictions together with an accuracy value computed
# against y_test; the bare `accuracy` expression displays it as the cell output.
y_pred, accuracy = nb.predict(X_test=X_test, y_test=y_test)
accuracy

0.9281183932346723

## Decision Trees

In [5]:
# Sweep max_depth over 3, 4, 5, 6: build a tree for each depth, export it to
# a Graphviz .dot file (node labels taken from the inverted encoding dict),
# and print the accuracy returned by predict() on the test set.
# NOTE: `depth` stays defined (== 6) after the loop and is reused as a list
# name by the AdaBoost cell further down.
for depth in range(3,7):
    dt = decisionTree(data_file=train_file,
                      max_depth=depth)
    dt.run()
    # NOTE(review): assumes ../outputs/decision_trees/ already exists — confirm
    # whether to_dot() creates missing directories.
    dt.to_dot(save_file=f"../outputs/decision_trees/dtree_depth{depth}_f2.dot", labels=inv_encode_dict)
    
    y_pred, accuracy = dt.predict(X_test, y_test)
    print(accuracy)

0.7346723044397463
0.7585321655089097
0.792358803986711
0.7967381455753548


## AdaBoost

In [4]:
# Configurations to evaluate, paired by position:
# itr[k] is passed as max_itr and depth[k] as max_tree_depth.
itr = [10, 30, 30, 10, 30]
depth = [3, 3, 4, 5, 5]

# Fit one boosted ensemble per configuration and print the accuracy that
# predict() reports on the test set. After the loop, `at` still refers to
# the model built from the last configuration.
for n_iterations, tree_depth in zip(itr, depth):
    at = adaBoost(
        data_file=train_file,
        max_itr=n_iterations,
        max_tree_depth=tree_depth,
    )
    at.run()

    y_pred, accuracy = at.predict(X_test, y_test)
    print(accuracy)

[0.6060283  0.43326019 0.64297925 0.13189029 0.01574606 0.02518939
 0.0160482  0.01544393 0.01906971 0.01544393]
0.826336454243431
[ 6.06028302e-01  4.33260185e-01  6.42979248e-01  1.31890291e-01
  1.57460624e-02  2.51893927e-02  1.60481965e-02  1.54439312e-02
  1.90697079e-02  1.54439312e-02  1.61237305e-02  1.59726627e-02
  1.55949964e-02  1.57460624e-02  1.61237305e-02  1.54439312e-02
  1.57460624e-02  1.70301529e-02  4.47384908e-01  1.34119180e-01
  4.56865554e-03  5.66358377e-04  3.81349781e-03 -3.77572211e-05
  8.25354028e-02  3.39815003e-04  7.92901809e-04 -3.77572211e-05
 -1.13271664e-04  4.56865554e-03]
0.8897614013893084
[ 0.70109044  0.43135908  0.77606731  0.26671187  0.0948654   0.09090466
  0.09014332  0.57019073  0.23241415  0.22279527 -0.00147253 -0.00404004
  0.15068661  0.62458711  0.25661133  0.08101499  0.03894738  0.00630554
  0.00328489  0.00638106  0.00094393  0.00177459  0.00502175  0.00275628
  0.00245422  0.00177459  0.00509727  0.00411556  0.0052483   0.01219

## SVM

In [5]:
# load data
data = np.loadtxt(train_file, delimiter=',')

num_points = data.shape[0]
dim = data.shape[1]-1

# data points are in rows
# features along columns
X_train = data[:, 0:dim]
y_train = data[:, dim].astype(int)

### Linear kernel

In [None]:
# Fit a linear-kernel SVM with C=100 on the training data.
sv = svm.SVC(kernel='linear', C=100)
sv.fit(X_train, y_train)

# Predict once and reuse the result for the accuracy below. The previous
# sv.score(X_test, yt) call discarded y_pred and ran a second full
# prediction pass over X_test, which is expensive for an SVM.
y_pred = sv.predict(X_test)

# Re-code the test labels: every label that is not 1 becomes -1.
# NOTE(review): presumably the training labels are in {-1, +1} while
# labels.csv uses a different negative-class value — confirm.
yt = y_test.copy()
yt[y_test != 1] = -1

# Fraction of correctly classified test emails (same value SVC.score returns).
accuracy = float(np.mean(y_pred == yt))
print(accuracy)

### RBF kernel

In [None]:
# Fit an RBF-kernel SVM with C=100 on the training data.
sv = svm.SVC(kernel='rbf', C=100)
sv.fit(X_train, y_train)

# Predict once and reuse the result for the accuracy below. The previous
# sv.score(X_test, yt) call discarded y_pred and ran a second full
# prediction pass over X_test, which is expensive for an SVM.
y_pred = sv.predict(X_test)

# Re-code the test labels: every label that is not 1 becomes -1.
# NOTE(review): presumably the training labels are in {-1, +1} while
# labels.csv uses a different negative-class value — confirm.
yt = y_test.copy()
yt[y_test != 1] = -1

# Fraction of correctly classified test emails (same value SVC.score returns).
accuracy = float(np.mean(y_pred == yt))
print(accuracy)

In [None]:
# Mean accuracy of the most recently fitted `sv` on its own training data
# (compare with the test accuracy to gauge over-fitting).
sv.score(X_train, y_train)

In [None]:
# Export the first three trees of the current `at` ensemble to Graphviz
# .dot files, labelling nodes via the inverted encoding dictionary.
for tree_idx in range(3):
    at.trees[tree_idx].to_dot(
        save_file=f"../outputs/dtree{tree_idx}.dot",
        labels=inv_encode_dict,
    )

In [None]:
# Grid search over kernel and C with 2-fold cross-validation, kept for
# provenance but disabled: the saved results loaded further down report
# mean fit times of roughly 50-215 s per configuration.
# NOTE(review): the standalone sv.fit(...) below is redundant if this is
# re-enabled — GridSearchCV fits its own clones of the estimator.
# kernels = list(['linear', 'rbf', 'poly'])
# c = list([1e-2, 0.1, 1, 10, 1e2])
# sv = svm.SVC()
# sv.fit(X_train, y_train)

# param_grid = dict(kernel=kernels, C=c)
# grid = GridSearchCV(sv, param_grid, cv=2, n_jobs=-1)
# grid.fit(X_train, y_train)
# grid.best_params_

In [None]:
# Dump of the grid-search results to CSV (disabled together with the search).
# NOTE(review): this writes to ../outputs/cross_val_df.csv, whereas the
# results are read back from ../outputs/svm/cross_val_df.csv — reconcile
# the two paths before re-enabling.
# cv_df = pd.DataFrame.from_dict(grid.cv_results_)
# cv_df.to_csv("../outputs/cross_val_df.csv")

In [None]:
# NOTE(review): this cell repeats the RBF fit (kernel='rbf', C=100) from the
# "Rbf" section above verbatim — consider dropping one of the two.
sv = svm.SVC(kernel='rbf', C=100)
sv.fit(X_train, y_train)

# Predict once and reuse the result for the accuracy below. The previous
# sv.score(X_test, yt) call discarded y_pred and ran a second full
# prediction pass over X_test, which is expensive for an SVM.
y_pred = sv.predict(X_test)

# Re-code the test labels: every label that is not 1 becomes -1.
# NOTE(review): presumably the training labels are in {-1, +1} while
# labels.csv uses a different negative-class value — confirm.
yt = y_test.copy()
yt[y_test != 1] = -1

# Fraction of correctly classified test emails (same value SVC.score returns).
accuracy = float(np.mean(y_pred == yt))
print(accuracy)

In [6]:
# Load the previously saved SVM cross-validation results and display them.
# NOTE(review): the leading "Unnamed: 0.1" / "Unnamed: 0" columns in the
# output appear to be index columns written into the CSV by earlier saves.
cv_df = pd.read_csv("../outputs/svm/cross_val_df.csv")
cv_df

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0,65.080502,0.262818,59.971067,2.369089,0.01,linear,"{'C': 0.01, 'kernel': 'linear'}",0.919656,0.895862,0.907759,0.011897,5
1,1,214.844323,3.100899,279.563615,0.457798,0.01,rbf,"{'C': 0.01, 'kernel': 'rbf'}",0.813184,0.689548,0.751366,0.061818,10
2,2,205.306051,1.263803,169.200283,1.648568,0.01,poly,"{'C': 0.01, 'kernel': 'poly'}",0.517858,0.531113,0.524486,0.006627,15
3,3,48.104209,1.657325,39.182311,1.95497,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.919127,0.907718,0.913422,0.005705,4
4,4,142.92866,7.999891,182.067115,12.815305,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.853659,0.806751,0.830205,0.023454,9
5,5,189.115461,3.930194,164.336289,5.888312,0.1,poly,"{'C': 0.1, 'kernel': 'poly'}",0.541343,0.568796,0.555069,0.013727,14
6,6,52.631674,1.629166,28.684524,0.611387,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.915125,0.889443,0.902284,0.012841,7
7,7,76.731907,2.44433,95.928374,6.917341,1.0,rbf,"{'C': 1, 'kernel': 'rbf'}",0.91656,0.897296,0.906928,0.009632,6
8,8,155.844709,7.812463,144.351427,5.270319,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.568753,0.83069,0.699722,0.130968,12
9,9,123.449816,5.659362,21.880418,1.797291,10.0,linear,"{'C': 10, 'kernel': 'linear'}",0.911878,0.891482,0.90168,0.010198,8


In [18]:
# Keep only the columns needed for the LaTeX table and give them
# presentation names (source column -> table heading, in output order).
report_columns = {
    "param_kernel": "Kernel",
    "param_C": "C",
    "mean_test_score": "Mean validation Score",
    "rank_test_score": "Rank",
}
cv_df2 = cv_df[list(report_columns)].rename(columns=report_columns)

# Write the table as LaTeX, without the index and with floats at 3 decimals.
with open("../outputs/svm/svm_cross_val.tex", 'w') as f:
    latex_table = cv_df2.to_latex(index=False, float_format="{:.3f}".format)
    f.write(latex_table)