In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import os
import re

from matplotlib_venn import venn2
from sklearn import svm
from sklearn.model_selection import GridSearchCV
plt.rcParams['figure.constrained_layout.use'] = True
np.random.seed(seed=0)

from feature_extraction import featureExtractor, encode_text, clean_text
from naive_bayes import naiveBayes
from decision_tree import decisionTree
from ada_boost import adaBoost

## Declare locations of training and test data

In [2]:
# Index of the feature set under evaluation; it is substituted into the
# names of the encoded training file and of the encoding dictionary.
feature_set = 2

# Folder that the test set is read from.
test_folder = "../test"

# Encoded training data and the encoding dictionary for this feature set.
train_file = f"../data/enron_train_encode_f{feature_set}.csv"
train_encode_dict_file = f"../data/enron_train_encode_dict_f{feature_set}.json"

### Build test set from folder

In [3]:
# read the encoding dict
with open(train_encode_dict_file, 'r') as f:
    train_encode_dict = json.load(f)

# invert encoding dictionary for visualization
inv_encode_dict = {val: key for key, val in train_encode_dict.items()}

# get all numbers from the folder
# fullmatch is used so that names such as 'email3.txt.bak' are not picked up
dir_list = os.listdir(test_folder)
pattern = re.compile(r'email(\d+)\.txt')

email_nums = sorted(
    int(match.group(1))
    for match in map(pattern.fullmatch, dir_list)
    if match
)

# read and encode all mails
# Rows are filled by position in the sorted list of email numbers rather
# than by the number itself, so the numbering may start at any value and
# may contain gaps without overflowing X_test or leaving all-zero rows.
X_test = np.zeros((len(email_nums), len(train_encode_dict)), dtype=int)

for row, ni in enumerate(email_nums):
    with open(os.path.join(test_folder, f'email{ni}.txt')) as f:
        msg = f.read()
    X_test[row] = encode_text(text=msg, enc_dict=train_encode_dict)

# read labels
# the file 'labels.csv'
# has all the labels in order for each email
y_test = np.loadtxt(os.path.join(test_folder, 'labels.csv'), delimiter=',')

# every email needs exactly one label, otherwise the rows of X_test and the
# entries of y_test cannot be paired up
if np.size(y_test) != len(email_nums):
    raise ValueError(
        f"labels.csv holds {np.size(y_test)} labels "
        f"but {len(email_nums)} emails were found in {test_folder}"
    )

In [4]:
# Table collecting the given labels; each classifier below adds its
# predictions as a further column.
y_pred_df = pd.DataFrame({"Given Label": y_test})

## Naive Bayes

In [5]:
# Naive Bayes on the encoded training file, with Laplace smoothing enabled.
nb = naiveBayes(laplace_smoothing=True, data_file=train_file)
nb.run()

# Predict on the test set and report the accuracy returned by the model.
y_pred, accuracy = nb.predict(X_test=X_test, y_test=y_test)
print(accuracy)

# Store predictions as 0/1: every prediction that is not 1 becomes 0.
y_pred[y_pred != 1] = 0
y_pred_df["Naive Bayes"] = y_pred

0.9281183932346723


## Decision Trees

In [6]:
# Maximum depth passed to the decision tree.
h = 6

dt = decisionTree(max_depth=h, data_file=train_file)
dt.run()

# Export the tree to a .dot file, passing the inverted encoding dictionary
# as labels.
dt.to_dot(
    labels=inv_encode_dict,
    save_file=f"../outputs/decision_trees/dtree_depth{h}_f{feature_set}.dot",
)

# Predict on the test set and report the accuracy returned by the model.
y_pred, accuracy = dt.predict(X_test, y_test)
print(accuracy)

# Store predictions as 0/1: every prediction that is not 1 becomes 0.
y_pred[y_pred != 1] = 0
y_pred_df[f"Decision Tree (height: {h})"] = y_pred

0.8740561763817578


## ADA Boost

In [7]:
# Settings passed to adaBoost: max_itr and max_tree_depth.
itr = 30
h = 5

at = adaBoost(max_tree_depth=h, max_itr=itr, data_file=train_file)
at.run()

# Predict on the test set and report the accuracy returned by the model.
y_pred, accuracy = at.predict(X_test, y_test)
print(accuracy)

# Store predictions as 0/1: every prediction that is not 1 becomes 0.
y_pred[y_pred != 1] = 0
y_pred_df[f"ADA Boost (trees: {itr}, height: {h})"] = y_pred

[0.79542045 0.52427047 0.81060954 0.30427914 0.22208145 0.10798814
 0.53414173 0.1781495  0.25628897 0.71526886 0.05370458 0.20714148
 0.15145912 0.04068696 0.19574141 0.35897186 0.17013321 0.05605259
 0.15184544 0.22454109 0.11877324 0.05256866 0.25983784 0.19143051
 0.04288071 0.05537085 0.34151319 0.23735915 0.03456161 0.16710361]
0.9488070069465419


## SVM

In [8]:
# Read the encoded training set into a 2-D array.
data = np.loadtxt(train_file, delimiter=',')

num_points = data.shape[0]
dim = data.shape[1] - 1

# One data point per row: the first `dim` columns are the features and
# the last column is the label.
X_train = data[:, :dim]
y_train = data[:, -1].astype(int)

### Linear Kernel

In [9]:
# Regularisation parameter C of the SVM.
c = 1
sv = svm.SVC(kernel='linear', C=c)
sv.fit(X_train, y_train)

y_pred = sv.predict(X_test)

# Copy of the test labels in which every label that is not 1 becomes -1.
# NOTE(review): assumes the training labels are encoded as -1/1 - confirm.
yt = y_test.copy()
yt[y_test != 1] = -1

# Accuracy is the fraction of predictions equal to the labels. It is taken
# from the predictions already computed above; sv.score(X_test, yt) returns
# the same number but runs the prediction over X_test a second time.
accuracy = float(np.mean(y_pred == yt))
print(accuracy)

# Store predictions as 0/1: every prediction that is not 1 becomes 0.
y_pred[y_pred != 1] = 0
y_pred_df[f"SVM (kernel: linear, C: {c})"] = y_pred

0.9681365146481425


### Radial Basis Function Kernel

In [10]:
# Regularisation parameter C of the SVM.
c = 1
sv = svm.SVC(kernel='rbf', C=c)
sv.fit(X_train, y_train)

y_pred = sv.predict(X_test)

# Copy of the test labels in which every label that is not 1 becomes -1.
# NOTE(review): assumes the training labels are encoded as -1/1 - confirm.
yt = y_test.copy()
yt[y_test != 1] = -1

# Accuracy is the fraction of predictions equal to the labels. It is taken
# from the predictions already computed above; sv.score(X_test, yt) returns
# the same number but runs the prediction over X_test a second time.
accuracy = float(np.mean(y_pred == yt))
print(accuracy)

# Store predictions as 0/1: every prediction that is not 1 becomes 0.
y_pred[y_pred != 1] = 0
y_pred_df[f"SVM (kernel: rbf, C: {c})"] = y_pred

0.9519782543038356


## Results

In [12]:
# Write the given labels and every model's predictions to one CSV file.
# DataFrame.to_csv does not create missing directories, so the output
# folder is created first to keep a run on a fresh checkout from failing.
predictions_file = f"../outputs/predictions_f{feature_set}.csv"
os.makedirs(os.path.dirname(predictions_file), exist_ok=True)
y_pred_df.to_csv(predictions_file)