# Modelling

In [1]:
import time
import pandas as pd
import warnings
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore") 

In [2]:
data = pd.read_csv('../data/preprocessed_data.csv', index_col=0)

In [3]:
data.head(3)

Unnamed: 0_level_0,text
fraudulent,Unnamed: 1_level_1
0,marketing intern were food weve created ground...
0,customer service cloud video production second...
0,commissioning machinery assistant cma valor se...


In [4]:
X_train = pd.read_csv('../data/X_train.csv', index_col=0)
y_train = pd.read_csv('../data/y_train.csv')['fraudulent']
X_test = pd.read_csv('../data/X_test.csv', index_col=0)
y_test = pd.read_csv('../data/y_test.csv')['fraudulent']

## Model Selection

### Dummy Classifier

In [5]:
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
y_predict_dummy = dummy.predict(X_test)

In [6]:
dummy_report = classification_report(y_test, y_predict_dummy)
print(dummy_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5143
           1       0.00      0.00      0.00       221

    accuracy                           0.96      5364
   macro avg       0.48      0.50      0.49      5364
weighted avg       0.92      0.96      0.94      5364



### Naive Bayes

In [7]:
start_time = time.time()
nb = MultinomialNB()

In [8]:
alpha = [i/10 for i in range(0, 5)]
fit_prior = [True, False]
param_dist = {"alpha": alpha, "fit_prior": fit_prior}

In [9]:
rand_search = GridSearchCV(estimator=nb, param_grid=param_dist, cv=5)
rand_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4],
                         'fit_prior': [True, False]})

In [10]:
best_nb = rand_search.best_estimator_
print(best_nb)

MultinomialNB(alpha=0.1)


In [11]:
nb = best_nb
nb.fit(X_train, y_train)
y_predict_nb = nb.predict(X_test)
end_time = time.time()

In [12]:
nb_report = classification_report(y_test, y_predict_nb)
print(nb_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5143
           1       0.89      0.58      0.70       221

    accuracy                           0.98      5364
   macro avg       0.94      0.79      0.85      5364
weighted avg       0.98      0.98      0.98      5364

Execution time: 0.6548625508944194 min


This is not a terrible start, but not quite where I want the scores to be.

### Passive Aggresive Classifier

In [13]:
start_time = time.time()
pac = PassiveAggressiveClassifier()
loss = ['hinge', 'squared_hinge']
shuffle = [True, False]
average = [10]
warm_start = [True]
param_dist = {"shuffle": shuffle, "loss": loss, "warm_start": warm_start, "average": average, "n_jobs": [-1]}

In [None]:
grid_search = GridSearchCV(pac, param_grid=param_dist, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

In [None]:
best_pac = grid_search.best_estimator_

In [None]:
print(best_pac)

In [None]:
y_predict_pac = best_pac.predict(X_test)
end_time = time.time()

In [None]:
pac_report = classification_report(y_test, y_predict_pac)
print(pac_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

This is the best classifier so far. An above 0.80 F1 score for the fraudelent class is pretty decent for the amount of time it took to execute. I am pretty happy with this and can conclude this is the best model for the job from the models I've tested.

### Random Forest

In [None]:
start_time = time.time()
rf = RandomForestClassifier()
criterion = ['gini', 'entropy']
min_samples_split = [i for i in range(2, 10)]
param_dist = {"criterion": criterion, 
              "min_samples_split": min_samples_split, "n_jobs": [-1]}

In [None]:
grid_search = GridSearchCV(rf, param_grid=param_dist, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

In [None]:
best_rf = grid_search.best_estimator_

In [None]:
print(best_rf)

In [None]:
y_predict_rf = best_rf.predict(X_test)
end_time = time.time()

In [None]:
rf_report = classification_report(y_test, y_predict_rf)
print(rf_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

## Summary