# Modelling

In [1]:
import time
import pandas as pd
import warnings
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore") 

In [2]:
data = pd.read_csv('../data/preprocessed_data.csv', index_col=0)

In [3]:
data.head(3)

Unnamed: 0_level_0,text
fraudulent,Unnamed: 1_level_1
0,marketing intern u ny new york we re food we v...
0,customer service cloud video production nz auc...
0,commissioning machinery assistant cma u ia wev...


In [4]:
X_train = pd.read_csv('../data/X_train.csv', index_col=0)
y_train = pd.read_csv('../data/y_train.csv')['fraudulent']
X_test = pd.read_csv('../data/X_test.csv', index_col=0)
y_test = pd.read_csv('../data/y_test.csv')['fraudulent']

## Model Selection

### Dummy Classifier

### Naive Bayes

In [5]:
start_time = time.time()
nb = MultinomialNB()

In [6]:
alpha = [i/10 for i in range(0, 11)]
fit_prior = [True, False]
param_dist = {"alpha": alpha, "fit_prior": fit_prior}

In [7]:
rand_search = GridSearchCV(estimator=nb, param_grid=param_dist, cv=5)
rand_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                   0.9, 1.0],
                         'fit_prior': [True, False]})

In [8]:
best_nb = rand_search.best_estimator_
print(best_nb)

MultinomialNB(alpha=0.1)


In [9]:
nb = best_nb
nb.fit(X_train, y_train)
y_predict_nb = nb.predict(X_test)
end_time = time.time()

In [10]:
nb_report = classification_report(y_test, y_predict_nb)
print(nb_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5985
           1       0.92      0.59      0.72       273

    accuracy                           0.98      6258
   macro avg       0.95      0.79      0.85      6258
weighted avg       0.98      0.98      0.98      6258

Execution time: 1.619954800605774 min


This is not a terrible start, but not quite where I want the scores to be.

### Logistic Regression

In [11]:
start_time = time.time()
lr = LogisticRegression()
dual = [True, False]
l1_ratio = [i/10 for i in range(1, 10)]
param_dist = {"dual": dual, "l1_ratio": l1_ratio, "n_jobs": [-1]}

In [12]:
grid_search = GridSearchCV(lr, param_grid=param_dist, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'dual': [True, False],
                         'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                      0.9],
                         'n_jobs': [-1]},
             scoring='f1')

In [13]:
best_lr = grid_search.best_estimator_

In [14]:
print(best_lr)

LogisticRegression(l1_ratio=0.1, n_jobs=-1)


In [15]:
y_predict_lr = best_lr.predict(X_test)
end_time = time.time()

In [16]:
lr_report = classification_report(y_test, y_predict_lr)
print(lr_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      5985
           1       0.96      0.38      0.54       273

    accuracy                           0.97      6258
   macro avg       0.97      0.69      0.76      6258
weighted avg       0.97      0.97      0.97      6258

Execution time: 8.905601799488068 min


I guess its safe to say that logistic regression is not the best model. The F1 score for the fraudulent jobs is way too low. The execution time is also longer than I would like (feeling a bit impatient today). On to the next.

### Passive Aggresive Classifier

In [23]:
start_time = time.time()
pac = PassiveAggressiveClassifier()
loss = ['hinge', 'squared_hinge']
shuffle = [True, False]
param_dist = {"shuffle": shuffle, "loss": loss, "n_jobs": [-1]}

In [24]:
grid_search = GridSearchCV(pac, param_grid=param_dist, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=PassiveAggressiveClassifier(),
             param_grid={'loss': ['hinge', 'squared_hinge'], 'n_jobs': [-1],
                         'shuffle': [True, False]},
             scoring='f1')

In [25]:
best_pac = grid_search.best_estimator_

In [26]:
print(best_pac)

PassiveAggressiveClassifier(n_jobs=-1, shuffle=False)


In [27]:
y_predict_pac = best_pac.predict(X_test)
end_time = time.time()

In [28]:
pac_report = classification_report(y_test, y_predict_pac)
print(sgd_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5985
           1       0.97      0.65      0.78       273

    accuracy                           0.98      6258
   macro avg       0.98      0.83      0.89      6258
weighted avg       0.98      0.98      0.98      6258

Execution time: 1.8393629988034566 min


This is the best classifier so far. An above 0.80 F1 score for the fraudelent class is pretty decent for the amount of time it took to execute. I am pretty happy with this and can conclude this is the best model for the job from the models I've tested.

## Summary