# Modelling

In [1]:
import time
import warnings
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../data/preprocessed_data.csv', index_col=0)

In [3]:
data.head(3)

Unnamed: 0_level_0,text
fraudulent,Unnamed: 1_level_1
0,marketing intern u ny new york we re food52 we...
0,customer service cloud video production nz auc...
0,commissioning machinery assistant cma u ia wev...


In [4]:
X_train = pd.read_csv('../data/X_train.csv', index_col=0)
y_train = pd.read_csv('../data/y_train.csv')['fraudulent']
X_test = pd.read_csv('../data/X_test.csv', index_col=0)
y_test = pd.read_csv('../data/y_test.csv')['fraudulent']

## Model Selection

### Dummy Classifier

### Naive Bayes

In [5]:
start_time = time.time()
nb = MultinomialNB()

In [6]:
alpha = [i/10 for i in range(0, 11)]
fit_prior = [True, False]
param_dist = {"alpha": alpha, "fit_prior": fit_prior}

In [7]:
rand_search = GridSearchCV(estimator=nb, param_grid=param_dist, cv=5)
rand_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                   0.9, 1.0],
                         'fit_prior': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [8]:
best_nb = rand_search.best_estimator_
print(best_nb)

MultinomialNB(alpha=0.0, class_prior=None, fit_prior=True)


In [9]:
nb = best_nb
nb.fit(X_train, y_train)
y_predict_nb = nb.predict(X_test)
end_time = time.time()

In [10]:
nb_report = classification_report(y_test, y_predict_nb)
print(nb_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5985
           1       0.90      0.16      0.28       273

    accuracy                           0.96      6258
   macro avg       0.93      0.58      0.63      6258
weighted avg       0.96      0.96      0.95      6258

Execution time: 0.13900959491729736 min


### Logistic Regression

In [11]:
start_time = time.time()
lr = LogisticRegression()
dual = [True, False]
l1_ratio = [i/10 for i in range(1, 10)]
param_dist = {"dual": dual, "l1_ratio": l1_ratio}

In [12]:
grid_search = GridSearchCV(lr, param_grid=param_dist, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'dual': [True, False],
                         'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                      0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [13]:
best_lr = grid_search.best_estimator_

In [14]:
print(best_lr)

LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.1, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [15]:
y_predict_lr = best_lr.predict(X_test)
end_time = time.time()

In [16]:
lr_report = classification_report(y_test, y_predict_lr)
print(lr_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5985
           1       0.94      0.32      0.48       273

    accuracy                           0.97      6258
   macro avg       0.95      0.66      0.73      6258
weighted avg       0.97      0.97      0.96      6258

Execution time: 0.36897594928741456 min


### SGD Classifier

In [17]:
start_time = time.time()
sgd = SGDClassifier()
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalty = ['l2', 'l1', 'elasticnet']
param_dist = {"loss": loss, "penalty": penalty}

In [18]:
grid_search = GridSearchCV(sgd, param_grid=param_dist, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron'],
                         'penalty': ['l2', 'l1', 'elasticnet']},


In [19]:
best_sgd = grid_search.best_estimator_

In [20]:
print(best_sgd)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [21]:
y_predict_sgd = best_sgd.predict(X_test)
end_time = time.time()

In [22]:
sgd_report = classification_report(y_test, y_predict_sgd)
print(sgd_report)
print("Execution time: %s min" % ((end_time - start_time)/60))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5985
           1       0.86      0.51      0.64       273

    accuracy                           0.98      6258
   macro avg       0.92      0.75      0.81      6258
weighted avg       0.97      0.98      0.97      6258

Execution time: 8.672793873151143 min


### Passive Aggresive Classifier