## 3.1. Multinomial Naive Bayes

## Content
- [Importing Libraries and Dataset](#Importing-Libraries-and-Dataset)
- [Functions for model presentation](#Functions-for-model-presentation)
- [Training model](#Training-model)

## Importing Libraries and Dataset

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from mlxtend.preprocessing import DenseTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import SparsePCA
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, recall_score, accuracy_score, precision_score, f1_score, roc_auc_score
import pickle

In [2]:
# Read the three CSV files used below: dataset.csv (later used to fit the
# final saved model), the training set, and the validation set.
df = pd.read_csv('../datasets/dataset.csv')
train = pd.read_csv('../datasets/train.csv')
val = pd.read_csv('../datasets/val.csv')

In [3]:
# Separate each frame into its text feature column and its label column.
X_train, y_train = train['text'], train['target_variable']
X_val, y_val = val['text'], val['target_variable']
# dataset.csv columns: used further down to fit the model that gets saved.
X, y = df['text'], df['target_variable']

## Functions for model presentation

In [4]:
# Metrics recorded for every grid-search candidate. The dictionary keys are the
# names the results are reported under and the values accepted by the `refit`
# argument of GridSearchCV (the search below refits on 'accuracy_score').
# ROC AUC is requested through scikit-learn's built-in 'roc_auc' scorer name
# instead of make_scorer(..., needs_threshold=True): that keyword is deprecated
# and no longer accepted by recent scikit-learn releases, whereas the named
# scorer scores continuous outputs (decision_function / predict_proba) on both
# old and new versions.
scorers = {'precision_score': make_scorer(precision_score),
           'recall_score': make_scorer(recall_score),
           'accuracy_score': make_scorer(accuracy_score),
           'f1_score': make_scorer(f1_score),
           'roc_auc_score': 'roc_auc'
          }

def evaluation_metrics(model):
    """Print a train / cross-validation / validation summary for a fitted search.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    The printed labels call the scores "accuracy"; that holds when the model's
    ``score`` method is accuracy-based (as with ``refit='accuracy_score'``).

    Parameters
    ----------
    model : fitted search object
        Must expose ``score``, ``best_score_``, ``predict`` and
        ``predict_proba`` (for example a refitted ``GridSearchCV``).

    Returns
    -------
    None
        All results are written to standard output.
    """
    train_score = model.score(X_train, y_train)
    # Scored once and reused below: each score() call re-predicts the whole
    # validation set, so calling it twice doubles the work for no benefit.
    val_score = model.score(X_val, y_val)

    print('Train\'s accuracy_score: {}'.format(round(train_score, 4)))
    print('Best accuracy score from training: {}'.format(round(model.best_score_, 4)))
    print('Validation\'s accuracy score : {}'.format(round(val_score, 4)))
    print('Difference in accuracy scores between train and val: {}'.format(round(model.best_score_ - val_score, 4)))

    # Column 1 of predict_proba holds the probability of the second class.
    model_proba = np.asarray(model.predict_proba(X_val))[:, 1]
    print('ROC_AUC score on Validation Set: {}'.format(round(roc_auc_score(y_val, model_proba), 4)))

    y_pred = model.predict(X_val)
    # For a binary problem ravel() yields the cells in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    f1 = (2*sensitivity*precision)/(sensitivity+precision)

    print(f'Model sensitivity is : {sensitivity}')
    print(f'Model specificity is : {specificity}')
    print(f'Model f1 score is : {f1}')
    print('\n\nClassification report :\n', classification_report(y_val, y_pred),'\n')
    # Row label previously read 'Actual Postitive' (typo in the displayed table).
    print(pd.DataFrame({'Pred Negative' : [tn,fn], 'Pred Positive' : [fp,tp]}, index = ['Actual Negative','Actual Positive']))


# Used in the final-model section.
def all_metrics(model):
    """Print confusion-matrix counts, classification metrics and ROC AUC.

    Evaluates ``model`` on the notebook-level validation data
    (``X_val`` / ``y_val``) and writes every figure to standard output.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    None
    """
    predictions = model.predict(X_val)
    tn, fp, fn, tp = confusion_matrix(y_val, predictions).ravel()

    # Raw confusion-matrix cells, one per line.
    cells = (("True Negatives", tn),
             ("False Positives", fp),
             ("False Negatives", fn),
             ("True Positives", tp))
    for label, count in cells:
        print(f"{label}: {count}")

    # Blank line, separator rule, blank line.
    print('', '--------------------------------', '', sep='\n')

    total = tp + fp + tn + fn
    print(f'Accuracy: {round(accuracy_score(y_val, predictions), 4)}')
    print(f'Misclassification rate: {round((fp + fn) / total, 4)}')
    print(f'Precision: {round(precision_score(y_val, predictions), 4)}')
    print(f'Recall: {round(recall_score(y_val, predictions), 4)}')
    print(f'Specificity: {round(tn / (tn + fp), 4)}')
    print(f'Model f1 score is : {(f1_score(y_val, predictions))}')

    # ROC AUC is computed from the second predict_proba column.
    positive_scores = [row[1] for row in model.predict_proba(X_val)]
    auc = roc_auc_score(y_val, positive_scores)
    print(f'ROC_AUC score on Validation Set: {round(auc, 4)}')

In [5]:
# Cross-validation splitter shared by every model evaluation below.
# Stratification keeps the class proportions of the target in each fold, and
# the fixed random_state makes the shuffled folds reproducible between runs.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Training model

**Count Vectorizer**

In [6]:
# Baseline pipeline: raw token counts fed straight into Multinomial Naive Bayes.
mnb_pipe_cvec = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

In [7]:
# Mean cross-validated score of the count-vectorizer baseline over the skf folds.
print(f'Accuracy : {cross_val_score(mnb_pipe_cvec, X_train, y_train, cv=skf, n_jobs=-1).mean()}')

Accuracy : 0.8259606238803092


**TF-IDF Vectorizer** (CountVectorizer followed by TfidfTransformer)

In [8]:
# TF-IDF variant: token counts are re-weighted by TfidfTransformer before
# reaching Multinomial Naive Bayes.
mnb_pipe_tvec = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('tvec', TfidfTransformer()),
    ('mnb', MultinomialNB()),
])

In [9]:
# Mean cross-validated score of the TF-IDF pipeline over the same skf folds.
print(f'Accuracy : {cross_val_score(mnb_pipe_tvec, X_train, y_train, cv=skf, n_jobs=-1).mean()}')

Accuracy : 0.7950053387739098


**Grid Search**

In [10]:
# Search space for the CountVectorizer step: 5 * 2 * 2 * 3 = 60 combinations.
params = {
    'cvec__max_features': [5000, 8000, 10000, 16000, 24000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [0.9, 0.95],
    'cvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
}

# Every metric in `scorers` is recorded; the best candidate is chosen and
# refitted according to the 'accuracy_score' entry.
mnb_gs_cvec = GridSearchCV(
    estimator=mnb_pipe_cvec,
    param_grid=params,
    scoring=scorers,
    refit='accuracy_score',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
mnb_gs_cvec.fit(X_train, y_train)
mnb_gs_cvec.best_params_

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.2min finished


{'cvec__max_df': 0.9,
 'cvec__max_features': 10000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1)}

**Model Performance**

In [11]:
# Print the train / cross-validation / validation summary for the refitted grid search.
evaluation_metrics(mnb_gs_cvec)

Train's accuracy_score: 0.8467
Best accuracy score from training: 0.8243
Validation's accuracy score : 0.8239
Difference in accuracy scores between train and val: 0.0004
ROC_AUC score on Validation Set: 0.8791
Model sensitivity is : 0.8833017077798861
Model specificity is : 0.7177966101694915
Model f1 score is : 0.8654427143853126


Classification report :
               precision    recall  f1-score   support

           0       0.77      0.72      0.75      1180
           1       0.85      0.88      0.87      2108

    accuracy                           0.82      3288
   macro avg       0.81      0.80      0.81      3288
weighted avg       0.82      0.82      0.82      3288
 

                  Pred Negative  Pred Positive
Actual Negative             847            333
Actual Postitive            246           1862


**Saving model**

In [12]:
# Final model: the count-vectorizer pipeline with the grid-search winners
# written out explicitly, fitted on X, y (from dataset.csv) instead of the
# training split.
mnbc = Pipeline(steps=[
    ('cvec', CountVectorizer(ngram_range=(1, 1),
                             min_df=2,
                             max_df=0.9,
                             max_features=10000)),
    ('mnb', MultinomialNB()),
])
mnbc.fit(X, y)

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.9, max_features=10000, min_df=2)),
                ('mnb', MultinomialNB())])

In [13]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises part-way through.
with open('../saved_models/mnbc.sav', 'wb') as model_file:
    pickle.dump(mnbc, model_file)