## 2. Preparing data for modelling and benchmark model

## Content
- [Importing Libraries and Dataset](#Importing-Libraries-and-Dataset)
- [Preparing dataset](#Preparing-dataset)
- [Benchmark model](#Bench-mark-model)
- [Baseline Score](#Baseline-Score)

## Importing Libraries and Dataset

In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from matplotlib.pyplot import get_cmap
%matplotlib inline

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import regex as re
import requests
from lxml import html
import getpass
from googletrans import Translator

from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import urllib
import unicodedata

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jiana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jiana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from imblearn.over_sampling import SMOTE
from mlxtend.preprocessing import DenseTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             f1_score, get_scorer, make_scorer, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample
from xgboost import XGBClassifier

In [3]:
# Load the prepared dataset (relative path).
df = pd.read_csv(filepath_or_buffer='../datasets/dataset.csv')

In [4]:
# Preview the loaded frame; pandas abbreviates the display of a frame this long.
df

Unnamed: 0,text,target_variable
0,one phone hour clear invited onsite five onsit...,1
1,phone done site four leader site take situatio...,1
2,best insanely fast easy maybe pandemic crisis ...,1
3,sent personality simulator invited chime event...,1
4,pas virtual assessment scheduled two virtual d...,1
...,...,...
32874,new office supposed operate dubai region posit...,0
32875,position soon called clearly junior one maybe ...,0
32876,first notified telephone waited long time fina...,0
32877,within day sent mail set video call one picked...,0


## Preparing dataset

In [5]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32879 entries, 0 to 32878
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             32879 non-null  object
 1   target_variable  32879 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 513.9+ KB


In [6]:
# Class balance of the target column.
df.target_variable.value_counts()

1    21079
0    11800
Name: target_variable, dtype: int64

In [7]:
# Features are the text column; the label is the binary target column.
X, y = df['text'], df['target_variable']

In [8]:
# Hold out 10% for validation; stratify on y so both splits keep the class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [9]:
# Class counts in the validation split.
y_val.value_counts()

1    2108
0    1180
Name: target_variable, dtype: int64

In [10]:
# Class counts in the training split.
y_train.value_counts()

1    18971
0    10620
Name: target_variable, dtype: int64

**Saving train and validation dataframes**

In [11]:
# Re-attach each target to its text column so every split is a single frame.
train, val = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val))
)

In [12]:
# Write both splits to disk without the index column.
train.to_csv(path_or_buf='../datasets/train.csv', index=False)
val.to_csv(path_or_buf='../datasets/val.csv', index=False)

<a id="Bench-mark-model"></a>
## Benchmark model

In [13]:
# Scorers for model evaluation: one entry per metric we track
# (precision, recall, accuracy, F1 and ROC-AUC), not accuracy alone.
scorers = {'precision_score': make_scorer(precision_score),
           'recall_score': make_scorer(recall_score),
           'accuracy_score': make_scorer(accuracy_score),
           'f1_score': make_scorer(f1_score),
           # Built-in ROC-AUC scorer (scores on decision_function / predict_proba output).
           # It replaces make_scorer(roc_auc_score, needs_threshold=True): the
           # `needs_threshold` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
           'roc_auc_score': get_scorer('roc_auc')
          }

In [14]:
# Scorers for model evaluation: one entry per metric we track
# (precision, recall, accuracy, F1 and ROC-AUC), not accuracy alone.
# NOTE(review): the preceding cell defines the same `scorers` dict; one definition is enough.
scorers = {'precision_score': make_scorer(precision_score),
           'recall_score': make_scorer(recall_score),
           'accuracy_score': make_scorer(accuracy_score),
           'f1_score': make_scorer(f1_score),
           # Built-in ROC-AUC scorer (scores on decision_function / predict_proba output).
           # It replaces make_scorer(roc_auc_score, needs_threshold=True): the
           # `needs_threshold` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
           'roc_auc_score': get_scorer('roc_auc')
          }

#make a function that prints evaluation metrics score
def evaluation_metrics(model, X_train=None, y_train=None, X_val=None, y_val=None):
    """Print train/validation accuracy and validation-set classification metrics.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict`` and ``predict_proba``.
    X_train, y_train, X_val, y_val : optional
        Data to evaluate on. Any argument left as ``None`` falls back to the
        notebook-level variable of the same name, so the original call
        ``evaluation_metrics(model)`` keeps working unchanged. The target is
        expected to be binary with labels 0 and 1.

    Returns
    -------
    None
        All results are printed.
    """
    # Fall back to the notebook globals for any data not passed explicitly.
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars['X_train']
    if y_train is None:
        y_train = notebook_vars['y_train']
    if X_val is None:
        X_val = notebook_vars['X_val']
    if y_val is None:
        y_val = notebook_vars['y_val']

    def _ratio(numerator, denominator):
        # Report 0.0 instead of dividing by an empty denominator.
        return numerator / denominator if denominator else 0.0

    y_pred = model.predict(X_val)
    # labels=[0, 1] pins the matrix to 2x2, so the unpacking below cannot fail
    # when one class is absent from both y_val and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()
    sensitivity = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * sensitivity * precision, sensitivity + precision)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    val_acc = accuracy_score(y_val, y_pred)  # (y_true, y_pred) order, as everywhere else

    print('Train\'s accuracy score : {}'.format(round(train_acc,5)))
    print('Validation\'s accuracy score : {}'.format(round(val_acc,5)))
    print(f'Difference in accuracy scores between train and val: {round(train_acc-val_acc,5)}')
    print(f'Model sensitivity is : {sensitivity}')
    print(f'Model specificity is : {specificity}')
    print(f'Model f1 score is : {f1}')
    # Probability of the positive class (second column of predict_proba).
    model_proba = model.predict_proba(X_val)[:, 1]
    print('ROC_AUC score on Validation Set: {}'.format(round(roc_auc_score(y_val, model_proba), 4)))
    print('\n\nClassification report :\n', classification_report(y_val, y_pred),'\n')
    # NOTE(review): 'Postitive' is a typo in the row label; left as is so the printed output is unchanged.
    print(pd.DataFrame({'Pred Negative' : [tn,fn], 'Pred Positive' : [fp,tp]}, index = ['Actual Negative','Actual Postitive']))


#for final model section:
#make a function that prints all classification metrics, AUC-ROC + TN, FP, FN, TP
def all_metrics(model, X_val=None, y_val=None):
    """Print confusion-matrix counts and classification metrics on validation data.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict`` and ``predict_proba``.
    X_val, y_val : optional
        Validation features and labels. Any argument left as ``None`` falls
        back to the notebook-level variable of the same name, so the original
        call ``all_metrics(model)`` keeps working unchanged. The target is
        expected to be binary with labels 0 and 1.

    Returns
    -------
    None
        All results are printed.
    """
    # Fall back to the notebook globals for any data not passed explicitly.
    notebook_vars = globals()
    if X_val is None:
        X_val = notebook_vars['X_val']
    if y_val is None:
        y_val = notebook_vars['y_val']

    y_pred = model.predict(X_val)
    # labels=[0, 1] pins the matrix to 2x2, so the unpacking below cannot fail
    # when one class is absent from both y_val and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()
    print("True Negatives: " + str(tn))
    print("False Positives: " + str(fp))
    print("False Negatives: " + str(fn))
    print("True Positives: " + str(tp))
    print()
    print('--------------------------------')
    print()
    print('Accuracy: {}'.format(round(accuracy_score(y_val, y_pred), 4)))
    print('Misclassification rate: {}'.format(round((fp+fn)/(tp+fp+tn+fn),4)))
    print('Precision: {}'.format(round(precision_score(y_val, y_pred), 4)))
    print('Recall: {}'.format(round(recall_score(y_val, y_pred), 4)))
    # Report 0.0 instead of dividing by zero when there are no actual negatives.
    actual_negatives = tn + fp
    specificity = tn / actual_negatives if actual_negatives else 0.0
    print('Specificity: {}'.format(round(specificity, 4)))
    print(f'Model f1 score is : {(f1_score(y_val, y_pred))}')
    #get roc auc score from the positive-class probability (second column of predict_proba)
    model_proba = model.predict_proba(X_val)[:, 1]
    print('ROC_AUC score on Validation Set: {}'.format(round(roc_auc_score(y_val, model_proba), 4)))

## Baseline Score

In [15]:
# Baseline: bag-of-words counts fed into a k-nearest-neighbours classifier, both with default settings.
knn_pipe_cvec = Pipeline(
    steps=[
        ('cvec', CountVectorizer()),
        ('knn', KNeighborsClassifier()),
    ]
)
knn_pipe_cvec.fit(X_train, y_train)

Pipeline(steps=[('cvec', CountVectorizer()), ('knn', KNeighborsClassifier())])

In [16]:
# Validation score of the fitted baseline pipeline via its default `score` method.
knn_pipe_cvec.score(X_val, y_val)

0.676094890510949

In [17]:
# Train/validation accuracy plus validation metrics for the baseline pipeline.
evaluation_metrics(knn_pipe_cvec)

Train's accuracy score : 0.7534
Validation's accuracy score : 0.67609
Difference in accuracy scores between train and val: 0.07731
Model sensitivity is : 0.9411764705882353
Model specificity is : 0.20254237288135593
Model f1 score is : 0.7883965825551361
ROC_AUC score on Validation Set: 0.6402


Classification report :
               precision    recall  f1-score   support

           0       0.66      0.20      0.31      1180
           1       0.68      0.94      0.79      2108

    accuracy                           0.68      3288
   macro avg       0.67      0.57      0.55      3288
weighted avg       0.67      0.68      0.62      3288
 

                  Pred Negative  Pred Positive
Actual Negative             239            941
Actual Postitive            124           1984


In [18]:
# Same baseline pipeline, reported in the layout intended for the final-model section.
all_metrics(knn_pipe_cvec)

True Negatives: 239
False Positives: 941
False Negatives: 124
True Positives: 1984

--------------------------------

Accuracy: 0.6761
Misclassification rate: 0.3239
Precision: 0.6783
Recall: 0.9412
Specificity: 0.2025
Model f1 score is : 0.7883965825551361
ROC_AUC score on Validation Set: 0.6402
