In [None]:
# Mount Google Drive at /content/drive; the data and model paths used below live under it.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# PseudoCode and Task List

1. Load the pickled pandas dataframe from 04 notebook and check file contents
2. Prep the dataset for analysis
>2a. Factorize Tags column to a numeric column
>2b. Split into dev, cv, and test sets
3. Run the second of 3 different models (Bernoulli Naive Bayes) with each of the 3 optimized vectorizers, and evaluate each transformation with AUC scores on the validation set
>3.1 Count Vectorization 3.2 TFIDF 3.3 Doc2Vec
4. TFIDF / Count Vectorizer give about the same AUC scores on the cv set as random forest, but all models are less overfitted and the model runs much faster.


# Task 1 - Load file and examine contents

In [None]:
'''
1a Import all modules that are needed
'''
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import re
import nltk
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import preprocess_string
from prettytable import PrettyTable

In [None]:
'''
1b. Load the cleaned questions dataframe pickled by the 04 notebook.

Expected info() summary:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24330 entries, 0 to 24352
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              24330 non-null  int64 
 1   Tag             24330 non-null  object
 2   BodyText_Clean  24330 non-null  object
dtypes: int64(1), object(2)
memory usage: 760.3+ KB
'''

# Pickle location on the mounted Google Drive (only unpickle files you trust)
questions_pickle_path = '/content/drive/My Drive/Capstone2/Data/questions_df_clean_11052020.pickle'
# Alternative location of the same file:
#questions_pickle_path = '/content/drive/MyDrive/Data Science/Laura_CP2/Copy of questions_df_clean_11052020.pickle'
questions_df_clean = pd.read_pickle(questions_pickle_path)
questions_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24330 entries, 0 to 24352
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              24330 non-null  int64 
 1   Tag             24330 non-null  object
 2   BodyText_Clean  24330 non-null  object
dtypes: int64(1), object(2)
memory usage: 760.3+ KB


In [None]:
'''
1c. Examine contents
'''
# Preview the first rows of the loaded frame (columns Id, Tag, BodyText_Clean).
questions_df_clean.head()
# Optional look-ups for rows with a missing body text or tag, left disabled:
#questions_df_clean.loc[questions_df_clean['BodyText_Clean'].isnull()]
#questions_df_clean.loc[questions_df_clean['Tag'].isnull()]

Unnamed: 0,Id,Tag,BodyText_Clean
0,5,machine-learning,always interest machine learn figure one thing...
1,7,Other,researcher instructor look opensource book sim...
2,14,data-mining,sure data science discus forum several synonym...
3,15,Other,situation would one system prefer relative adv...
4,16,machine-learning,use libsvm train data predict classification s...


# Task 2 - Prep for modelling
>2a. Factorize Tags column to numeric column
>2b. Split into dev, cv, and test sets
>2c. Verify the distribution of tags within the splits

In [None]:
'''
2a. Factorize the Tag column to numeric codes.

The Tag column (our target variable) is converted to one integer code per tag for modelling.
An earlier attempt one-hot encoded the tags instead: one column per tag name holding
1 (yes) or 0 (no), so the target became a list of multi-label columns.
That initial process did not work well. The code for it was:

tag_names = questions_df_clean['Tag'].unique().tolist()
#print(tag_names)
tag_dummy = pd.get_dummies(questions_df_clean['Tag'], prefix = 'Tag')
quest_df_dummies = pd.concat([questions_df_clean, tag_dummy], axis = 1)
quest_df_dummies.drop(columns='Tag', inplace=True)
quest_df_dummies.info()

pd.factorize is used instead, as suggested by Ajith.
'''
# factor is a (codes, uniques) pair: factor[0] holds one integer code per row,
# factor[1] holds the tag names indexed by those codes
factor = pd.factorize(questions_df_clean['Tag'])
definitions = factor[1]

# Build a new frame with the coded Tag column; questions_df_clean keeps the string tags
questions_df_factorized = questions_df_clean.assign(Tag=factor[0])

for shown in (questions_df_factorized.head(), questions_df_clean.head(), definitions, factor):
    print(shown)

   Id  Tag                                     BodyText_Clean
0   5    0  always interest machine learn figure one thing...
1   7    1  researcher instructor look opensource book sim...
2  14    2  sure data science discus forum several synonym...
3  15    1  situation would one system prefer relative adv...
4  16    0  use libsvm train data predict classification s...
   Id               Tag                                     BodyText_Clean
0   5  machine-learning  always interest machine learn figure one thing...
1   7             Other  researcher instructor look opensource book sim...
2  14       data-mining  sure data science discus forum several synonym...
3  15             Other  situation would one system prefer relative adv...
4  16  machine-learning  use libsvm train data predict classification s...
Index(['machine-learning', 'Other', 'data-mining', 'bigdata', 'r',
       'statistics', 'clustering', 'recommender-system', 'nlp',
       'feature-selection', 'neural-network', '

In [None]:
'''
2b. Split into train (70%) / test (30%), then split train again into dev / cv with the same ratio.
The test split is left untouched until the end.
The data is first separated into target (y) and predictor (X), and every split is stratified
because the tag distribution is imbalanced.
Various ways of handling stratification with multi-label classification were tried;
the standard scikit-learn code worked better. The experiments were:

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X=df[list('ABCD')]
Y=pd.DataFrame(mlb.fit_transform(df[['sex','weight']].values), columns=mlb.classes_, index=df.index)

!pip install scikit-multilearn
from skmultilearn.model_selection import iterative_stratification
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size = 0.30)

'''
# Predictor is the third column (cleaned body text); target is the second (factorized Tag)
X = questions_df_factorized.iloc[:, 2]
y = questions_df_factorized.iloc[:, 1]
print(X.shape)
print(y.shape)

# Hold out 30% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Carve a validation (cv) split out of train; the remainder is the dev split
X_dev, X_cv, y_dev, y_cv = train_test_split(
    X_train, y_train, test_size=0.30, stratify=y_train, random_state=42
)
print(X_dev.shape, y_dev.shape)
print(X_cv.shape, y_cv.shape)


(24330,)
(24330,)
(17031,) (17031,)
(7299,) (7299,)
(11921,) (11921,)
(5110,) (5110,)


In [None]:
# Sanity check on the dev target: print its type and count missing labels.
print(type(y_dev))
y_dev.isnull().sum()

<class 'pandas.core.series.Series'>


0

In [None]:
# Wrap each predictor split in a DataFrame so the text can be addressed as a named column
X_dev, X_cv, X_test, X_train = (
    pd.DataFrame(split) for split in (X_dev, X_cv, X_test, X_train)
)
X_dev.head()

Unnamed: 0,BodyText_Clean
8286,seems like thing httpswwwsciencedirectcomscien...
23415,build 2hidden layer mlp use keras use scikit l...
23360,write fast rcnn run problem back propagation g...
1698,hear multilayer perceptron approximate functio...
24085,try correlation analysis dataset data cleanse ...


In [None]:
# Replace standalone numbers (at the start, the end, or between whitespace) with one space.
# The pattern is a raw string so \d and \s are not treated as invalid string escapes,
# and it is compiled once and shared by all four splits.
# NOTE(review): the pattern is unchanged from the original, so in a run such as "1 2 3"
# the middle number survives, because its leading space was consumed by the previous match.
standalone_number = re.compile(r"^\d+\s|\s\d+\s|\s\d+$")

for split_df in (X_dev, X_cv, X_test, X_train):
    split_df['BodyText_Clean'] = split_df['BodyText_Clean'].apply(
        lambda text: standalone_number.sub(" ", text)
    )

# Task 3 - Build the second of 3 different models (Bernoulli Naive Bayes) and run it with the 3 different transformations - measure the AUC on the val set for each


In [10]:
'''
3.1a. Run the count vectorizer on the dev and cv sets with the optimal params
'''
cnt_vect = CountVectorizer(min_df=.005, max_df=0.99, ngram_range=(1,1))

# Learn the vocabulary on the dev set only
dev_counts = cnt_vect.fit_transform(X_dev.BodyText_Clean)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that do not have it yet.
cntvect_features = (
    cnt_vect.get_feature_names_out()
    if hasattr(cnt_vect, "get_feature_names_out")
    else cnt_vect.get_feature_names()
)

X_dev_cntvect_df = pd.DataFrame(dev_counts.toarray(), index=X_dev.index, columns=cntvect_features)
print(X_dev_cntvect_df.shape)

# Convert the cv set with the same fitted vocabulary (transformed once, reused for the frame)
X_cv_cntvect = cnt_vect.transform(X_cv.BodyText_Clean)
X_cv_cntvect_df = pd.DataFrame(X_cv_cntvect.toarray(), index=X_cv.index, columns=cntvect_features)
X_cv_cntvect_df.head()

(11921, 1231)


Unnamed: 0,1d,2d,2nd,3d,able,absolute,accept,access,accomplish,accord,according,account,accuracy,accurate,achieve,across,action,activation,activity,actual,actually,adam,add,addition,additional,address,adjust,advance,advantage,advice,advise,affect,age,agent,aggregate,ai,aim,al,algorithm,allow,...,visualization,visualize,want,way,web,website,week,weight,weird,welcome,well,whereas,whether,whole,whose,width,wikipedia,win,window,wish,within,without,wonder,word,word2vec,work,world,worth,would,write,wrong,x1,x2,xgboost,xi,year,yes,yet,yield,zero
20034,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22608,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12975,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
614,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,0,0,0,0,0,0,0,0,0,0,0
22615,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
'''
3.1b. Fit the Bernoulli NB model on the count-vectorized dev set
'''
# fit() returns the estimator itself, so construction and fitting can be chained
bnb_mod_cv_fit = BernoulliNB().fit(X_dev_cntvect_df, y_dev)
bnb_mod_cv_fit

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
'''
3.1c Print scores - AUC score for both dev and cv sets; cv score is about the same as random forest but with less overfitting
'''
# Macro-averaged one-vs-one AUC on the dev (training) data
y_cntvect_prob_dev_bnb = bnb_mod_cv_fit.predict_proba(X_dev_cntvect_df)
roc_auc_cntvect_dev_bnb = roc_auc_score(y_dev, y_cntvect_prob_dev_bnb, multi_class="ovo",
                                  average="macro")

# Labels say BNB: these scores come from the Bernoulli NB model, not the random forest
print("CntVect BNB Train AUC Score:", round(roc_auc_cntvect_dev_bnb,4))

# Same metric on the held-out cv split
y_cntvect_prob_val_bnb = bnb_mod_cv_fit.predict_proba(X_cv_cntvect_df)
roc_auc_cntvect_val_bnb = roc_auc_score(y_cv, y_cntvect_prob_val_bnb, multi_class="ovo",
                                  average="macro")
print("CntVect BNB Val AUC Score:  ", round(roc_auc_cntvect_val_bnb,4))

CntVect RF Train AUC Score: 0.9356
CntVect RF Val AUC Score:   0.8323


In [13]:
'''
3.2a Run tfidf with optimal params
'''
tfidf_vect = TfidfVectorizer(min_df=.001, max_df=0.999, ngram_range=(1,1))

# Learn the vocabulary and idf weights on the dev set only
dev_tfidf = tfidf_vect.fit_transform(X_dev.BodyText_Clean)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that do not have it yet.
tfidf_features = (
    tfidf_vect.get_feature_names_out()
    if hasattr(tfidf_vect, "get_feature_names_out")
    else tfidf_vect.get_feature_names()
)

X_dev_tfidf_df = pd.DataFrame(dev_tfidf.toarray(), index=X_dev.index, columns=tfidf_features)
print(X_dev_tfidf_df.shape)

# Convert the cv set with the same fitted transformation (transformed once, reused for the frame)
X_cv_tfidf = tfidf_vect.transform(X_cv.BodyText_Clean)
X_cv_tfidf_df = pd.DataFrame(X_cv_tfidf.toarray(), index=X_cv.index, columns=tfidf_features)
X_cv_tfidf_df.head()

(11921, 3144)


Unnamed: 0,100k,10fold,10k,1d,1m,1st,1x1,20k,2d,2nd,2x2,30k,3d,3rd,3x3,4th,500k,50k,5fold,5k,5th,5x5,8gb,a1,a2,ab,abbreviation,abc,ability,able,abnormal,absolute,absolutely,abstract,academic,acc,accelerate,acceleration,accelerometer,accept,...,wt,x0,x1,x2,x3,x4,xaxis,xgboost,xi,xml,xn,xor,xt,xtest,xtrain,xy,xyz,y0,y1,y2,yaxis,year,yellow,yes,yesno,yet,yi,yield,yolo,york,youtube,ypred,yt,ytest,ytrain,ytrue,zero,zip,zoom,zscore
20034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151038,0.0,0.0
614,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
'''
3.2b. Fit the Bernoulli NB model on the tf-idf dev set
'''
# fit() returns the estimator itself, so construction and fitting can be chained
bnb_mod_tfidf_fit = BernoulliNB().fit(X_dev_tfidf_df, y_dev)
bnb_mod_tfidf_fit

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [16]:
'''
3.2c. Print scores - AUC score for both dev and cv sets; cv score is about the same as random forest but with less overfitting
'''
# Macro-averaged one-vs-one AUC on the dev (training) data
y_tfidf_prob_dev_bnb = bnb_mod_tfidf_fit.predict_proba(X_dev_tfidf_df)
roc_auc_tfidf_dev_bnb = roc_auc_score(y_dev, y_tfidf_prob_dev_bnb, multi_class="ovo",
                                  average="macro")

print("TFIDF BNB Train AUC Score:", round(roc_auc_tfidf_dev_bnb,4))

# Score the cv split through the named DataFrame, matching how the model was fitted
# (and how the count-vectorizer model is scored) instead of the nameless sparse matrix.
y_tfidf_prob_val_bnb = bnb_mod_tfidf_fit.predict_proba(X_cv_tfidf_df)
roc_auc_tfidf_val_bnb = roc_auc_score(y_cv, y_tfidf_prob_val_bnb, multi_class="ovo",
                                  average="macro")
print("TFIDF BNB Val AUC Score:  ", round(roc_auc_tfidf_val_bnb,4))

TFIDF BNB Train AUC Score: 0.9056
TFIDF BNB Val AUC Score:   0.7832


In [17]:
'''
3.3a. Load the hypertuned doc2vec model from the previous notebook and re-create the tagged docs, then evaluate
'''
fnl_d2v_model = Doc2Vec.load("/content/drive/My Drive/Capstone2/Data/final_d2v_500.model")

# Map every integer code back to its tag name. The mapping is built from `definitions`
# itself, so it covers all tags instead of assuming there are exactly 34 of them.
reversefactor = dict(enumerate(definitions))
decode_tags = np.vectorize(reversefactor.get)

y_dev_rf = decode_tags(y_dev)
y_cv_rf = decode_tags(y_cv)
y_test_rf = decode_tags(y_test)

# Build separate dataframes for dev, cv, and test: tag name in y, cleaned text in X
d2v_dev_df = pd.DataFrame({'y': y_dev_rf, 'X': X_dev.BodyText_Clean})
d2v_cv_df = pd.DataFrame({'y': y_cv_rf, 'X': X_cv.BodyText_Clean})
d2v_test_df = pd.DataFrame({'y': y_test_rf, 'X': X_test.BodyText_Clean})

# Ensure the text column is string datatype in every split
for d2v_split_df in (d2v_dev_df, d2v_cv_df, d2v_test_df):
    d2v_split_df['X'] = d2v_split_df['X'].astype(str)

class TaggedDocumentIterator(object):
    """Re-iterable source of TaggedDocument objects built from two parallel sequences.

    doc_list holds the document strings; labels_list holds, at the same position,
    the label used as the single tag of that document.
    """

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        """Yield one TaggedDocument per document: whitespace-split words plus its label."""
        for position, text in enumerate(self.doc_list):
            tokens = text.split()
            label = self.labels_list[position]
            yield TaggedDocument(words=tokens, tags=[label])
 
# Parallel label / text lists for each split, wrapped as re-iterable tagged-document streams
docLabels_dev = list(d2v_dev_df['y'])
data_dev = list(d2v_dev_df['X'])
tagged_docs_dev = TaggedDocumentIterator(data_dev, docLabels_dev)

docLabels_cv = list(d2v_cv_df['y'])
data_cv = list(d2v_cv_df['X'])
tagged_docs_cv = TaggedDocumentIterator(data_cv, docLabels_cv)

docLabels_test = list(d2v_test_df['y'])
data_test = list(d2v_test_df['X'])
tagged_docs_test = TaggedDocumentIterator(data_test, docLabels_test)

d2v_dev_df.info()


def infer_targets_and_vectors(model, tagged_docs, epochs=20):
    """Infer a doc2vec vector for every tagged document.

    Returns a pair of tuples (targets, vectors): the first tag of each document and
    the vector `model` infers from its words, in document order.
    `epochs` replaces the deprecated `steps` keyword, which gensim 4 no longer accepts.
    """
    pairs = [(doc.tags[0], model.infer_vector(doc.words, epochs=epochs)) for doc in tagged_docs]
    targets, vectors = zip(*pairs)
    return targets, vectors


# NOTE(review): inferred vectors may differ slightly between runs - confirm whether the
# loaded model needs seeding if exact reproducibility matters.
fnl_dev_targets, fnl_dev_regressors = infer_targets_and_vectors(fnl_d2v_model, tagged_docs_dev)
fnl_cv_targets, fnl_cv_regressors = infer_targets_and_vectors(fnl_d2v_model, tagged_docs_cv)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11921 entries, 8286 to 3207
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   y       11921 non-null  object
 1   X       11921 non-null  object
dtypes: object(2)
memory usage: 279.4+ KB


In [18]:
'''
3.3b. Fit the Bernoulli NB model on the doc2vec dev vectors to evaluate the doc2vec transform on the cv set
'''
# fit() returns the estimator itself, so construction and fitting can be chained
bnb_mod_d2v_fit = BernoulliNB().fit(fnl_dev_regressors, fnl_dev_targets)

# Hard class predictions for the cv vectors
fnl_cv_targets_pred_d2v = bnb_mod_d2v_fit.predict(fnl_cv_regressors)

In [19]:
# Print scores - not as high a score as the RF model, but less overfitting

# Both scores use the same macro-averaged one-vs-one multiclass AUC settings
auc_options = {"multi_class": "ovo", "average": "macro"}

# Dev (training) split
fnl_dev_targets_prob_d2v_bnb = bnb_mod_d2v_fit.predict_proba(fnl_dev_regressors)
roc_auc_d2v_dev_bnb = roc_auc_score(fnl_dev_targets, fnl_dev_targets_prob_d2v_bnb, **auc_options)
print("Doc2Vec BNB Train AUC Score:", round(roc_auc_d2v_dev_bnb,4))

# Validation (cv) split
fnl_cv_targets_prob_d2v_bnb = bnb_mod_d2v_fit.predict_proba(fnl_cv_regressors)
roc_auc_d2v_val_bnb = roc_auc_score(fnl_cv_targets, fnl_cv_targets_prob_d2v_bnb, **auc_options)
print("Doc2Vec BNB Val AUC Score:  ", round(roc_auc_d2v_val_bnb,4))

Doc2Vec BNB Train AUC Score: 0.9682
Doc2Vec BNB Val AUC Score:   0.8546


In [21]:
'''
Build a readable table comparing the train / validation AUC of each transformation
'''
x = PrettyTable()
x.field_names = ["Model","Transformation",'AUC Train Score' ,'AUC Val Score']

# One entry per transformation: (label, train AUC, validation AUC)
comparison_rows = [
    ("Doc2Vec", roc_auc_d2v_dev_bnb, roc_auc_d2v_val_bnb),
    ("Count Vectorizer", roc_auc_cntvect_dev_bnb, roc_auc_cntvect_val_bnb),
    ("Tf-idf", roc_auc_tfidf_dev_bnb, roc_auc_tfidf_val_bnb),
]
for transformation, train_auc, val_auc in comparison_rows:
    x.add_row(["Bernoulli Naive Bayes", transformation, round(train_auc,4), round(val_auc,4)])

print(x)


+-----------------------+------------------+-----------------+---------------+
|         Model         |  Transformation  | AUC Train Score | AUC Val Score |
+-----------------------+------------------+-----------------+---------------+
| Bernoulli Naive Bayes |     Doc2Vec      |      0.9682     |     0.8546    |
| Bernoulli Naive Bayes | Count Vectorizer |      0.9356     |     0.8323    |
| Bernoulli Naive Bayes |      Tf-idf      |      0.9056     |     0.7832    |
+-----------------------+------------------+-----------------+---------------+


# Task 4 - Hypertune to reduce overfitting

In [22]:
'''
Overfitting and underfitting
1. When the Laplace smoothing alpha is set to 0, there is no smoothing and the model overfits (high variance).
2. When alpha is set to a very high value, the smoothing outweighs the observed counts, so even a
feature seen only a few times gets a high probability, and the model underfits (high bias).
4.1 Grid search with 5-fold CV to see if we can reduce overfitting
parameters tested will be
alpha = [1.0,2.0,3.0,4.0,5.0]
'''
# Base model whose alpha is tuned
bnb = BernoulliNB()

# Candidate smoothing strengths
# NOTE(review): the recorded run picks alpha=1.0, the lowest value in this grid -
# consider also trying values below 1.0.
param_grid = dict(alpha=[1.0, 2.0, 3.0, 4.0, 5.0])

# 5-fold grid search scored with one-vs-one multiclass AUC
grid_search = GridSearchCV(
    estimator=bnb,
    param_grid=param_grid,
    scoring='roc_auc_ovo',
    cv=5,
    verbose=2,
)

In [23]:
# 4.2 Fit the grid search on the doc2vec dev vectors, since doc2vec gave the best AUC of the 3 transformations
grid_search.fit(fnl_dev_regressors, fnl_dev_targets)
best_grid = grid_search.best_estimator_

# Report the winning parameters, the best estimator, and the raw per-fold results
for report in (grid_search.best_params_, best_grid, grid_search.cv_results_):
    print(report)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] alpha=1.0 .......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................................ alpha=1.0, total=   0.8s
[CV] alpha=1.0 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ........................................ alpha=1.0, total=   0.8s
[CV] alpha=1.0 .......................................................
[CV] ........................................ alpha=1.0, total=   0.8s
[CV] alpha=1.0 .......................................................
[CV] ........................................ alpha=1.0, total=   0.8s
[CV] alpha=1.0 .......................................................
[CV] ........................................ alpha=1.0, total=   0.8s
[CV] alpha=2.0 .......................................................
[CV] ........................................ alpha=2.0, total=   0.8s
[CV] alpha=2.0 .......................................................
[CV] ........................................ alpha=2.0, total=   0.7s
[CV] alpha=2.0 .......................................................
[CV] ........................................ alpha=2.0, total=   0.7s
[CV] alpha=2.0 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   18.9s finished


In [24]:
'''
4.2 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
'''
# best_grid is grid_search.best_estimator_, assigned in the cell that fits the grid search.
print(best_grid)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [26]:
# 4.2 Best parameters selected by the grid search; the recorded result is {'alpha': 1.0}
print(grid_search.best_params_)

{'alpha': 1.0}


In [27]:
# Dump the raw grid-search results: fit/score timings and per-split test scores for each alpha.
print(grid_search.cv_results_)

{'mean_fit_time': array([0.12455645, 0.11872535, 0.11887136, 0.12011218, 0.12114744]), 'std_fit_time': array([0.00657473, 0.00111062, 0.00228002, 0.00224463, 0.0048651 ]), 'mean_score_time': array([0.65118241, 0.63013186, 0.63081613, 0.62681923, 0.63890057]), 'std_score_time': array([0.00967311, 0.00486032, 0.00912582, 0.00682194, 0.00595291]), 'param_alpha': masked_array(data=[1.0, 2.0, 3.0, 4.0, 5.0],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 1.0}, {'alpha': 2.0}, {'alpha': 3.0}, {'alpha': 4.0}, {'alpha': 5.0}], 'split0_test_score': array([0.95582095, 0.95297177, 0.94955169, 0.94578998, 0.94194779]), 'split1_test_score': array([0.94932404, 0.94706809, 0.94451696, 0.94132572, 0.93759028]), 'split2_test_score': array([0.96365597, 0.9612407 , 0.95835067, 0.95514111, 0.95169281]), 'split3_test_score': array([0.96680006, 0.96461282, 0.96182384, 0.95891856, 0.95557868]), 'split4_test_score': array([0.975965