In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# PseudoCode and Task List

1. Load the pickled pandas dataframe from 04 notebook and check file contents
2. Prep the dataset for analysis
>2a. Factorize Tags column to a numeric column
>2b. Split into dev, cv, and test sets
3. First of 3 different models (Random Forest) run each with the 3 diff vectorizors - evaluate with AUC scores on the val set of each transformation -
>3.1 Count Vectorization
>3.2 TFIDF
>3.3 Doc2Vec 
4. TFIDF gave slightly higher auc scores on the cv set - all models are highly overfitted however; a final hyperparam tuning step performed to reduce overfitting was attempted



# Task 1 Load file and examine contents

In [2]:
'''
1a Import all modules that are needed
'''
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import re
import nltk
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import preprocess_string
from prettytable import PrettyTable

In [3]:
'''
1b Load file 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24330 entries, 0 to 24352
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              24330 non-null  int64 
 1   Tag             24330 non-null  object
 2   BodyText_Clean  24330 non-null  object
dtypes: int64(1), object(2)
memory usage: 760.3+ KB
'''

questions_df_clean = pd.read_pickle('/content/drive/My Drive/Capstone2/Data/questions_df_clean_11052020.pickle')
#questions_df_clean = pd.read_pickle('/content/drive/MyDrive/Data Science/Laura_CP2/Copy of questions_df_clean_11052020.pickle')
questions_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24330 entries, 0 to 24352
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              24330 non-null  int64 
 1   Tag             24330 non-null  object
 2   BodyText_Clean  24330 non-null  object
dtypes: int64(1), object(2)
memory usage: 760.3+ KB


In [4]:
'''
1c. Examine contents
'''
questions_df_clean.head()
#questions_df_clean.loc[questions_df_clean['BodyText_Clean'].isnull()]
#questions_df_clean.loc[questions_df_clean['Tag'].isnull()]

Unnamed: 0,Id,Tag,BodyText_Clean
0,5,machine-learning,always interest machine learn figure one thing...
1,7,Other,researcher instructor look opensource book sim...
2,14,data-mining,sure data science discus forum several synonym...
3,15,Other,situation would one system prefer relative adv...
4,16,machine-learning,use libsvm train data predict classification s...


# Task 2 - Prep for modelling
>2a. Factorize Tags column to numeric column
>2b. Split into dev, cv, and test sets
>2c. Verify the distribution of tags within the splits

In [5]:
''' 
2a. Factorize Tags Column to numeric
Converting tag column (our target variable) to a numeric column for modelling; 
originally converted to separate columns for each tag
that process returned 1(yes) or 0(no) for each tag name in the original Tag column
multiple y target variables are represented by these multi-labelled columns
returning a list of all these multi-label target columns
That initial process did not work well using the below code
tag_names = questions_df_clean['Tag'].unique().tolist()
#print(tag_names)
tag_dummy = pd.get_dummies(questions_df_clean['Tag'], prefix = 'Tag')
quest_df_dummies = pd.concat([questions_df_clean, tag_dummy], axis = 1)
quest_df_dummies.drop(columns='Tag', inplace=True)
quest_df_dummies.info()
Using factorize code instead suggested by Ajith
'''
questions_df_factorized = questions_df_clean.copy()
#Creating the dependent variable class
factor = pd.factorize(questions_df_factorized['Tag'])
questions_df_factorized.Tag = factor[0]
definitions = factor[1]
print(questions_df_factorized.head())
print(questions_df_clean.head())
print(definitions)
print(factor)

   Id  Tag                                     BodyText_Clean
0   5    0  always interest machine learn figure one thing...
1   7    1  researcher instructor look opensource book sim...
2  14    2  sure data science discus forum several synonym...
3  15    1  situation would one system prefer relative adv...
4  16    0  use libsvm train data predict classification s...
   Id               Tag                                     BodyText_Clean
0   5  machine-learning  always interest machine learn figure one thing...
1   7             Other  researcher instructor look opensource book sim...
2  14       data-mining  sure data science discus forum several synonym...
3  15             Other  situation would one system prefer relative adv...
4  16  machine-learning  use libsvm train data predict classification s...
Index(['machine-learning', 'Other', 'data-mining', 'bigdata', 'r',
       'statistics', 'clustering', 'recommender-system', 'nlp',
       'feature-selection', 'neural-network', '

In [6]:
'''
2b. Split into train (70%) / test (30%). Use the train data and further split into train/val split (similar ratio). 
Leaving the test split to the end.
Splitting into target (y) and predictor (X) variable sets and then into 
test and train sets and using stratification, given that the tag distribution is imbalanced
Experimented with various means to deal with stratification and multi-label classification
and decided the standard scikit learn module code works better

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X=df[list('ABCD')]
Y=pd.DataFrame(mlb.fit_transform(df[['sex','weight']].values), columns=mlb.classes_, index=df.index)

!pip install scikit-multilearn
from skmultilearn.model_selection import iterative_stratification
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size = 0.30)

'''
# Splitting X and y variables
X=questions_df_factorized[list(questions_df_factorized.columns)[2]]
y=questions_df_factorized[list(questions_df_factorized.columns)[1]]
print(X.shape)
print(y.shape)
# Splitting into train, test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, stratify=y,random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# Further splitting train into dev and validation
X_dev, X_cv, y_dev, y_cv = train_test_split(X_train,y_train,test_size = 0.30,stratify=y_train,random_state=42)
print(X_dev.shape, y_dev.shape)
print(X_cv.shape, y_cv.shape)


(24330,)
(24330,)
(17031,) (17031,)
(7299,) (7299,)
(11921,) (11921,)
(5110,) (5110,)


In [7]:
print(type(y_dev))
y_dev.isnull().sum()

<class 'pandas.core.series.Series'>


0

In [8]:
X_dev = pd.DataFrame(X_dev)
X_cv = pd.DataFrame(X_cv)
X_test = pd.DataFrame(X_test)
X_train = pd.DataFrame(X_train)
X_dev.head()

Unnamed: 0,BodyText_Clean
8286,seems like thing httpswwwsciencedirectcomscien...
23415,build 2hidden layer mlp use keras use scikit l...
23360,write fast rcnn run problem back propagation g...
1698,hear multilayer perceptron approximate functio...
24085,try correlation analysis dataset data cleanse ...


In [9]:
X_dev['BodyText_Clean'] = X_dev['BodyText_Clean'].apply(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x))
X_cv['BodyText_Clean'] = X_cv['BodyText_Clean'].apply(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x))
X_test['BodyText_Clean'] = X_test['BodyText_Clean'].apply(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x))
X_train['BodyText_Clean'] = X_train['BodyText_Clean'].apply(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x))

# Task 3 - Build first of 3 different models (Random Forest) and run with the 3 diff transformations -  measure the accuracy on the val set for each 


In [10]:
'''
3.1a. Running count vectorizer on dev and cv sets with optimal params
'''
cnt_vect = CountVectorizer(min_df=.005,max_df=.99, ngram_range=(1, 1))
X_dev_cntvect_df = pd.DataFrame(cnt_vect.fit_transform(X_dev.BodyText_Clean).toarray(), index=X_dev.index, columns=cnt_vect.get_feature_names())
print(X_dev_cntvect_df.shape)
X_dev_cntvect_df.head()
X_cv_cntvect = cnt_vect.transform(X_cv.BodyText_Clean)

# Convert cv sets using the same transformation

X_cv_cntvect_df = pd.DataFrame(cnt_vect.transform(X_cv.BodyText_Clean).toarray(), index = X_cv.index, columns = cnt_vect.get_feature_names())
X_cv_cntvect_df.head()

(11921, 1231)


Unnamed: 0,1d,2d,2nd,3d,able,absolute,accept,access,accomplish,accord,according,account,accuracy,accurate,achieve,across,action,activation,activity,actual,actually,adam,add,addition,additional,address,adjust,advance,advantage,advice,advise,affect,age,agent,aggregate,ai,aim,al,algorithm,allow,...,visualization,visualize,want,way,web,website,week,weight,weird,welcome,well,whereas,whether,whole,whose,width,wikipedia,win,window,wish,within,without,wonder,word,word2vec,work,world,worth,would,write,wrong,x1,x2,xgboost,xi,year,yes,yet,yield,zero
20034,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22608,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12975,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
614,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,0,0,0,0,0,0,0,0,0,0,0
22615,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
'''
3.1b. Running base random forest model
'''
rf_base_cv  = RandomForestClassifier(n_estimators=10, max_features='sqrt', criterion = 'entropy', random_state = 42)

rf_base_cv.fit(X_dev_cntvect_df,y_dev)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
'''
3.1c Print scores - AUC score for both dev and cv sets; very high train score - significant overfitting and val auc score of .64
'''
y_cntvect_prob_dev = rf_base_cv.predict_proba(X_dev_cntvect_df)
roc_auc_cntvect_dev = roc_auc_score(y_dev, y_cntvect_prob_dev, multi_class="ovo",
                                  average="macro")

print("CntVect RF Train AUC Score:", round(roc_auc_cntvect_dev,4))

y_cntvect_prob_val = rf_base_cv.predict_proba(X_cv_cntvect_df)
roc_auc_cntvect_val = roc_auc_score(y_cv, y_cntvect_prob_val, multi_class="ovo",
                                  average="macro")
print("CntVect RF Val AUC Score:   ", round(roc_auc_cntvect_val,4))


CntVect RF Train AUC Score: 0.9999
CntVect RF Val AUC Score:    0.6557


In [13]:
'''
3.1d. Running base random forest model with more estimators
'''
rf_base_cv_1000  = RandomForestClassifier(n_estimators=1000, max_features='sqrt', criterion = 'entropy', random_state = 42)

rf_base_cv_1000 .fit(X_dev_cntvect_df,y_dev)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [14]:
'''
3.1e Print scores - AUC score for both dev and cv sets; very high train score - still significant overfitting but increasing estimators improves val score significantly
'''
y_cntvect_prob_dev_1000 = rf_base_cv_1000.predict_proba(X_dev_cntvect_df)
roc_auc_cntvect_dev_1000 = roc_auc_score(y_dev, y_cntvect_prob_dev_1000, multi_class="ovo",
                                  average="macro")

print("CntVect RF Train AUC Score:", round(roc_auc_cntvect_dev_1000,4))

y_cntvect_prob_val_1000 = rf_base_cv_1000.predict_proba(X_cv_cntvect_df)
roc_auc_cntvect_val_1000 = roc_auc_score(y_cv, y_cntvect_prob_val_1000, multi_class="ovo",
                                  average="macro")
print("CntVect RF Val AUC Score:   ", round(roc_auc_cntvect_val_1000,4))

CntVect RF Train AUC Score: 1.0
CntVect RF Val AUC Score:    0.8627


In [15]:
'''
3.2a Run tfidf with optimal params
'''
tfidf_vect = TfidfVectorizer(min_df=.001,max_df=.999, ngram_range=(1, 1))
X_dev_tfidf_df = pd.DataFrame(tfidf_vect.fit_transform(X_dev.BodyText_Clean).toarray(), index=X_dev.index, columns=tfidf_vect.get_feature_names())
print(X_dev_tfidf_df.shape)
X_dev_tfidf_df.head()
X_cv_tfidf = tfidf_vect.transform(X_cv.BodyText_Clean)

(11921, 3144)


In [16]:
'''
3.2b Running base random forest model
'''
rf_base_tfidf  = RandomForestClassifier(n_estimators=10, max_features='sqrt', criterion = 'entropy', random_state = 42)

rf_base_tfidf.fit(X_dev_tfidf_df,y_dev)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [17]:
'''
3.2c Print scores - AUC score for both dev and cv sets; very high train score - significant overfitting again; but val scores are slightly higher than with count vectorizer
'''
y_tfidf_prob_dev = rf_base_tfidf.predict_proba(X_dev_tfidf_df)
roc_auc_tfidf_dev = roc_auc_score(y_dev, y_tfidf_prob_dev, multi_class="ovo",
                                  average="macro")

print("TFIDF RF Train AUC Score:", round(roc_auc_tfidf_dev,4))

y_tfidf_prob_val = rf_base_tfidf.predict_proba(X_cv_tfidf)
roc_auc_tfidf_val = roc_auc_score(y_cv, y_tfidf_prob_val, multi_class="ovo",
                                  average="macro")
print("TFIDF RF Val AUC Score:  ", round(roc_auc_tfidf_val,4))

TFIDF RF Train AUC Score: 0.9999
TFIDF RF Val AUC Score:   0.6285


In [18]:
'''
3.2d. Running base random forest model with more estimators
'''
rf_base_tfidf_1000  = RandomForestClassifier(n_estimators=1000, max_features='sqrt', criterion = 'entropy', random_state = 42)

rf_base_tfidf_1000.fit(X_dev_tfidf_df,y_dev)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [19]:
'''
3.2e Print scores - AUC score for both dev and cv sets; very high train score - still significant overfitting but increasing estimators improves val score significantly
this is best score so far
'''
y_tfidf_prob_dev_1000 = rf_base_tfidf_1000.predict_proba(X_dev_tfidf_df)
roc_auc_tfidf_dev_1000 = roc_auc_score(y_dev, y_tfidf_prob_dev_1000, multi_class="ovo",
                                  average="macro")

print("TFIDF RF Train AUC Score:", round(roc_auc_tfidf_dev_1000,4))

y_tfidf_prob_val_1000 = rf_base_tfidf_1000.predict_proba(X_cv_tfidf)
roc_auc_tfidf_val_1000 = roc_auc_score(y_cv, y_tfidf_prob_val_1000, multi_class="ovo",
                                  average="macro")
print("TFIDF RF Val AUC Score:   ", round(roc_auc_tfidf_val_1000,4))

TFIDF RF Train AUC Score: 1.0
TFIDF RF Val AUC Score:    0.8678


In [20]:
'''
3.3a. Loading hypertuned doc2vec model from previous notebook and re-creating the tagged docs, then evaluate
'''
fnl_d2v_model_200 = Doc2Vec.load("/content/drive/My Drive/Capstone2/Data/final_d2v_500.model")

# Build separate dataframes with the dev, cv, and test 
reversefactor = dict(zip(range(34),definitions))

y_dev_rf = np.vectorize(reversefactor.get)(y_dev)
y_cv_rf = np.vectorize(reversefactor.get)(y_cv)
y_test_rf = np.vectorize(reversefactor.get)(y_test)

d2v_dev_df = pd.DataFrame({'y': y_dev_rf, 'X': X_dev.BodyText_Clean})
d2v_cv_df = pd.DataFrame({'y': y_cv_rf, 'X': X_cv.BodyText_Clean})
d2v_test_df = pd.DataFrame({'y': y_test_rf, 'X': X_test.BodyText_Clean})
#d2v_dev_df.head()

# Ensure they're all string datatype
d2v_dev_df['X'] = d2v_dev_df['X'].astype(str)
d2v_cv_df['X'] = d2v_cv_df['X'].astype(str)
d2v_test_df['X'] = d2v_test_df['X'].astype(str)
d2v_dev_df.head()

class TaggedDocumentIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])
 
docLabels_dev = list(d2v_dev_df['y'])
data_dev = list(d2v_dev_df['X'])
tagged_docs_dev = TaggedDocumentIterator(data_dev, docLabels_dev)

docLabels_cv = list(d2v_cv_df['y'])
data_cv = list(d2v_cv_df['X'])
tagged_docs_cv = TaggedDocumentIterator(data_cv, docLabels_cv)

docLabels_test = list(d2v_test_df['y'])
data_test = list(d2v_test_df['X'])
tagged_docs_test = TaggedDocumentIterator(data_test, docLabels_test)

type(tagged_docs_dev)
d2v_dev_df.info()

fnl_dev_targets, fnl_dev_regressors = zip(*[(doc.tags[0], fnl_d2v_model_200.infer_vector(doc.words, steps=20)) for doc in tagged_docs_dev])
fnl_cv_targets, fnl_cv_regressors = zip(*[(doc.tags[0], fnl_d2v_model_200.infer_vector(doc.words, steps=20)) for doc in tagged_docs_cv])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11921 entries, 8286 to 3207
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   y       11921 non-null  object
 1   X       11921 non-null  object
dtypes: object(2)
memory usage: 279.4+ KB


In [21]:
'''
3.3b. Run base random forest multi-class model to evaluate the success of doc2vec transform on cv set
'''

fnl_rf_base_d2v  = RandomForestClassifier(n_estimators=10, max_features='sqrt', criterion = 'entropy', random_state = 42)
fnl_fit = fnl_rf_base_d2v.fit(fnl_dev_regressors, fnl_dev_targets)
fnl_cv_targets_pred_d2v = fnl_rf_base_d2v.predict(fnl_cv_regressors)


In [22]:
'''
3.3c Print scores
'''
fnl_dev_targets_prob_d2v = fnl_rf_base_d2v.predict_proba(fnl_dev_regressors)
roc_auc_d2v_dev = roc_auc_score(fnl_dev_targets, fnl_dev_targets_prob_d2v, multi_class="ovo",
                                  average="macro")

print("Doc2Vec RF Train AUC Score:", round(roc_auc_d2v_dev,4))

fnl_val_targets_prob_d2v = fnl_rf_base_d2v.predict_proba(fnl_cv_regressors)
#print(fnl_cv_targets_prob_d2v)
roc_auc_d2v_val = roc_auc_score(fnl_cv_targets, fnl_val_targets_prob_d2v, multi_class="ovo", average="macro")

print("Cross Val AUC Score:        ", round(roc_auc_d2v_val,4))

Doc2Vec RF Train AUC Score: 1.0
Cross Val AUC Score:         0.6724


In [23]:
'''
3.3d. Run base random forest multi-class model to evaluate the success of doc2vec transform on cv set - increasing estimators
'''

fnl_rf_base_d2v_1000  = RandomForestClassifier(n_estimators=1000, max_features='sqrt', criterion = 'entropy', random_state = 42)
fnl_fit_1000 = fnl_rf_base_d2v_1000 .fit(fnl_dev_regressors, fnl_dev_targets)
fnl_cv_targets_pred_d2v_1000 = fnl_rf_base_d2v_1000.predict(fnl_cv_regressors)

In [24]:
'''
3.3e Print scores
'''
fnl_dev_targets_prob_d2v_1000 = fnl_rf_base_d2v_1000.predict_proba(fnl_dev_regressors)
roc_auc_d2v_dev_1000 = roc_auc_score(fnl_dev_targets, fnl_dev_targets_prob_d2v_1000, multi_class="ovo",
                                  average="macro")

print("Doc2Vec RF Train AUC Score:", round(roc_auc_d2v_dev_1000,4))

fnl_val_targets_prob_d2v_1000 = fnl_rf_base_d2v_1000.predict_proba(fnl_cv_regressors)
#print(fnl_cv_targets_prob_d2v)
roc_auc_d2v_val_1000 = roc_auc_score(fnl_cv_targets, fnl_val_targets_prob_d2v_1000, multi_class="ovo", average="macro")

print("Cross Val AUC Score:        ", round(roc_auc_d2v_val_1000,4))

Doc2Vec RF Train AUC Score: 1.0
Cross Val AUC Score:         0.8574


In [26]:
'''
Code for creating a nice table for model comparison results
'''
x = PrettyTable()
x.field_names = ["Model","Estimators","Transformation",'AUC Train Score' ,'AUC Val Score']


x.add_row(["Random Forest",1000, "Tf-idf",round(roc_auc_tfidf_dev_1000,4),round(roc_auc_tfidf_val_1000,4)])
x.add_row(["Random Forest",1000, "Count Vectorizer", round(roc_auc_cntvect_dev_1000,4),round(roc_auc_cntvect_val_1000,4)])
x.add_row(["Random Forest",1000, "Doc2Vec", round(roc_auc_d2v_dev_1000,4), round(roc_auc_d2v_val_1000,4)])
x.add_row(["Random Forest",10, "Count Vectorizer", round(roc_auc_cntvect_dev,4),round(roc_auc_cntvect_val,4)])
x.add_row(["Random Forest",10, "Doc2Vec", round(roc_auc_d2v_dev,4), round(roc_auc_d2v_val,4)])
x.add_row(["Random Forest",10, "Tf-idf", round(roc_auc_tfidf_dev,4),round(roc_auc_tfidf_val,4)])

print(x)


+---------------+------------+------------------+-----------------+---------------+
|     Model     | Estimators |  Transformation  | AUC Train Score | AUC Val Score |
+---------------+------------+------------------+-----------------+---------------+
| Random Forest |    1000    |      Tf-idf      |       1.0       |     0.8678    |
| Random Forest |    1000    | Count Vectorizer |       1.0       |     0.8627    |
| Random Forest |    1000    |     Doc2Vec      |       1.0       |     0.8574    |
| Random Forest |     10     | Count Vectorizer |      0.9999     |     0.6557    |
| Random Forest |     10     |     Doc2Vec      |       1.0       |     0.6724    |
| Random Forest |     10     |      Tf-idf      |      0.9999     |     0.6285    |
+---------------+------------+------------------+-----------------+---------------+


# Task 4 - hypertune to reduce overfitting

In [None]:
'''
4.1 Grid Search 5 fold CV to see if we can reduce overfitting
parameters tested will be 
max_features = ['sqrt', .25, .50] of the number of features - not included as this gave an error
min_samples_leaf = [1,2,5] branches will stop splitting once the leaves have that number of samples default is 1
mtry(number of features randomly selected for each tree) - did not find this parameter as an option (same as max_samples?)
max_depth = [None, 5,10,15] (how deep the tree can growand reduction of this parameter will reduce complexity) default is None
n_estimators = [100,500,1000,2000] default is 100 - best param was 100
'''
# Create the parameter grid 
param_grid = {
    'n_estimators': [10,100,500,1000],
    'min_samples_leaf' : [1,2,5],
    'max_depth' : [None, 5,10,15]
}
# Create a based model
rf = RandomForestClassifier(max_features='sqrt', criterion = 'entropy', random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring= 'roc_auc_ovo', cv = 5, verbose = 2)

In [None]:
# 4.2 Fit the grid search to the tfidf trained data since this provided the best accuracy of all 3 transformations
grid_search.fit(X_dev_tfidf_df,y_dev)
best_grid = grid_search.best_estimator_

print(grid_search.best_params_)
print(best_grid)
print(grid_search.cv_results_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] max_depth=None, min_samples_leaf=1, n_estimators=10 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=None, min_samples_leaf=1, n_estimators=10, total=   4.2s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=10 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV]  max_depth=None, min_samples_leaf=1, n_estimators=10, total=   3.6s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=10 .............
[CV]  max_depth=None, min_samples_leaf=1, n_estimators=10, total=   3.7s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=10 .............
[CV]  max_depth=None, min_samples_leaf=1, n_estimators=10, total=   3.6s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=10 .............
[CV]  max_depth=None, min_samples_leaf=1, n_estimators=10, total=   3.7s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=100 ............
[CV]  max_depth=None, min_samples_leaf=1, n_estimators=100, total=  30.6s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=100 ............
[CV]  max_depth=None, min_samples_leaf=1, n_estimators=100, total=  30.3s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=100 ............
[CV]  max_depth=None, min_samples_leaf=1, n_estimators=100, total=  30.6s
[CV] max_depth=None, min_samples_leaf=1, n_estimators=100 ..

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 230.7min finished


{'max_depth': 15, 'min_samples_leaf': 5, 'n_estimators': 1000}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=15, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
{'mean_fit_time': array([  3.17725468,  29.76168466, 150.53063316, 269.6014957 ,
         2.35120544,  22.43549848, 118.49562855, 242.60547099,
         2.18398728,  20.68777142, 103.86691227, 196.11157722,
         0.84222093,   6.29203572,  30.64689741,  61.64968052,
         0.78786922,   6.25896435,  30.39445848,  60.5715065 ,
         0.73844991,   6.03005462,  29.45691643,  58.8666

In [None]:
'''
4.2 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=15, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
'''
print(best_grid)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [None]:
# 4.2 
print(grid_search.best_params_)
{'max_depth': 15, 'min_samples_leaf': 5, 'n_estimators': 1000}

{'n_estimators': 100}


In [None]:
print(grid_search.cv_results_)

{'mean_fit_time': array([ 27.2721221 , 148.15297918, 297.07272348, 545.12819142]), 'std_fit_time': array([ 0.9856402 ,  5.57172965, 12.11067414,  3.14160984]), 'mean_score_time': array([0.24836831, 1.26323528, 2.56578836, 5.39648328]), 'std_score_time': array([0.00745542, 0.03476885, 0.10739159, 0.87424313]), 'param_n_estimators': masked_array(data=[100, 500, 1000, 2000],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 100}, {'n_estimators': 500}, {'n_estimators': 1000}, {'n_estimators': 2000}], 'split0_test_score': array([0.34884696, 0.34842767, 0.34884696, 0.34884696]), 'split1_test_score': array([0.35025168, 0.34479866, 0.34437919, 0.34521812]), 'split2_test_score': array([0.34689597, 0.35067114, 0.35151007, 0.34857383]), 'split3_test_score': array([0.34815436, 0.34983221, 0.34815436, 0.34815436]), 'split4_test_score': array([0.34647651, 0.33766779, 0.34018456, 0.34060403]), 'mean_test_score': array([0.348

In [None]:
'''
4.3 Running optimized random forest model with best results from gridsearchcv
'''
rf_fnl_tfidf  = RandomForestClassifier(n_estimators=1000, max_features='sqrt', criterion = 'entropy', max_depth= 15, min_samples_leaf = 5,random_state = 42)

rf_fnl_tfidf.fit(X_dev_tfidf_df,y_dev)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=15, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
'''
4.4 Print scores - AUC score for both dev and cv sets; do optimized rf params reduce overfitting - yes but still pretty highly overfitted
'''
y_tfidf_prob_dev_fnl = rf_fnl_tfidf.predict_proba(X_dev_tfidf_df)
roc_auc_tfidf_dev_fnl = roc_auc_score(y_dev, y_tfidf_prob_dev_fnl, multi_class="ovo",
                                  average="macro")

print("TFIDF RF Train AUC Score:", round(roc_auc_tfidf_dev_fnl,4))

y_tfidf_prob_val_fnl = rf_fnl_tfidf.predict_proba(X_cv_tfidf)
roc_auc_tfidf_val_fnl = roc_auc_score(y_cv, y_tfidf_prob_val_fnl, multi_class="ovo",
                                  average="macro")
print("TFIDF RF Val AUC Score:   ", round(roc_auc_tfidf_val_fnl,4))

TFIDF RF Train AUC Score: 0.976
TFIDF RF Val AUC Score:    0.8836
