In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# PseudoCode and Task List

1. Load the pickled pandas dataframe from 04 notebook and check file contents
2. Prep the dataset for analysis
>2a. Factorize Tags column to a numeric column
>2b. Split into dev, cv, and test sets
>2c. Verify the distribution of tags within the splits
3. Run the third of 3 different models (LightGBM) with each of the 3 different vectorizers - evaluate with AUC scores on the val set of each transformation



# Task 1 - Load file and examine contents

In [None]:
'''
1a Import all modules that are needed
'''
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import re
import nltk
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import preprocess_string
from sklearn.preprocessing import StandardScaler
from prettytable import PrettyTable

In [None]:
'''
1b Load file 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24330 entries, 0 to 24352
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              24330 non-null  int64 
 1   Tag             24330 non-null  object
 2   BodyText_Clean  24330 non-null  object
dtypes: int64(1), object(2)
memory usage: 760.3+ KB
'''

# Load the cleaned questions dataframe that was pickled by the earlier notebook.
# NOTE(review): unpickling can execute arbitrary code stored in the file - only load pickles from a trusted source.
questions_df_clean = pd.read_pickle('/content/drive/My Drive/Capstone2/Data/questions_df_clean_11052020.pickle')
#questions_df_clean = pd.read_pickle('/content/drive/MyDrive/Data Science/Laura_CP2/Copy of questions_df_clean_11052020.pickle')
# Print column names, non-null counts and dtypes as a quick sanity check of the load.
questions_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24330 entries, 0 to 24352
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              24330 non-null  int64 
 1   Tag             24330 non-null  object
 2   BodyText_Clean  24330 non-null  object
dtypes: int64(1), object(2)
memory usage: 760.3+ KB


In [None]:
'''
1c. Examine contents
'''
# Preview the first rows; the commented-out lines are ad-hoc checks for null text / null tags.
questions_df_clean.head()
#questions_df_clean.loc[questions_df_clean['BodyText_Clean'].isnull()]
#questions_df_clean.loc[questions_df_clean['Tag'].isnull()]

Unnamed: 0,Id,Tag,BodyText_Clean
0,5,machine-learning,always interest machine learn figure one thing...
1,7,Other,researcher instructor look opensource book sim...
2,14,data-mining,sure data science discus forum several synonym...
3,15,Other,situation would one system prefer relative adv...
4,16,machine-learning,use libsvm train data predict classification s...


# Task 2 - Prep for modelling
>2a. Factorize Tags column to numeric column
>2b. Split into dev, cv, and test sets
>2c. Verify the distribution of tags within the splits

In [None]:
'''
2a. Factorize Tags Column to numeric
The Tag column (our target variable) is converted to integer codes for modelling.

An earlier attempt one-hot encoded the tags into one 1(yes)/0(no) column per tag
name, giving a list of multi-label target columns. That did not work well:
tag_names = questions_df_clean['Tag'].unique().tolist()
#print(tag_names)
tag_dummy = pd.get_dummies(questions_df_clean['Tag'], prefix = 'Tag')
quest_df_dummies = pd.concat([questions_df_clean, tag_dummy], axis = 1)
quest_df_dummies.drop(columns='Tag', inplace=True)
quest_df_dummies.info()
pd.factorize is used instead, as suggested by Ajith.
'''
# factor is the (codes, uniques) pair returned by pd.factorize;
# `definitions` keeps the unique tag names so codes can be mapped back later.
factor = pd.factorize(questions_df_clean['Tag'])
definitions = factor[1]

# Work on a copy so the original string tags stay available in questions_df_clean.
questions_df_factorized = questions_df_clean.assign(Tag=factor[0])

# Show the encoded frame next to the original, then the code -> tag lookup and the raw factorize result.
print(
    questions_df_factorized.head(),
    questions_df_clean.head(),
    definitions,
    factor,
    sep="\n",
)

   Id  Tag                                     BodyText_Clean
0   5    0  always interest machine learn figure one thing...
1   7    1  researcher instructor look opensource book sim...
2  14    2  sure data science discus forum several synonym...
3  15    1  situation would one system prefer relative adv...
4  16    0  use libsvm train data predict classification s...
   Id               Tag                                     BodyText_Clean
0   5  machine-learning  always interest machine learn figure one thing...
1   7             Other  researcher instructor look opensource book sim...
2  14       data-mining  sure data science discus forum several synonym...
3  15             Other  situation would one system prefer relative adv...
4  16  machine-learning  use libsvm train data predict classification s...
Index(['machine-learning', 'Other', 'data-mining', 'bigdata', 'r',
       'statistics', 'clustering', 'recommender-system', 'nlp',
       'feature-selection', 'neural-network', '

In [None]:
'''
2b. Split into train (70%) / test (30%), then split the train portion again
into dev / cv with the same 70/30 ratio. The test split is left to the end.

Both splits are stratified on the target because the tag distribution is
imbalanced. Various approaches to stratification for multi-label classification
were tried first; the standard scikit-learn splitter worked better.

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X=df[list('ABCD')]
Y=pd.DataFrame(mlb.fit_transform(df[['sex','weight']].values), columns=mlb.classes_, index=df.index)

!pip install scikit-multilearn
from skmultilearn.model_selection import iterative_stratification
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size = 0.30)

'''
# Predictor = third column, target = second column of the factorized frame.
X = questions_df_factorized.iloc[:, 2]
y = questions_df_factorized.iloc[:, 1]
print(X.shape, y.shape, sep="\n")

# First split: train vs. held-out test, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Second split: carve the train portion into dev (fit) and cv (validation).
X_dev, X_cv, y_dev, y_cv = train_test_split(
    X_train, y_train,
    test_size=0.30,
    stratify=y_train,
    random_state=42,
)
print(X_dev.shape, y_dev.shape)
print(X_cv.shape, y_cv.shape)


(24330,)
(24330,)
(17031,) (17031,)
(7299,) (7299,)
(11921,) (11921,)
(5110,) (5110,)


In [None]:
# Confirm the container type of the dev target and count missing labels in it.
print(type(y_dev))
y_dev.isnull().sum()

<class 'pandas.core.series.Series'>


0

In [None]:
# Wrap each text split in a DataFrame so later cells can address the text as a named column.
X_dev, X_cv, X_test, X_train = (
    pd.DataFrame(text_split) for text_split in (X_dev, X_cv, X_test, X_train)
)
X_dev.head()

Unnamed: 0,BodyText_Clean
8286,seems like thing httpswwwsciencedirectcomscien...
23415,build 2hidden layer mlp use keras use scikit l...
23360,write fast rcnn run problem back propagation g...
1698,hear multilayer perceptron approximate functio...
24085,try correlation analysis dataset data cleanse ...


In [None]:
# Replace whitespace-delimited, digits-only tokens (at the start, middle or end of the text) with a single space.
# The pattern is a raw string so `\d` / `\s` are regex escapes rather than (invalid) Python string escapes,
# and it is compiled once instead of being re-specified for every split.
# NOTE(review): matches cannot overlap, so in a run of adjacent numbers ("a 1 2 b") every other number
# survives because its leading separator was consumed by the previous match. Left as-is to keep the
# features identical to earlier runs - confirm whether that is intended.
_STANDALONE_NUMBER = re.compile(r"^\d+\s|\s\d+\s|\s\d+$")

for _split_frame in (X_dev, X_cv, X_test, X_train):
    _split_frame['BodyText_Clean'] = _split_frame['BodyText_Clean'].apply(
        lambda text: _STANDALONE_NUMBER.sub(" ", text)
    )

In [None]:
# Create dataframe from Excel spreadsheet with 1000 randomized parameter settings
# Each row is one parameter combination; the tuning loops read its columns by name
# (learning_rate, n_estimators, num_leaves, ...).

param_sample_1000_df = pd.read_excel('/content/drive/My Drive/Capstone2/Data/LAE_Lgbm_ParamTuning_Grid.xlsx')
#param_sample_1000
param_sample_1000_df.head()

Unnamed: 0.1,Unnamed: 0,subsample,learning_rate,min_data_in_leaf,lambda_l2,colsample_bytree,max_depth,max_bin,min_gain_to_split,num_leaves,min_split_gain,n_estimators,lambda_l1
0,0,0.2,0.005,50,50,0.2,23,200,0.5,125,0.5,1500,10
1,1,0.4,0.01,50,50,0.6,19,100,0.0,45,0.5,1250,50
2,2,1.0,0.01,50,50,0.6,17,100,0.0,125,0.5,1000,0
3,3,0.8,0.005,25,25,0.4,19,50,0.5,85,1.0,1500,50
4,4,0.6,0.01,100,50,0.6,19,200,10.0,45,1.0,1000,10


# Task 3 - Build the third of 3 different models (LightGBM) and run it with the 3 different transformations - measure the AUC on the val set for each
>3.1. Count Vectorizer
>3.2. TFIDF
>3.3. Doc2Vec


In [None]:
'''
3.1a. Running count vectorizer on dev, cv and test sets with optimal params.
The vocabulary is fitted on dev only; cv and test are transformed with that
same fitted vectorizer.
'''
cnt_vect = CountVectorizer(min_df=.005,max_df=.99, ngram_range=(1, 1))

# Fit on dev and keep the sparse result just long enough to densify it.
_X_dev_cntvect_sparse = cnt_vect.fit_transform(X_dev.BodyText_Clean)

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever the installed version provides so the cell runs on both old and new releases.
_cntvect_names_fn = getattr(cnt_vect, "get_feature_names_out", None) or cnt_vect.get_feature_names
cntvect_feature_names = list(_cntvect_names_fn())

X_dev_cntvect_df = pd.DataFrame(_X_dev_cntvect_sparse.toarray(), index=X_dev.index, columns=cntvect_feature_names)
print(X_dev_cntvect_df.shape)

# Convert cv set using the same transformation (transform once, reuse the sparse result)
X_cv_cntvect = cnt_vect.transform(X_cv.BodyText_Clean)
X_cv_cntvect_df = pd.DataFrame(X_cv_cntvect.toarray(), index = X_cv.index, columns = cntvect_feature_names)

# Convert test set using the same transformation
X_test_cntvect_df = pd.DataFrame(cnt_vect.transform(X_test.BodyText_Clean).toarray(), index = X_test.index, columns = cntvect_feature_names)
X_test_cntvect_df.head()

(11921, 1231)


Unnamed: 0,1d,2d,2nd,3d,able,absolute,accept,access,accomplish,accord,according,account,accuracy,accurate,achieve,across,action,activation,activity,actual,actually,adam,add,addition,additional,address,adjust,advance,advantage,advice,advise,affect,age,agent,aggregate,ai,aim,al,algorithm,allow,...,visualization,visualize,want,way,web,website,week,weight,weird,welcome,well,whereas,whether,whole,whose,width,wikipedia,win,window,wish,within,without,wonder,word,word2vec,work,world,worth,would,write,wrong,x1,x2,xgboost,xi,year,yes,yet,yield,zero
23307,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10984,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0
3768,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6506,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
20872,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0


In [None]:
'''
3.1.b. Scaling the results
Standardize the count-vectorized features: the scaler is fitted on dev only and
then applied unchanged to cv and test.
'''
sc = StandardScaler()
# fit_transform already fits the scaler, so no separate sc.fit(...) pass over the dev matrix is needed.
# The scaled frames get a default integer index and integer column labels (original labels are not carried over).
X_dev_cntvect_df_sc = pd.DataFrame(sc.fit_transform(X_dev_cntvect_df))
X_cv_cntvect_df_sc = pd.DataFrame(sc.transform(X_cv_cntvect_df))
X_test_cntvect_df_sc = pd.DataFrame(sc.transform(X_test_cntvect_df))

In [None]:
# Commented-out scratch: pull one parameter row (position 361) out as a single-row frame.
#param_sample_space = pd.DataFrame(pd.DataFrame(param_sample_1000_df.iloc[361,1:]).transpose())
#param_sample_space.head()
# Number of parameter combinations in the grid.
print(len(param_sample_1000_df))

1000


In [None]:
'''
3.1c. LGBM hypertuning for count vectorization transformation
Warning this can take 24-48 hrs to run

For every row of param_sample_1000_df: fit an LGBMClassifier on the scaled dev
features (with early stopping on the eval sets), compute macro one-vs-one AUC on
the dev / cv / test splits, and record parameters + scores as one report row.
The report is written to Excel every 10th iteration and once more at the end.
'''
cntvect_lgbm_final_report = pd.DataFrame()
# Per-iteration single-row frames; combined with pd.concat instead of the removed
# (pandas >= 2.0) DataFrame.append, which also re-copied the whole report each iteration.
cntvect_lgbm_findings = []

for i in range(0, len(param_sample_1000_df)):
  
  print("Running iteration: " + str(i))

  # Single-row frame with this iteration's parameters (the first spreadsheet column is skipped).
  param_sample_space = pd.DataFrame(pd.DataFrame(param_sample_1000_df.iloc[i,1:]).transpose())
  #record parameters
  lrng_rate = float(param_sample_1000_df.loc[i,'learning_rate'])
  n_est = int(param_sample_1000_df.loc[i,'n_estimators'])
  n_leaves = int(param_sample_1000_df.loc[i,'num_leaves'])
  m_depth = int(param_sample_1000_df.loc[i,'max_depth'])
  col_samp_tree = float(param_sample_1000_df.loc[i,'colsample_bytree'])
  m_bin = int(param_sample_1000_df.loc[i,'max_bin'])
  s_sample = float(param_sample_1000_df.loc[i,'subsample'])
  m_split_gain = float(param_sample_1000_df.loc[i,'min_split_gain'])
  m_data_leaf = int(param_sample_1000_df.loc[i,'min_data_in_leaf'])
  lamb_l1 = float(param_sample_1000_df.loc[i,'lambda_l1'])
  lamb_l2 = float(param_sample_1000_df.loc[i,'lambda_l2'])
  m_gain_split = float(param_sample_1000_df.loc[i,'min_gain_to_split'])
  
  # Intialize parameters with current parameter combination
  # NOTE(review): LightGBM documents min_split_gain and min_gain_to_split as aliases of one
  # parameter, so the two grid columns presumably cannot both take effect - confirm which wins.
  # NOTE(review): subsample (bagging_fraction) is documented as only active when a bagging
  # frequency > 0 is set; none is set here - confirm whether subsample has any effect.
  model1 = lgb.LGBMClassifier(learning_rate = lrng_rate,
                             n_estimators = n_est,
                             num_leaves = n_leaves,
                             max_depth = m_depth,
                             max_bin = m_bin,
                             colsample_bytree = col_samp_tree,
                             subsample = s_sample,
                             min_split_gain = m_split_gain,
                             min_data_in_leaf = m_data_leaf,
                             lambda_l1 = lamb_l1,
                             lambda_l2 = lamb_l2,
                             min_gain_to_split = m_gain_split,
                             boosting_type = 'gbdt',
                             objective = 'multiclass',
                             metric = 'multi_logloss',
                             silent = False,
                             bagging_seed = 2018,
                             num_class = len(np.unique(y_dev)),
                             #drop_rate = 0.25, # not necessary
                             #drop_seed = 2018,# not necessary
                             data_random_seed = 2018,
                             random_state = 2018)
  # fit the model with results from count vectorization process above
  # NOTE(review): early_stopping_rounds / verbose as fit() keywords were removed in LightGBM 4.0
  # (callbacks replace them) - confirm the installed version before re-running.
  model1.fit(X=X_dev_cntvect_df_sc,y=y_dev.values.ravel(),
          eval_set=[(X_dev_cntvect_df_sc,y_dev.values.ravel()),
                    (X_cv_cntvect_df_sc,y_cv.values.ravel())],
           early_stopping_rounds=10,verbose=50)
  
  # Predict on various datasets
  dev_trn_pred = model1.predict_proba(X_dev_cntvect_df_sc)
  dev_val_pred = model1.predict_proba(X_cv_cntvect_df_sc)
  dev_oot_pred = model1.predict_proba(X_test_cntvect_df_sc)
  
  # NOTE(review): the held-out test split is scored on every iteration; choose the winning
  # parameters by val_auc only, otherwise the test score stops being an unbiased estimate.
  train_auc = roc_auc_score(y_dev,dev_trn_pred,multi_class="ovo", average = "macro")
  val_auc = roc_auc_score(y_cv,dev_val_pred,multi_class="ovo", average = "macro")
  oot_auc = roc_auc_score(y_test,dev_oot_pred,multi_class="ovo", average = "macro")
  
  print("train_auc : {:.2}".format(train_auc))
  print("val_auc: {:.2}".format(val_auc))
  print("oot_auc: {:.2}".format(oot_auc))
  
  loc_observations = pd.DataFrame({
  'train_auc': train_auc,
  'val_auc': val_auc,
  'oot_auc': oot_auc
  }, index = [0])
  
  # One report row: parameters side by side with the three AUC scores.
  local_findings = pd.concat([param_sample_space.reset_index(), loc_observations], axis = 1).drop(['index'], axis = 1)
  cntvect_lgbm_findings.append(local_findings)
  gc.collect() # Memory optimization to clear buffers
  if i%10 == 0:
    # Periodic checkpoint so a crash or disconnect does not lose many hours of results.
    cntvect_lgbm_final_report = pd.concat(cntvect_lgbm_findings)
    cntvect_lgbm_final_report.to_excel("/content/drive/My Drive/Capstone2/Data/cntvect_lgbm_final_report.xlsx")

#Remember to export out final report here so it is saved after this long process

if cntvect_lgbm_findings:
  cntvect_lgbm_final_report = pd.concat(cntvect_lgbm_findings)
cntvect_lgbm_final_report.to_excel("/content/drive/My Drive/Capstone2/Data/cntvect_lgbm_final_report.xlsx")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
oot_auc: 0.75
Running iteration: 575
Training until validation scores don't improve for 10 rounds.
[50]	valid_0's multi_logloss: 1.89972	valid_1's multi_logloss: 2.06622
[100]	valid_0's multi_logloss: 1.70588	valid_1's multi_logloss: 1.96804
[150]	valid_0's multi_logloss: 1.60233	valid_1's multi_logloss: 1.93779
[200]	valid_0's multi_logloss: 1.54196	valid_1's multi_logloss: 1.92923
[250]	valid_0's multi_logloss: 1.50533	valid_1's multi_logloss: 1.9266
Early stopping, best iteration is:
[279]	valid_0's multi_logloss: 1.49165	valid_1's multi_logloss: 1.92584
train_auc : 0.96
val_auc: 0.87
oot_auc: 0.88
Running iteration: 576
Training until validation scores don't improve for 10 rounds.
[50]	valid_0's multi_logloss: 2.41339	valid_1's multi_logloss: 2.47469
[100]	valid_0's multi_logloss: 2.35648	valid_1's multi_logloss: 2.41822
[150]	valid_0's multi_logloss: 2.34073	valid_1's multi_logloss: 2.40222
Early stopping, best itera

In [None]:
'''
3.2a Run tfidf with optimal params on dev, cv and test sets.
The vocabulary and idf weights are fitted on dev only; cv and test are
transformed with that same fitted vectorizer.
'''
tfidf_vect = TfidfVectorizer(min_df=.001,max_df=.999, ngram_range=(1, 1))

# Fit on dev and keep the sparse result just long enough to densify it.
_X_dev_tfidf_sparse = tfidf_vect.fit_transform(X_dev.BodyText_Clean)

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever the installed version provides so the cell runs on both old and new releases.
_tfidf_names_fn = getattr(tfidf_vect, "get_feature_names_out", None) or tfidf_vect.get_feature_names
tfidf_feature_names = list(_tfidf_names_fn())

X_dev_tfidf_df = pd.DataFrame(_X_dev_tfidf_sparse.toarray(), index=X_dev.index, columns=tfidf_feature_names)
print(X_dev_tfidf_df.shape)

# Convert cv set using the same transformation (transform once, reuse the sparse result)
X_cv_tfidf = tfidf_vect.transform(X_cv.BodyText_Clean)
X_cv_tfidf_df = pd.DataFrame(X_cv_tfidf.toarray(), index = X_cv.index, columns = tfidf_feature_names)

# Convert test set using the same transformation
X_test_tfidf_df = pd.DataFrame(tfidf_vect.transform(X_test.BodyText_Clean).toarray(), index = X_test.index, columns = tfidf_feature_names)
X_test_tfidf_df.head()

(11921, 3144)


Unnamed: 0,100k,10fold,10k,1d,1m,1st,1x1,20k,2d,2nd,2x2,30k,3d,3rd,3x3,4th,500k,50k,5fold,5k,5th,5x5,8gb,a1,a2,ab,abbreviation,abc,ability,able,abnormal,absolute,absolutely,abstract,academic,acc,accelerate,acceleration,accelerometer,accept,...,wt,x0,x1,x2,x3,x4,xaxis,xgboost,xi,xml,xn,xor,xt,xtest,xtrain,xy,xyz,y0,y1,y2,yaxis,year,yellow,yes,yesno,yet,yi,yield,yolo,york,youtube,ypred,yt,ytest,ytrain,ytrue,zero,zip,zoom,zscore
23307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149301,0.0,0.0,0.0
20872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.095232,0.095761,0.0,0.0,0.0,0.0,0.0,0.0,0.118902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110429,0.117484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
'''
3.2.b. Scaling the results
Standardize the tfidf features: the scaler is fitted on dev only and then
applied unchanged to cv and test.
'''
sc = StandardScaler()
# fit_transform already fits the scaler, so no separate sc.fit(...) pass over the dev matrix is needed.
# The scaled frames get a default integer index and integer column labels (original labels are not carried over).
X_dev_tfidf_df_sc = pd.DataFrame(sc.fit_transform(X_dev_tfidf_df))
X_cv_tfidf_df_sc = pd.DataFrame(sc.transform(X_cv_tfidf_df))
X_test_tfidf_df_sc = pd.DataFrame(sc.transform(X_test_tfidf_df))


In [None]:
'''
3.2c. LGBM hypertuning for tfidf transformation
Warning this can take 48-96 hrs to run

For every row of param_sample_1000_df: fit an LGBMClassifier on the scaled dev
features (with early stopping on the eval sets), compute macro one-vs-one AUC on
the dev / cv / test splits, and record parameters + scores as one report row.
The report is written to Excel every 10th iteration and once more at the end.
'''
tfidf_lgbm_final_report = pd.DataFrame()
# Per-iteration single-row frames; combined with pd.concat instead of the removed
# (pandas >= 2.0) DataFrame.append, which also re-copied the whole report each iteration.
tfidf_lgbm_findings = []

for i in range(0, len(param_sample_1000_df)):
  
  print("Running iteration: " + str(i))

  # Single-row frame with this iteration's parameters (the first spreadsheet column is skipped).
  param_sample_space = pd.DataFrame(pd.DataFrame(param_sample_1000_df.iloc[i,1:]).transpose())
  #record parameters
  lrng_rate = float(param_sample_1000_df.loc[i,'learning_rate'])
  n_est = int(param_sample_1000_df.loc[i,'n_estimators'])
  n_leaves = int(param_sample_1000_df.loc[i,'num_leaves'])
  m_depth = int(param_sample_1000_df.loc[i,'max_depth'])
  col_samp_tree = float(param_sample_1000_df.loc[i,'colsample_bytree'])
  m_bin = int(param_sample_1000_df.loc[i,'max_bin'])
  s_sample = float(param_sample_1000_df.loc[i,'subsample'])
  m_split_gain = float(param_sample_1000_df.loc[i,'min_split_gain'])
  m_data_leaf = int(param_sample_1000_df.loc[i,'min_data_in_leaf'])
  lamb_l1 = float(param_sample_1000_df.loc[i,'lambda_l1'])
  lamb_l2 = float(param_sample_1000_df.loc[i,'lambda_l2'])
  m_gain_split = float(param_sample_1000_df.loc[i,'min_gain_to_split'])
  
  # Intialize parameters with current parameter combination
  # NOTE(review): LightGBM documents min_split_gain and min_gain_to_split as aliases of one
  # parameter, so the two grid columns presumably cannot both take effect - confirm which wins.
  # NOTE(review): subsample (bagging_fraction) is documented as only active when a bagging
  # frequency > 0 is set; none is set here - confirm whether subsample has any effect.
  model1 = lgb.LGBMClassifier(learning_rate = lrng_rate,
                             n_estimators = n_est,
                             num_leaves = n_leaves,
                             max_depth = m_depth,
                             max_bin = m_bin,
                             colsample_bytree = col_samp_tree,
                             subsample = s_sample,
                             min_split_gain = m_split_gain,
                             min_data_in_leaf = m_data_leaf,
                             lambda_l1 = lamb_l1,
                             lambda_l2 = lamb_l2,
                             min_gain_to_split = m_gain_split,
                             boosting_type = 'gbdt',
                             objective = 'multiclass',
                             metric = 'multi_logloss',
                             silent = False,
                             bagging_seed = 2018,
                             num_class = len(np.unique(y_dev)),
                             #drop_rate = 0.25, # not necessary
                             #drop_seed = 2018,# not necessary
                             data_random_seed = 2018,
                             random_state = 2018)
  # fit the model with results from the tfidf vectorization process above
  # NOTE(review): early_stopping_rounds / verbose as fit() keywords were removed in LightGBM 4.0
  # (callbacks replace them) - confirm the installed version before re-running.
  model1.fit(X=X_dev_tfidf_df_sc,y=y_dev.values.ravel(),
          eval_set=[(X_dev_tfidf_df_sc,y_dev.values.ravel()),
                    (X_cv_tfidf_df_sc,y_cv.values.ravel())],
           early_stopping_rounds=10,verbose=50)
  
  # Predict on various datasets
  dev_trn_pred = model1.predict_proba(X_dev_tfidf_df_sc)
  dev_val_pred = model1.predict_proba(X_cv_tfidf_df_sc)
  dev_oot_pred = model1.predict_proba(X_test_tfidf_df_sc)
  
  # NOTE(review): the held-out test split is scored on every iteration; choose the winning
  # parameters by val_auc only, otherwise the test score stops being an unbiased estimate.
  train_auc = roc_auc_score(y_dev,dev_trn_pred,multi_class="ovo", average = "macro")
  val_auc = roc_auc_score(y_cv,dev_val_pred,multi_class="ovo", average = "macro")
  oot_auc = roc_auc_score(y_test,dev_oot_pred,multi_class="ovo", average = "macro")
  
  print("train_auc : {:.2}".format(train_auc))
  print("val_auc: {:.2}".format(val_auc))
  print("oot_auc: {:.2}".format(oot_auc))
  
  loc_observations = pd.DataFrame({
  'train_auc': train_auc,
  'val_auc': val_auc,
  'oot_auc': oot_auc
  }, index = [0])
  
  # One report row: parameters side by side with the three AUC scores.
  local_findings = pd.concat([param_sample_space.reset_index(), loc_observations], axis = 1).drop(['index'], axis = 1)
  tfidf_lgbm_findings.append(local_findings)
  gc.collect() # Memory optimization to clear buffers
  if i%10 == 0:
    # Periodic checkpoint so a crash or disconnect does not lose many hours of results.
    tfidf_lgbm_final_report = pd.concat(tfidf_lgbm_findings)
    tfidf_lgbm_final_report.to_excel("/content/drive/My Drive/Capstone2/Data/tfidf_lgbm_final_report.xlsx")

#Remember to export out final report here so it is saved after this long process

if tfidf_lgbm_findings:
  tfidf_lgbm_final_report = pd.concat(tfidf_lgbm_findings)
tfidf_lgbm_final_report.to_excel("/content/drive/My Drive/Capstone2/Data/tfidf_lgbm_final_report.xlsx")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[100]	valid_0's multi_logloss: 2.43538	valid_1's multi_logloss: 2.60985
[150]	valid_0's multi_logloss: 2.37279	valid_1's multi_logloss: 2.55691
[200]	valid_0's multi_logloss: 2.33111	valid_1's multi_logloss: 2.52132
[250]	valid_0's multi_logloss: 2.29951	valid_1's multi_logloss: 2.4949
[300]	valid_0's multi_logloss: 2.27329	valid_1's multi_logloss: 2.4738
[350]	valid_0's multi_logloss: 2.25162	valid_1's multi_logloss: 2.45667
[400]	valid_0's multi_logloss: 2.23293	valid_1's multi_logloss: 2.4426
[450]	valid_0's multi_logloss: 2.21645	valid_1's multi_logloss: 2.43078
[500]	valid_0's multi_logloss: 2.20154	valid_1's multi_logloss: 2.4203
[550]	valid_0's multi_logloss: 2.1879	valid_1's multi_logloss: 2.41108
[600]	valid_0's multi_logloss: 2.17554	valid_1's multi_logloss: 2.40289
[650]	valid_0's multi_logloss: 2.16449	valid_1's multi_logloss: 2.3956
[700]	valid_0's multi_logloss: 2.15464	valid_1's multi_logloss: 2.38907
[750]

In [None]:
'''
3.3a. Loading hypertuned doc2vec model from previous notebook and re-creating the tagged docs, then evaluate
'''
fnl_d2v_model_500 = Doc2Vec.load("/content/drive/My Drive/Capstone2/Data/final_d2v_500.model")

# Build separate dataframes with the dev, cv, and test 
# Map each factorized code back to its tag name. The lookup is built from `definitions`
# itself rather than a hard-coded class count, so it stays correct if the number of tags changes.
reversefactor = dict(enumerate(definitions))

y_dev_rf = np.vectorize(reversefactor.get)(y_dev)
y_cv_rf = np.vectorize(reversefactor.get)(y_cv)
y_test_rf = np.vectorize(reversefactor.get)(y_test)

# 'y' holds the tag name, 'X' the cleaned body text of the same split.
d2v_dev_df = pd.DataFrame({'y': y_dev_rf, 'X': X_dev.BodyText_Clean})
d2v_cv_df = pd.DataFrame({'y': y_cv_rf, 'X': X_cv.BodyText_Clean})
d2v_test_df = pd.DataFrame({'y': y_test_rf, 'X': X_test.BodyText_Clean})
#d2v_dev_df.head()

# Ensure they're all string datatype
d2v_dev_df['X'] = d2v_dev_df['X'].astype(str)
d2v_cv_df['X'] = d2v_cv_df['X'].astype(str)
d2v_test_df['X'] = d2v_test_df['X'].astype(str)
d2v_dev_df.head()

class TaggedDocumentIterator(object):
    """Re-iterable stream of TaggedDocument objects built from parallel lists.

    doc_list    -- whitespace-separated text, one string per document
    labels_list -- one label per document, at the same position as its text

    Each iteration pass yields one TaggedDocument per text, with the text split
    on whitespace as ``words`` and a one-element ``tags`` list holding its label.
    """

    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for position, text in enumerate(self.doc_list):
            tokens = text.split()
            label = self.labels_list[position]
            yield TaggedDocument(words=tokens, tags=[label])
 
# Pair each split's texts with their tag names in a re-iterable TaggedDocument stream.
docLabels_dev, data_dev = list(d2v_dev_df['y']), list(d2v_dev_df['X'])
tagged_docs_dev = TaggedDocumentIterator(data_dev, docLabels_dev)

docLabels_cv, data_cv = list(d2v_cv_df['y']), list(d2v_cv_df['X'])
tagged_docs_cv = TaggedDocumentIterator(data_cv, docLabels_cv)

docLabels_test, data_test = list(d2v_test_df['y']), list(d2v_test_df['X'])
tagged_docs_test = TaggedDocumentIterator(data_test, docLabels_test)

type(tagged_docs_dev)
d2v_dev_df.info()


def _infer_targets_and_vectors(tagged_docs):
    """Return (targets, vectors) for a stream of tagged documents.

    targets are each document's first tag; vectors are inferred from the
    document's words by the loaded Doc2Vec model.
    """
    # NOTE(review): gensim 4 renamed infer_vector's `steps` argument to `epochs` - confirm the installed version.
    pairs = [
        (doc.tags[0], fnl_d2v_model_500.infer_vector(doc.words, steps=20))
        for doc in tagged_docs
    ]
    return zip(*pairs)


fnl_dev_targets, fnl_dev_regressors = _infer_targets_and_vectors(tagged_docs_dev)
fnl_cv_targets, fnl_cv_regressors = _infer_targets_and_vectors(tagged_docs_cv)
fnl_test_targets, fnl_test_regressors = _infer_targets_and_vectors(tagged_docs_test)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11921 entries, 8286 to 3207
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   y       11921 non-null  object
 1   X       11921 non-null  object
dtypes: object(2)
memory usage: 279.4+ KB


In [None]:
'''
3.3.b. Scaling the results
Standardize the inferred Doc2Vec vectors: the scaler is fitted on dev only and
then applied unchanged to cv and test.
'''
sc = StandardScaler()
# fit_transform already fits the scaler, so no separate sc.fit(...) pass over the dev vectors is needed.
X_dev_d2v_df_sc = pd.DataFrame(sc.fit_transform(fnl_dev_regressors))
X_cv_d2v_df_sc = pd.DataFrame(sc.transform(fnl_cv_regressors))
X_test_d2v_df_sc = pd.DataFrame(sc.transform(fnl_test_regressors))

In [None]:
'''
3.3c. LGBM hypertuning for Doc2Vec transformation
Warning this can take 48-96 hrs to run

For every row of param_sample_1000_df: fit an LGBMClassifier on the scaled dev
features (with early stopping on the eval sets), compute macro one-vs-one AUC on
the dev / cv / test splits, and record parameters + scores as one report row.
The report is written to Excel every 10th iteration and once more at the end.
'''
d2v_lgbm_500_final_report = pd.DataFrame()
# Per-iteration single-row frames; combined with pd.concat instead of the removed
# (pandas >= 2.0) DataFrame.append, which also re-copied the whole report each iteration.
d2v_lgbm_500_findings = []

for i in range(0, len(param_sample_1000_df)):
  
  print("Running iteration: " + str(i))

  # Single-row frame with this iteration's parameters (the first spreadsheet column is skipped).
  param_sample_space = pd.DataFrame(pd.DataFrame(param_sample_1000_df.iloc[i,1:]).transpose())
  #record parameters
  lrng_rate = float(param_sample_1000_df.loc[i,'learning_rate'])
  n_est = int(param_sample_1000_df.loc[i,'n_estimators'])
  n_leaves = int(param_sample_1000_df.loc[i,'num_leaves'])
  m_depth = int(param_sample_1000_df.loc[i,'max_depth'])
  col_samp_tree = float(param_sample_1000_df.loc[i,'colsample_bytree'])
  m_bin = int(param_sample_1000_df.loc[i,'max_bin'])
  s_sample = float(param_sample_1000_df.loc[i,'subsample'])
  m_split_gain = float(param_sample_1000_df.loc[i,'min_split_gain'])
  m_data_leaf = int(param_sample_1000_df.loc[i,'min_data_in_leaf'])
  lamb_l1 = float(param_sample_1000_df.loc[i,'lambda_l1'])
  lamb_l2 = float(param_sample_1000_df.loc[i,'lambda_l2'])
  m_gain_split = float(param_sample_1000_df.loc[i,'min_gain_to_split'])
  
  # Intialize parameters with current parameter combination
  # NOTE(review): LightGBM documents min_split_gain and min_gain_to_split as aliases of one
  # parameter, so the two grid columns presumably cannot both take effect - confirm which wins.
  # NOTE(review): subsample (bagging_fraction) is documented as only active when a bagging
  # frequency > 0 is set; none is set here - confirm whether subsample has any effect.
  model1 = lgb.LGBMClassifier(learning_rate = lrng_rate,
                             n_estimators = n_est,
                             num_leaves = n_leaves,
                             max_depth = m_depth,
                             max_bin = m_bin,
                             colsample_bytree = col_samp_tree,
                             subsample = s_sample,
                             min_split_gain = m_split_gain,
                             min_data_in_leaf = m_data_leaf,
                             lambda_l1 = lamb_l1,
                             lambda_l2 = lamb_l2,
                             min_gain_to_split = m_gain_split,
                             boosting_type = 'gbdt',
                             objective = 'multiclass',
                             metric = 'multi_logloss',
                             silent = False,
                             bagging_seed = 2018,
                             num_class = len(np.unique(y_dev)),
                             #drop_rate = 0.25, # not necessary
                             #drop_seed = 2018,# not necessary
                             data_random_seed = 2018,
                             random_state = 2018)
  # fit the model with the scaled Doc2Vec vectors from the process above
  # NOTE(review): early_stopping_rounds / verbose as fit() keywords were removed in LightGBM 4.0
  # (callbacks replace them) - confirm the installed version before re-running.
  model1.fit(X=X_dev_d2v_df_sc,y=y_dev.values.ravel(),
          eval_set=[(X_dev_d2v_df_sc,y_dev.values.ravel()),
                    (X_cv_d2v_df_sc,y_cv.values.ravel())],
           early_stopping_rounds=10,verbose=50)
  
  # Predict on various datasets
  dev_trn_pred = model1.predict_proba(X_dev_d2v_df_sc)
  dev_val_pred = model1.predict_proba(X_cv_d2v_df_sc)
  dev_oot_pred = model1.predict_proba(X_test_d2v_df_sc)
  
  # NOTE(review): the held-out test split is scored on every iteration; choose the winning
  # parameters by val_auc only, otherwise the test score stops being an unbiased estimate.
  train_auc = roc_auc_score(y_dev,dev_trn_pred,multi_class="ovo", average = "macro")
  val_auc = roc_auc_score(y_cv,dev_val_pred,multi_class="ovo", average = "macro")
  oot_auc = roc_auc_score(y_test,dev_oot_pred,multi_class="ovo", average = "macro")
  
  print("train_auc : {:.2}".format(train_auc))
  print("val_auc: {:.2}".format(val_auc))
  print("oot_auc: {:.2}".format(oot_auc))
  
  loc_observations = pd.DataFrame({
  'train_auc': train_auc,
  'val_auc': val_auc,
  'oot_auc': oot_auc
  }, index = [0])
  
  # One report row: parameters side by side with the three AUC scores.
  local_findings = pd.concat([param_sample_space.reset_index(), loc_observations], axis = 1).drop(['index'], axis = 1)
  d2v_lgbm_500_findings.append(local_findings)
  gc.collect() # Memory optimization to clear buffers
  if i%10 == 0:
    # Periodic checkpoint so a crash or disconnect does not lose many hours of results.
    d2v_lgbm_500_final_report = pd.concat(d2v_lgbm_500_findings)
    d2v_lgbm_500_final_report.to_excel("/content/drive/My Drive/Capstone2/Data/d2v_lgbm_500_final_report.xlsx")

#Remember to export out final report here so it is saved after this long process

if d2v_lgbm_500_findings:
  d2v_lgbm_500_final_report = pd.concat(d2v_lgbm_500_findings)
d2v_lgbm_500_final_report.to_excel("/content/drive/My Drive/Capstone2/Data/d2v_lgbm_500_final_report.xlsx")

Running iteration: 620
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[4]	valid_0's multi_logloss: 0.998163	valid_1's multi_logloss: 2.32301
train_auc : 0.99
val_auc: 0.76
oot_auc: 0.77
Running iteration: 621
Training until validation scores don't improve for 10 rounds.
[50]	valid_0's multi_logloss: 1.47007	valid_1's multi_logloss: 2.26905
[100]	valid_0's multi_logloss: 1.12728	valid_1's multi_logloss: 2.23966
Early stopping, best iteration is:
[93]	valid_0's multi_logloss: 1.15867	valid_1's multi_logloss: 2.23933
train_auc : 0.97
val_auc: 0.76
oot_auc: 0.77
Running iteration: 622
Training until validation scores don't improve for 10 rounds.
Early stopping, best iteration is:
[6]	valid_0's multi_logloss: 1.07231	valid_1's multi_logloss: 2.30396
train_auc : 0.98
val_auc: 0.78
oot_auc: 0.78
Running iteration: 623
Training until validation scores don't improve for 10 rounds.
[50]	valid_0's multi_logloss: 2.30937	valid_1's multi_logloss: 2.