## 1. Imports (Libraries)

<a id='imports'></a>

In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
from ast import literal_eval

In [2]:
# Import visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Import NLP libraries
import re
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
from gensim.models import Word2Vec

In [5]:
# Import sklearn libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import multilabel_confusion_matrix

In [8]:
# Uncomment the install line below if scikit-multilearn is not installed.
# `%pip` (rather than `!pip`) targets the environment of the running kernel, and the
# version is pinned to the one reported as installed in this cell's recorded output.

#%pip install scikit-multilearn==0.2.0

# NOTE(review): iterative_train_test_split is not called in the visible cells — confirm it is still needed.
from skmultilearn.model_selection import iterative_train_test_split

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
     ---------------------------------------- 0.0/89.4 kB ? eta -:--:--
     --------------------------- ------------ 61.4/89.4 kB 1.7 MB/s eta 0:00:01
     ---------------------------------------- 89.4/89.4 kB 1.3 MB/s eta 0:00:00
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
from xgboost import XGBClassifier

In [None]:
import shap

In [None]:
# Display tweaks: the frames have many columns and some cells hold long strings,
# so relax the truncation limits before inspecting anything.

# numpy: never summarise arrays with "..."
np.set_printoptions(threshold=np.inf)

# pandas: show up to 100 characters per cell and every column
for _opt, _val in (('display.max_colwidth', 100),
                   ('display.max_columns', None)):
    pd.set_option(_opt, _val)

## 2. Imports (Data)

<a id='data_imports'></a>

Since we exported the cleaned datasets as csv files in Part II, we can simply import those files and start working on them immediately. The two columns we are most concerned with are the trope label ('Trope Name' in the training file, '10 Tropes' in the test file) and 'text_lemma'.

Data dictionary:

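|column| datatype|explanation|
|:-|:-:|:-|
|<b>Trope Name</b> / <b>10 Tropes</b>|*string*| The trope label of each text: 'Trope Name' in the training file and '10 Tropes' in the test file (renamed to 'Trope Name' after loading). The test labels are parsed with `literal_eval` later in the notebook, so they are expected to be written as Python lists.|
|<b>text_lemma</b>| *string*| The text of each entry as whitespace-separated tokens; it is split into word chunks before modelling.|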
<br>

In [None]:
# Load the training and test CSV files (paths are relative to the notebook's working directory)
train_df = pd.read_csv('10_tropes.csv')
test_df = pd.read_csv('simpsons_10_tropes.csv')

In [None]:
# Inspect the test data as loaded from 'simpsons_10_tropes.csv'
test_df

In [None]:
# Inspect the training data as loaded from '10_tropes.csv'
train_df

In [None]:
def split_into_chunks(df, label_col, text_col='text_lemma', n_chunks=30):
    """Split every document into consecutive word chunks, one output row per chunk.

    Each text is split on whitespace and cut into pieces of
    ``len(words) // n_chunks`` words (never fewer than one word per piece).
    A document therefore yields ``n_chunks`` rows when its length is an exact
    multiple of ``n_chunks`` and more rows otherwise, because the leftover
    words form additional chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``label_col`` and ``text_col``.
    label_col : str
        Column whose value is repeated on every chunk of a document.
    text_col : str, default 'text_lemma'
        Column containing the whitespace-separated text.
    n_chunks : int, default 30
        Divisor used to derive the chunk size; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``[label_col, text_col]`` with a fresh RangeIndex. Rows keep
        the document order and, within a document, the chunk order.
        Missing or empty texts contribute no rows.

    Raises
    ------
    ValueError
        If ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError('n_chunks must be at least 1')

    records = []
    for label, text in zip(df[label_col], df[text_col]):
        if not isinstance(text, str):
            continue  # missing text (e.g. NaN): nothing to split
        words = text.split()
        # A text shorter than n_chunks words would give a chunk size of 0,
        # and range() raises ValueError for a zero step.
        step = max(1, len(words) // n_chunks)
        records.extend((label, ' '.join(words[start:start + step]))
                       for start in range(0, len(words), step))
    return pd.DataFrame(records, columns=[label_col, text_col])


# NOTE: this rebinds train_df, so re-running the cell would chunk the chunks again;
# reload the CSV before re-running it.
train_df = split_into_chunks(train_df, label_col='Trope Name')

In [None]:
# (rows, columns) of the training frame after chunking
train_df.shape

In [None]:
# Name the two columns explicitly: the label first, then the chunk text
train_df.columns = ['Trope Name', 'text_lemma']

In [None]:
# Features are the text chunks, targets are their trope labels
X, y = train_df['text_lemma'], train_df['Trope Name']


In [None]:
# Keep 80% of the chunks for training and the rest for testing;
# stratify=y keeps the label proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

# Report the size of every partition as a sanity check
print(f'The X train set is {X_train.shape[0]} rows long.')
print(f'The y train set is {y_train.shape[0]} rows long.')
print(f'The X test set is {X_test.shape[0]} rows long.')
print(f'The y test set is {y_test.shape[0]} rows long.')


In [None]:
# Model: token counts -> logistic regression, wrapped in a 3-fold grid search
pipe_cv_lr = Pipeline(
    steps=[
        ('cvec', CountVectorizer()),
        ('logreg', LogisticRegression(solver='liblinear')),
    ]
)

# Every list holds a single value, so the search fits exactly one configuration.
# The trailing comments list other values (presumably candidates explored earlier).
pipe_cv_lr_params = {
    'cvec__max_features': [5000],    # 2000, 3000, 4000, 5000
    'cvec__min_df': [3],             # 2, 3
    'cvec__max_df': [.85],           # .85, .90, .95
    'cvec__ngram_range': [(1,2)],    # (1,1), (1,2), (1,3), (2,2)
    'logreg__C': [0.1],              # 0.05, 0.1, 1
    'logreg__penalty': ['l2'],       # 'l1', 'l2'
}

gs_cv_lr = GridSearchCV(estimator=pipe_cv_lr,
                        param_grid=pipe_cv_lr_params,
                        cv=3)

gs_cv_lr.fit(X_train, y_train)


In [None]:
# Class predictions for the training and test parts, then class probabilities for the test part
y_pred_cv_lr_train, y_pred_cv_lr = (
    gs_cv_lr.predict(features) for features in (X_train, X_test)
)
y_pred_proba_cv_lr = gs_cv_lr.predict_proba(X_test)


In [None]:
# Class probabilities for each partition
pred_prob_train, pred_prob_test = (
    gs_cv_lr.predict_proba(features) for features in (X_train, X_test)
)

# One-vs-rest ROC-AUC, micro-averaged, on each partition
auc_score_train = roc_auc_score(y_train, pred_prob_train,
                                average="micro", multi_class="ovr")
auc_score_test = roc_auc_score(y_test, pred_prob_test,
                               average="micro", multi_class="ovr")

print(f'ROC-AUC on training set: {auc_score_train}')
print(f'ROC-AUC on testing set: {auc_score_test}')


# Per-class precision / recall / F1 on the test part
print(classification_report(y_test, y_pred_cv_lr))

In [None]:
# Cut every test document into consecutive word chunks (one row per chunk), the same
# way the training texts are chunked.
# - dropna() before the split: a missing text has no words to chunk.
# - max(1, ...): a text shorter than 30 words would otherwise give a chunk size of 0,
#   and range() raises ValueError for a zero step.
# - dropna() after stack(): shorter documents are padded with NaN by apply(); drop the
#   padding explicitly in case the installed pandas keeps it when stacking.
# NOTE: this rebinds test_df, so re-running the cell requires reloading the CSV first.
test_df = (
    test_df.set_index(['10 Tropes'])['text_lemma']
    .dropna()
    .str.split()
    .apply(lambda x: pd.Series([' '.join(x[i:i + max(1, len(x) // 30)])
                                for i in range(0, len(x), max(1, len(x) // 30))],
                               dtype=object))
    .stack()
    .dropna()
    .reset_index()
    .drop('level_1', axis=1)
)

In [None]:
# Rename the columns so the test frame uses the same names as the training frame
test_df.columns = ['Trope Name', 'text_lemma']

In [None]:
# Features are the test text chunks, targets are their (still unparsed) label values
X, y = test_df['text_lemma'], test_df['Trope Name']


In [None]:
# 80/20 split of the chunked test frame, stratified on the label column.
# NOTE(review): X_train / X_test / y_train / y_test produced here are reassigned by the
# later split on the binarised labels before any visible cell reads them — confirm
# whether this cell is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

# Report the size of every partition as a sanity check
print(f'The X train set is {X_train.shape[0]} rows long.')
print(f'The y train set is {y_train.shape[0]} rows long.')
print(f'The X test set is {X_test.shape[0]} rows long.')
print(f'The y test set is {y_test.shape[0]} rows long.')


In [None]:
# Parse each label from its text form into the Python object it spells out
# (presumably a list of trope names, since the column is binarised next).
# Values that are not strings are passed through untouched, so re-running this cell
# on already-parsed labels no longer raises ValueError inside literal_eval.
test_df['Trope Name'] = test_df['Trope Name'].apply(
    lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# One indicator column per distinct label found in 'Trope Name', aligned row-for-row with test_df
mlb = MultiLabelBinarizer()
binarised_df = pd.DataFrame(
    data=mlb.fit_transform(test_df['Trope Name']),  # evaluated first, so classes_ is set below
    index=test_df.index,
    columns=mlb.classes_,
)


In [None]:
# Preview the indicator columns; the frame has one row per text chunk, so show only
# the first rows instead of rendering the whole frame.
binarised_df.head()

In [None]:
# Attach the indicator columns and discard the original 'Trope Name' column
test_df = (
    test_df
    .join(binarised_df)
    .drop(columns=['Trope Name'])
)

In [None]:
# Preview the frame now that the label column has been replaced by indicator columns
test_df.head()

In [None]:
# Features: the text chunks; targets: every remaining column (the label indicators)
X, y = test_df['text_lemma'], test_df.drop(columns=['text_lemma'])

In [None]:
# 80/20 split of the chunks; y is the multi-column indicator frame here and is also
# passed as the stratification target.
# NOTE(review): iterative_train_test_split is imported at the top of the notebook but
# this cell uses the plain train_test_split — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

# Report the size of every partition as a sanity check
print(f'The X train set is {X_train.shape[0]} rows long.')
print(f'The y train set is {y_train.shape[0]} rows long.')
print(f'The X test set is {X_test.shape[0]} rows long.')
print(f'The y test set is {y_test.shape[0]} rows long.')


In [None]:
# Model: token counts -> XGBoost (linear booster) wrapped in MultiOutputClassifier so the
# multi-column target can be fitted; tuned through a 3-fold grid search.
pipe_cv_xgb = Pipeline(
    steps=[
        ('cvec', CountVectorizer()),
        ('xgb', MultiOutputClassifier(
            estimator=XGBClassifier(booster = 'gblinear', eta = 0.05))),
    ]
)

# Every list holds a single value, so the search fits exactly one configuration.
# The trailing comments list other values (presumably candidates explored earlier).
pipe_cv_xgb_params = {
    'cvec__max_features': [5000],    # 2000, 3000, 4000, 5000
    'cvec__max_df': [.85],           # .85, .90, .95
    'cvec__ngram_range': [(1,2)],    # (1,1), (1,2), (1,3), (2,2)
}

gs_cv_xgb = GridSearchCV(estimator=pipe_cv_xgb,
                         param_grid=pipe_cv_xgb_params,
                         cv=3)

gs_cv_xgb.fit(X_train, y_train)


In [None]:
# Label predictions for the training and test parts, then probabilities for the test part
y_pred_cv_xgb_train, y_pred_cv_xgb = (
    gs_cv_xgb.predict(features) for features in (X_train, X_test)
)
y_pred_proba_cv_xgb = gs_cv_xgb.predict_proba(X_test)


In [None]:
# Score of the fitted grid search on the training data. No `scoring` argument was passed
# to GridSearchCV for this model, so the estimator's own default score is used.
gs_cv_xgb.score(X_train, y_train)

In [None]:
# Per-label precision / recall / F1 on the held-out chunks.
# y_test is the indicator frame, so its column names are passed as target_names;
# without them the report rows are identified only by column position.
print(classification_report(y_test, y_pred_cv_xgb,
                            target_names=[str(label) for label in y_test.columns]))