In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split
import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess, lemmatize
import gensim.corpora as corpora
import re
from nltk.corpus import stopwords
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from collections import Counter
from topic_modeling import get_new_topic_probs, get_topic_probs
import pronouncing
import copy
import textstat
nlp = spacy.load("en_core_web_lg")
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import log_loss
from keras import models
from keras import layers
from keras import optimizers
import random
from keras.callbacks import ModelCheckpoint
from AutoCluster import AutoKMeans
from sklearn.metrics import confusion_matrix
import pickle
import tqdm

## Part 1: Preprocessing

1. Import datasets
2. Split training data into train and val
3. Create dummy variable for each author from training data
4. Split newly created dummies into train and val

In [None]:
# Load the labelled training data and the unlabelled test data.
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# The label encoder must exist before it is used in the split below; on a
# fresh kernel no earlier cell creates it.
encoder = LabelEncoder()
X_train, X_val, y_train, y_val = train_test_split(
    df.drop('author', axis=1),
    encoder.fit_transform(df.author),
    random_state=0,
    test_size=.2,
)

# One 0/1 indicator column per author, indexed like `df`.
# The indicators are built directly on `author_df`: assigning `df.Poe = ...`
# only sets an attribute on `df` and would leave `author_df` filled with zeros.
author_df = pd.DataFrame(
    {
        'Poe': (df.author == 'EAP').astype(int),
        'Lovecraft': (df.author == 'HPL').astype(int),
        'Shelley': (df.author == 'MWS').astype(int),
    },
    columns=['Poe', 'Lovecraft', 'Shelley'],
    index=df.index,
)

# Same random_state and test_size as the split above, so these per-author
# targets contain the same rows, in the same order, as X_train / X_val.
Poey_train, Poey_val = train_test_split(author_df.Poe, random_state=0, test_size=.2)
Lovecrafty_train, Lovecrafty_val = train_test_split(author_df.Lovecraft, random_state=0, test_size=.2)
Shelleyy_train, Shelleyy_val = train_test_split(author_df.Shelley, random_state=0, test_size=.2)

# AdaBoost with an LSTM neural network as the base estimator

1. Tokenize the texts
2. Instantiate AdaBoost with an LSTM base estimator
3. Get multi-class and per-author binary (multi-label) predictions

In [None]:
#Tokenize texts
# The tokenizer has to be created before it can be fitted. Its vocabulary is
# capped at 5000 word indices so that every id fits the 5000-row Embedding
# layer built in `lstm`; keep the two numbers in sync.
tk = Tokenizer(num_words=5000)
tk.fit_on_texts(X_train.text)  # vocabulary is learned from the training texts only

# Every split is converted to integer sequences and padded/truncated to 255
# tokens, the sequence length the LSTM expects by default.
train_post = tk.texts_to_sequences(X_train.text)
X_train_post = pad_sequences(train_post, 255, truncating='post')
val_post = tk.texts_to_sequences(X_val.text)
X_val_post = pad_sequences(val_post, 255, truncating='post')
test_post = tk.texts_to_sequences(test_df.text)
X_test_post = pad_sequences(test_post, 255, truncating='post')

In [None]:
#Define neural network architecture within a function, apply sklearn wrapper and instantiate adaboost
np.random.seed(0)


def lstm(max_sequence_len=255, num_classes=2):
    """Build and compile the LSTM text classifier used as the AdaBoost base estimator.

    Architecture: Embedding(5000 words -> 32 dims) -> SpatialDropout1D(0.2)
    -> LSTM(100 units, dropout and recurrent dropout 0.2) -> Dense softmax.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded input sequences.
    num_classes : int
        Width of the output layer. It must equal the number of distinct labels
        the model is trained on, because the loss is categorical cross-entropy
        over one-hot targets.

    Returns
    -------
    A compiled ``keras.models.Sequential`` model.
    """
    num_words = 5000
    embed_vec_len = 32
    lstm_nn = models.Sequential()
    lstm_nn.add(layers.Embedding(num_words, embed_vec_len, input_length=max_sequence_len))
    lstm_nn.add(layers.SpatialDropout1D(0.2))
    lstm_nn.add(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    # Softmax (not sigmoid) so the outputs form a probability distribution over
    # the classes, which is what categorical cross-entropy and AdaBoost's use of
    # predict_proba expect.
    lstm_nn.add(layers.Dense(num_classes, activation='softmax'))
    lstm_nn.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['categorical_accuracy'])
    return lstm_nn


class _LabelAwareKerasClassifier(KerasClassifier):
    """KerasClassifier that sizes the network's output layer from the labels it is fitted on.

    The same `ada` ensemble is fitted on the three-author target and on the
    per-author 0/1 targets, so a hard-coded output width cannot be right for
    both. NOTE(review): relies on the legacy keras.wrappers.scikit_learn
    behaviour of forwarding matching `set_params` keys to `build_fn` - confirm
    against the installed Keras version.
    """

    def fit(self, x, y, sample_weight=None, **kwargs):
        """Set `num_classes` from `y`, then delegate to KerasClassifier.fit."""
        labels = np.asarray(y)
        if labels.ndim == 2 and labels.shape[1] > 1:
            # Already one-hot encoded: one column per class.
            n_classes = labels.shape[1]
        else:
            n_classes = len(np.unique(labels))
        self.set_params(num_classes=n_classes)
        return super(_LabelAwareKerasClassifier, self).fit(x, y, sample_weight=sample_weight, **kwargs)


model = _LabelAwareKerasClassifier(build_fn=lstm, epochs=1, batch_size=256, verbose=2)
ada = AdaBoostClassifier(base_estimator=model, n_estimators=25)

In [None]:
# The three padded sequence matrices every fitted ensemble is asked to predict.
ada_splits = (X_train_post, X_val_post, X_test_post)

# Multiclass AdaBoost: fit on the encoded author labels, then predict each split.
ada.fit(X_train_post, y_train)
ada_overall_train, ada_overall_val, ada_overall_test = [ada.predict(split) for split in ada_splits]

# Per-author (binary) AdaBoost. The same `ada` object is refitted for each
# author, so predictions are collected before the next fit; after this cell
# `ada` holds the Shelley model.
ada.fit(X_train_post, Poey_train)
ada_poe_train, ada_poe_val, ada_poe_test = [ada.predict(split) for split in ada_splits]

ada.fit(X_train_post, Lovecrafty_train)
ada_lovecraft_train, ada_lovecraft_val, ada_lovecraft_test = [ada.predict(split) for split in ada_splits]

ada.fit(X_train_post, Shelleyy_train)
ada_shelley_train, ada_shelley_val, ada_shelley_test = [ada.predict(split) for split in ada_splits]

# TF-IDF vectorization

1. Vectorize the texts
2. Predict author with Naive Bayes
3. Predict author with an SVM
4. Cluster the vectors with KMeans

In [None]:
tfidf = TfidfVectorizer(stop_words='english', max_features=12000)
nb = MultinomialNB()

svm = SVC(kernel='linear')
# Only create the label encoder if no earlier cell has done so, so that an
# encoder that has already been fitted is never replaced by an unfitted one.
if 'encoder' not in globals():
    encoder = LabelEncoder()
#tokenize
# The vocabulary and IDF weights are learned from the training texts only.
# Validation and test texts are projected with `transform`; refitting on them
# would produce feature columns that do not match what the classifiers were
# trained on.
t_X_train = tfidf.fit_transform(X_train.text)
t_X_val = tfidf.transform(X_val.text)
t_X_test = tfidf.transform(test_df.text)

def _nb_fit_predict(target):
    """Refit `nb` on the training TF-IDF matrix against `target` and predict all splits.

    Returns a (train, val, test) tuple of prediction arrays.
    """
    nb.fit(t_X_train, target)
    return nb.predict(t_X_train), nb.predict(t_X_val), nb.predict(t_X_test)


#overall nb
nb_train, nb_val, nb_test = _nb_fit_predict(y_train)

#binary nb
# One refit per author. Every prediction uses the shared t_X_* matrices; the
# Lovecraft predictions previously referenced undefined `lt_X_train` /
# `lt_X_val` names and raised NameError.
nb_poe_train, nb_poe_val, nb_poe_test = _nb_fit_predict(Poey_train)

nb_lovecraft_train, nb_lovecraft_val, nb_lovecraft_test = _nb_fit_predict(Lovecrafty_train)

nb_shelley_train, nb_shelley_val, nb_shelley_test = _nb_fit_predict(Shelleyy_train)

# TF-IDF matrices the fitted SVM is asked to label, in train / val / test order.
svm_inputs = (t_X_train, t_X_val, t_X_test)

#overall svm
svm.fit(t_X_train, y_train)
svm_train, svm_val, svm_test = [svm.predict(matrix) for matrix in svm_inputs]

#binary svm
# The same `svm` object is refitted once per author; predictions are taken
# before the next fit overwrites it.
svm.fit(t_X_train, Poey_train)
svm_poe_train, svm_poe_val, svm_poe_test = [svm.predict(matrix) for matrix in svm_inputs]

svm.fit(t_X_train, Lovecrafty_train)
svm_lovecraft_train, svm_lovecraft_val, svm_lovecraft_test = [svm.predict(matrix) for matrix in svm_inputs]

svm.fit(t_X_train, Shelleyy_train)
svm_shelley_train, svm_shelley_val, svm_shelley_test = [svm.predict(matrix) for matrix in svm_inputs]

#KMeans Clustering
# AutoKMeans is imported from the local AutoCluster module. It is given the three
# TF-IDF matrices and its result is unpacked into one value per split
# (presumably the cluster assignments - confirm against AutoCluster).
cluster_results = AutoKMeans(t_X_train, t_X_val, t_X_test)
train_clusters, val_clusters, test_clusters = cluster_results

# Combine results into DataFrames

In [None]:
def _prediction_dummies(predictions, names, reference_columns=None):
    """Turn several 1-D prediction arrays into one dummy-encoded DataFrame.

    Parameters
    ----------
    predictions : sequence of array-like
        One array of predicted labels per model, all of the same length.
    names : sequence of str
        Column name for each array, in the same order as `predictions`.
    reference_columns : pandas.Index or list, optional
        Dummy columns of the training frame. When given, the result is aligned
        to exactly these columns (missing ones filled with 0) so validation and
        test frames always match the training layout, even if a split never
        receives a particular predicted label.

    Returns
    -------
    pandas.DataFrame of dummy columns named ``<name>_<label>``.
    """
    # Predictions are 1-D, so they are stacked side by side as columns;
    # np.concatenate(..., axis=1) fails on 1-D inputs.
    stacked = np.column_stack([np.asarray(p).ravel() for p in predictions])
    frame = pd.DataFrame(stacked, columns=list(names)).astype(str)
    if reference_columns is None:
        return pd.get_dummies(frame, drop_first=True)
    return pd.get_dummies(frame).reindex(columns=reference_columns, fill_value=0)


#Overall Results
# One input column per model; get_dummies(drop_first=True) then yields the
# `<model>_<label>` indicator columns (e.g. Ada_1, Ada_2).
columns = ['Ada', 'NB', 'SVM']
overall_train = _prediction_dummies([ada_overall_train, nb_train, svm_train], columns)
overall_val = _prediction_dummies([ada_overall_val, nb_val, svm_val], columns, overall_train.columns)
overall_test = _prediction_dummies([ada_overall_test, nb_test, svm_test], columns, overall_train.columns)

#Binary Results
# Nine input columns: each author crossed with each model, in the same order
# as the prediction arrays passed below.
author_columns = ['Poe_'+i for i in columns] + ['Lovecraft_'+i for i in columns] + ['Shelley_'+i for i in columns]
author_train = _prediction_dummies(
    [ada_poe_train, nb_poe_train, svm_poe_train,
     ada_lovecraft_train, nb_lovecraft_train, svm_lovecraft_train,
     ada_shelley_train, nb_shelley_train, svm_shelley_train],
    author_columns,
)
author_val = _prediction_dummies(
    [ada_poe_val, nb_poe_val, svm_poe_val,
     ada_lovecraft_val, nb_lovecraft_val, svm_lovecraft_val,
     ada_shelley_val, nb_shelley_val, svm_shelley_val],
    author_columns,
    author_train.columns,
)
author_test = _prediction_dummies(
    [ada_poe_test, nb_poe_test, svm_poe_test,
     ada_lovecraft_test, nb_lovecraft_test, svm_lovecraft_test,
     ada_shelley_test, nb_shelley_test, svm_shelley_test],
    author_columns,
    author_train.columns,
)

def _cluster_dummies(assignments):
    """Wrap cluster assignments in a one-column 'Clusters' frame of strings and dummy-encode it, dropping the first level."""
    as_text = pd.DataFrame(assignments, columns=['Clusters']).astype(str)
    return pd.get_dummies(as_text, drop_first=True)


#Clusters
# Each split's cluster assignments are replaced by their dummy-encoded frame.
train_clusters = _cluster_dummies(train_clusters)
val_clusters = _cluster_dummies(val_clusters)
test_clusters = _cluster_dummies(test_clusters)