<a href="https://colab.research.google.com/github/kgedney/author-id-project/blob/master/exp2_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Modelling for Data with Named Entities Removed
### ANLY 590 Project


***

#### Google Colab Prep

In [0]:
import numpy as np
import pandas as pd

In [4]:
# Mount Google Drive into the Colab filesystem so the project data can be read from it
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


The cross-topic preprocessing was done in this notebook: https://github.com/kgedney/author-id-project/blob/master/Cross_Topic_Processing.ipynb 
<br/>Its output was saved to a CSV file; we load that tokenized dataframe here to begin modeling.

In [59]:
# The token CSV has to be copied from the local machine to Drive by hand before this cell runs
tokens_csv_path = '/content/drive/My Drive/Colab Notebooks/author_tokens_df.csv'
df = pd.read_csv(tokens_csv_path)
df.head()

Unnamed: 0,author,work
0,author,['body']
1,GuaranteedAdmission,"['You', 'can', 'be', 'in', 'favor', 'of', 'a',..."
2,GuaranteedAdmission,"['That', 'depends', '.', 'What', 'is', 'the', ..."
3,GuaranteedAdmission,"['Because', 'I', 'like', 'to', 'think', 'long'..."
4,GuaranteedAdmission,"['We', 'do', ""n't"", 'talk', 'about', 'the', 'H..."


Remove the first row of the dataframe: it holds the column names rather than data.

In [0]:
# Drop the stray header record (author == 'author', shown as row 0 in the
# preview above) by value rather than by position, so that re-running this
# cell cannot silently discard a genuine row. `.copy()` detaches the result
# from the loaded frame, which keeps later column assignments on `df` free of
# SettingWithCopyWarning.
df = df.loc[df['author'] != 'author'].copy()

In [61]:
# Sanity check: (rows, columns) of the frame after the header row was dropped
df.shape

(9999, 2)

***

#### Preprocess  for Modelling

In [0]:
# import packages
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import keras
from keras import optimizers
import tensorflow as tf

from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.models import Model, Sequential
from keras.layers import Input, Dense, CuDNNLSTM, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, Conv1D, Activation, Flatten, Dropout, MaxPooling1D, Embedding, GlobalMaxPooling1D

from keras.layers.core import Dense, Dropout

In [0]:
# Encode every author name as an integer class id.
# `assign` returns a new frame instead of writing a column into `df` in place:
# at this point `df` is a slice of the loaded frame, and an in-place column
# write on a slice raises SettingWithCopyWarning.
df = df.assign(author_id=pd.Categorical(df['author']).codes)

#### 0. Baseline Model: Linear SVM

In [65]:
# Features are the `work` strings, targets are the integer author ids
x = df['work'].values
y = df['author_id'].values

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=22,
)

# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the test texts
tfidf_vec = TfidfVectorizer()
x_train_vec = tfidf_vec.fit_transform(x_train)
x_test_vec = tfidf_vec.transform(x_test)
x_train_vec.shape

(7999, 17830)

In [0]:
from sklearn.svm import LinearSVC
# Fit the linear SVM baseline on the TF-IDF features. LinearSVC draws random
# numbers while optimising, so the seed is pinned (same value as the
# train/test split) to make the reported scores repeatable across runs.
clf = LinearSVC(random_state=22)
clf.fit(x_train_vec, y_train)

predicted = clf.predict(x_test_vec)                   # hard class predictions
predicted_score = clf.decision_function(x_test_vec)   # decision scores, one column per class

In [12]:
# Share of test documents whose predicted author equals the true author
baseline_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print('accuracy', baseline_accuracy)

accuracy 0.553


In [13]:
# Decision scores of the SVM for every test sample; they feed the top-k accuracy below
# (ref: https://scikit-learn.org/stable/modules/svm.html)
predicted_score = clf.decision_function(X=x_test_vec)
predicted_score.shape

(2000, 14)

In [0]:
# Rank the score columns of every sample from highest to lowest score:
# argsort is ascending, so each row is read back to front.
ascending_rank = np.argsort(predicted_score, axis=-1)
best_n = ascending_rank[:, ::-1]

In [15]:
# Top-5 accuracy: a test sample counts as a hit when its true author is among
# the five highest-scoring classes.
# The entries of `best_n` are column positions of the decision-score matrix, so
# they are translated to class labels through `clf.classes_` before being
# compared with `y_test`; position and label only coincide when every label
# 0..n-1 is present in the training split.
top_k = 5
top_k_labels = clf.classes_[best_n[:, :top_k]]
count = int((top_k_labels == y_test[:, None]).any(axis=1).sum())

top_5_acc = count / y_test.shape[0]
top_5_acc

0.8645

#### Preprocess Data for Keras Models

In [16]:
# Inputs and targets for the neural models
x = df['work'].values
y = df['author_id'].values

# Turn every text into a sequence of integer word ids, with the tokenizer's
# `num_words` set to `max_features`.
# NOTE(review): the tokenizer is fitted on the full corpus, i.e. before the
# train/test split that is performed further down.
max_features = 25000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)
x_sequences = tokenizer.texts_to_sequences(x)

# Pad all sequences to the length of the longest one
maxlen = max(map(len, x_sequences))
print(maxlen)
x_sequences = sequence.pad_sequences(x_sequences, maxlen=maxlen)

296


In [0]:
# One-hot encoding of the labels, left disabled: the models below are compiled
# with loss="sparse_categorical_crossentropy" and are fitted on the integer
# author ids directly.
# NOTE(review): the output recorded under this cell cannot come from this
# (fully commented-out) code, and its shapes disagree with the 9999-row frame
# loaded above, so it looks like a leftover from an earlier run.
# from keras.utils import to_categorical
# print(y.shape)
# y = to_categorical(y)
# print(y.shape)

(72662,)
(72662, 100)


In [0]:
# Hold out 20% of the padded sequences for evaluation (seeded for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    x_sequences, y,
    test_size=0.2,
    random_state=22,
)

#### 1. Faster RNN Model: CuDNNLSTM 

- Run on Google Colab, 14 mins.    
- Heavily overfit: training accuracy is about 0.99, while test accuracy is only about 0.37

In [0]:
# Embedding -> CuDNNLSTM -> dropout -> softmax over the 14 classes.
# CuDNNLSTM is the faster LSTM implementation
# (ref: https://keras.io/layers/recurrent/#cudnnlstm)
model1 = Sequential([
    Embedding(input_dim=max_features, output_dim=128),
    CuDNNLSTM(128),
    Dropout(0.5),
    Dense(14, activation="softmax"),
])

model1.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

In [35]:
# Train for 16 epochs; the held-out split is passed as validation data
history1 = model1.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=16,
    validation_data=(x_test, y_test),
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [36]:
# evaluate() yields [loss, accuracy]; report the accuracy on the held-out split
model1_test_metrics = model1.evaluate(x_test, y_test)
print('accuracy', model1_test_metrics[1])

accuracy 0.372


In [37]:
# Same metric on the training split, to compare against the held-out accuracy
model1_train_metrics = model1.evaluate(x_train, y_train)
print('accuracy', model1_train_metrics[1])

accuracy 0.9868733591698963


#### 2. CNN

In [0]:
# 1D CNN: embedding -> two stacked convolutions -> global max pooling -> softmax.
# The embedding's input length is taken from `maxlen` (the padded sequence
# length computed during preprocessing) instead of a hard-coded 296, so the
# model keeps matching the data if the corpus or the tokenizer settings change.
model_conv2 = Sequential()
model_conv2.add(Embedding(max_features, output_dim=30, input_length=maxlen))
model_conv2.add(Dropout(0.2))
model_conv2.add(Conv1D(128, 5, activation='relu'))
model_conv2.add(Conv1D(128, 5, activation='relu'))
model_conv2.add(GlobalMaxPooling1D())
model_conv2.add(Dropout(0.2))
model_conv2.add(Dense(14, activation='softmax'))  # one output unit per class

In [0]:
from keras import optimizers
# RMSprop with a learning rate of 0.001. The optimizer is referenced by its
# class name, `RMSprop`: the lowercase `rmsprop` alias is not available in
# every Keras release.
opt = optimizers.RMSprop(lr=0.001)
model_conv2.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["acc"])


In [40]:
# Train the CNN for 16 epochs; the held-out split is passed as validation data
history2 = model_conv2.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=16,
    validation_data=(x_test, y_test),
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [41]:
# [loss, accuracy] on the held-out split
model_conv2.evaluate(x=x_test, y=y_test)




[2.312704874038696, 0.376]

In [42]:
# [loss, accuracy] on the training split
model_conv2.evaluate(x=x_train, y=y_train)



[0.46749899311056137, 0.8811101386704763]

#### 3. CNN + LSTM

In [0]:
# CNN + LSTM: embedding -> convolution -> max pooling -> LSTM -> softmax.
# The embedding's input length is taken from `maxlen` (the padded sequence
# length computed during preprocessing) instead of a hard-coded 296, so the
# model keeps matching the data if the corpus or the tokenizer settings change.
model_conv3 = Sequential()
model_conv3.add(Embedding(max_features, output_dim=30, input_length=maxlen))
model_conv3.add(Dropout(0.2))
model_conv3.add(Conv1D(128, 5, activation='relu'))
model_conv3.add(MaxPooling1D(4))
# The LSTM takes the place of the GlobalMaxPooling1D layer used in the plain CNN
model_conv3.add(LSTM(100))

model_conv3.add(Dense(14, activation='softmax'))  # one output unit per class

In [0]:
# RMSprop with a learning rate of 0.001. The optimizer is referenced by its
# class name, `RMSprop`: the lowercase `rmsprop` alias is not available in
# every Keras release.
opt = optimizers.RMSprop(lr=0.001)
model_conv3.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["acc"])

In [45]:
# Train the CNN + LSTM for 16 epochs; the held-out split is passed as validation data
history3 = model_conv3.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=16,
    validation_data=(x_test, y_test),
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [46]:
# [loss, accuracy] on the held-out split
model_conv3.evaluate(x=x_test, y=y_test)




[2.7244121742248537, 0.3385]

In [47]:
# [loss, accuracy] on the training split
model_conv3.evaluate(x=x_train, y=y_train)



[0.4914594929297874, 0.8552319038836773]

#### 4. Simple Pooling Model

In [0]:
#ref: https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [0]:
# Simple pooling model: embedding -> average over all word positions -> dropout -> softmax
model4 = Sequential([
    Embedding(input_dim=max_features, output_dim=30),
    # GlobalAveragePooling1D averages the embeddings of all words in the document
    GlobalAveragePooling1D(),
    Dropout(0.5),
    Dense(14, activation="softmax"),
])

In [0]:
# Adam with a learning rate of 0.01, chosen to speed up optimization
opt = keras.optimizers.Adam(lr=0.01)
model4.compile(
    optimizer=opt,
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

In [50]:
# Train the pooling model for 16 epochs; the held-out split is passed as validation data
history4 = model4.fit(
    x=x_train,
    y=y_train,
    batch_size=256,
    epochs=16,
    validation_data=(x_test, y_test),
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [51]:
# evaluate() yields [loss, accuracy]; report the accuracy on the held-out split
model4_test_metrics = model4.evaluate(x_test, y_test)
print('accuracy', model4_test_metrics[1])

accuracy 0.51


In [52]:
# Same metric on the training split, to compare against the held-out accuracy
model4_train_metrics = model4.evaluate(x_train, y_train)
print('accuracy', model4_train_metrics[1])

accuracy 0.7614701836910049
