<a href="https://colab.research.google.com/github/kgedney/author-id-project/blob/master/exp1_text_preprocessing_and_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocess Text Data and Modelling with Named Entities Intact
### ANLY 590 Project


***

#### Google Colab Prep

In [0]:
import numpy as np
import pandas as pd

In [2]:
# Mount Google Drive inside the Colab VM so the dataset stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading subsetted dataset with 9,999 rows and 14 authors

In [3]:
# The CSV has to be uploaded to Google Drive by hand before this cell is run.
# Its first column is unnamed and holds 0, 1, 2, ... (it shows up as 'Unnamed: 0'
# when read naively), i.e. it appears to be a saved row index. Read it back as
# the index so it does not linger as a meaningless data column.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/filtered_data_sub.csv',
                 index_col=0)
df.head()

Unnamed: 0,author,body,subreddit,permalink,num_chars,num_words
0,GuaranteedAdmission,You can be in favor of a policy he supports an...,AskReddit,/r/AskReddit/comments/9zklcs/as_a_brit_who_onl...,74,17
1,GuaranteedAdmission,That depends. What is the penalty for breaking...,AskReddit,/r/AskReddit/comments/9zkfcd/how_would_you_fee...,202,36
2,GuaranteedAdmission,Because I like to think long term. Tax cuts fo...,AskReddit,/r/AskReddit/comments/9yxl98/liberals_of_reddi...,120,22
3,GuaranteedAdmission,"We don't talk about the Highway Shoes, OP! Are...",AskReddit,/r/AskReddit/comments/9yrbvd/people_who_lost_a...,110,22
4,GuaranteedAdmission,"Are you badmouthing Mistress Luna, infidel? A ...",Stellaris,/r/Stellaris/comments/9y9y1j/oh_ok_then/ea12kh0/,93,15


In [11]:
# Number of rows (comments) in the loaded subset.
df.shape[0]

9999

In [4]:
# Fetch NLTK's 'popular' data collection.
# NOTE(review): word_tokenize (used below) needs NLTK tokenizer data; presumably
# it is bundled in this collection - confirm if the download is ever narrowed.
import nltk

NLTK_COLLECTION = 'popular'
nltk.download(NLTK_COLLECTION)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

#### Text Cleaning

ref: https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

In [0]:
# Replace every URL in the comment body with the placeholder token '<url>'.
# Work on the 'body' column directly: a row-wise DataFrame.apply(axis=1) would
# build a full Series for each row only to read one field from it.
import re

URL_PATTERN = re.compile(r"http\S+")  # 'http' followed by a run of non-whitespace characters
df['body_no_urls'] = df['body'].map(lambda text: URL_PATTERN.sub("<url>", text))

In [0]:
# Split each URL-normalised comment into a list of tokens with NLTK.
# Series.map always yields one list per row, whereas a row-wise
# DataFrame.apply(axis=1) returning lists is slower and its handling of
# list-like results has varied between pandas versions.
from nltk.tokenize import word_tokenize
df['tokenized_nltk'] = df['body_no_urls'].map(word_tokenize)

***

#### Preprocess  for Modelling

In [12]:
# import packages
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import keras
from keras import optimizers
import tensorflow as tf

from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.models import Model, Sequential
from keras.layers import Input, Dense, CuDNNLSTM, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D, Conv1D, Activation, Flatten, Dropout, MaxPooling1D, Embedding, GlobalMaxPooling1D

from keras.layers.core import Dense, Dropout

Using TensorFlow backend.


In [0]:
# Encode every author name as an integer class label (its category code).
df['author_id'] = df['author'].astype('category').cat.codes

#### 0. Baseline Model: Linear SVM

In [21]:
# Features are the raw comment bodies; labels are the integer author ids.
x, y = df['body'].values, df['author_id'].values

# Hold out 20% of the comments for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.20,
    random_state=22,
)

# Fit the TF-IDF vocabulary and weights on the training split only,
# then reuse the fitted vectorizer to transform the test split.
tfidf_vec   = TfidfVectorizer()
x_train_vec = tfidf_vec.fit_transform(x_train)
x_test_vec  = tfidf_vec.transform(x_test)
x_train_vec.shape

(7999, 20458)

In [0]:
# Baseline classifier: a linear SVM trained on the TF-IDF features.
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(x_train_vec, y_train)

# Per-class decision scores and hard label predictions for the test split.
predicted_score = clf.decision_function(x_test_vec)
predicted       = clf.predict(x_test_vec)

In [16]:
# Top-1 accuracy of the SVM baseline on the held-out test split.
print('accuracy', metrics.accuracy_score(y_true=y_test, y_pred=predicted))

accuracy 0.5805


In [17]:
# Top-k accuracy, step 1: per-class decision scores for every test sample
# (one row per sample, one column per class).
# ref: https://scikit-learn.org/stable/modules/svm.html
predicted_score = clf.decision_function(X=x_test_vec)
predicted_score.shape

(2000, 14)

In [0]:
# Rank the classes of each sample from highest to lowest score.
# argsort sorts ascending, so the column order is flipped afterwards.
best_n = np.argsort(predicted_score, axis=1)[:, ::-1]

In [19]:
# Top-5 accuracy: the fraction of test samples whose true author is among the
# five highest-scoring classes. A single vectorised comparison replaces the
# Python loop over rows.
TOP_K = 5

# hits[i] is True when y_test[i] appears in the first TOP_K ranked classes of row i.
hits  = (best_n[:, :TOP_K] == np.asarray(y_test)[:, None]).any(axis=1)
count = int(hits.sum())

top_5_acc = count / y_test.shape[0]
top_5_acc

0.8695

#### Preprocess Data for Keras Models

In [22]:
# Inputs are the NLTK token lists; labels are the integer author ids.
x, y = df['tokenized_nltk'].values, df['author_id'].values

# Turn tokens into integer ids, with the vocabulary capped via `num_words`.
max_features = 25000
tokenizer    = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)
x_sequences  = tokenizer.texts_to_sequences(x)

# Pad every sequence to the length of the longest one.
maxlen = max(map(len, x_sequences))
print(maxlen)
x_sequences = sequence.pad_sequences(x_sequences, maxlen=maxlen)

229


In [0]:
# One-hot encoding of the labels is left disabled: the models in this notebook
# are compiled with loss="sparse_categorical_crossentropy" and are fit directly
# on the integer `author_id` labels.
# NOTE(review): the output stored under this cell, (72662,) / (72662, 100), does
# not match the 9,999-row, 14-author subset loaded above - presumably left over
# from an earlier run on a different dataset; confirm and clear it.
# from keras.utils import to_categorical
# print(y.shape)
# y = to_categorical(y)
# print(y.shape)

(72662,)
(72662, 100)


In [0]:
# Hold out 20% of the padded sequences as the test set (seed fixed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    x_sequences, y,
    test_size=0.20,
    random_state=22,
)

#### 1. Faster RNN Model: CuDNNLSTM 

- Run on Google Colab; takes about 14 minutes.    
- Heavily overfit: training accuracy is about 0.98 versus about 0.46 on the test set.

In [0]:
# Recurrent classifier built on CuDNNLSTM, the faster cuDNN-backed LSTM layer.
# ref: https://keras.io/layers/recurrent/#cudnnlstm
model1 = Sequential([
    Embedding(input_dim=max_features, output_dim=128),  # token id -> 128-d vector
    CuDNNLSTM(128),
    Dropout(0.5),
    Dense(14, activation="softmax"),                    # one output per author (14 authors)
])

# Integer labels are used as-is, hence the sparse variant of the cross-entropy loss.
model1.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"])

In [25]:
# Train for 16 epochs in mini-batches of 128. The test split is passed as
# validation data, so Keras reports metrics on it after every epoch.
history1 = model1.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=16,
    batch_size=128,
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [26]:
# evaluate() returns [loss, accuracy]; report the accuracy on the test split.
model1_test_metrics = model1.evaluate(x=x_test, y=y_test)
print('accuracy', model1_test_metrics[1])

accuracy 0.463


In [27]:
# Accuracy on the training split, for comparison with the test accuracy.
model1_train_metrics = model1.evaluate(x=x_train, y=y_train)
print('accuracy', model1_train_metrics[1])

accuracy 0.9841230153769222


#### 2. CNN

In [0]:
# 1-D convolutional classifier: embedding -> two Conv1D layers -> global max pool.
model_conv2 = Sequential()
# input_length follows the padded sequence length computed during preprocessing
# (`maxlen`) instead of a hard-coded 229, so the model keeps matching the data
# if the dataset or tokenisation changes.
model_conv2.add(Embedding(max_features, output_dim=30, input_length=maxlen))
model_conv2.add(Dropout(0.2))
model_conv2.add(Conv1D(128, 5, activation='relu'))   # 128 filters, kernel width 5
model_conv2.add(Conv1D(128, 5, activation='relu'))
model_conv2.add(GlobalMaxPooling1D())                # max over the sequence axis per filter
model_conv2.add(Dropout(0.2))
model_conv2.add(Dense(14, activation='softmax'))     # one output per author (14 authors)

In [0]:
from keras import optimizers
# Use the canonical RMSprop class rather than the lowercase `rmsprop` alias,
# which is not available in every Keras release.
# lr=0.001 is the same value Keras uses by default; it is spelled out here so
# it is easy to tune.
opt = optimizers.RMSprop(lr=0.001)
# Integer labels are used as-is, hence the sparse variant of the cross-entropy loss.
model_conv2.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["acc"])


In [30]:
# Train for 16 epochs in mini-batches of 128. The test split is passed as
# validation data, so Keras reports metrics on it after every epoch.
history2 = model_conv2.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=16,
    batch_size=128,
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [31]:
# [loss, accuracy] of the CNN on the held-out test split.
model_conv2.evaluate(x=x_test, y=y_test)




[2.084706964492798, 0.48]

In [32]:
# [loss, accuracy] of the CNN on the training split, for comparison with the test result.
model_conv2.evaluate(x=x_train, y=y_train)



[0.1864259823502891, 0.9563695462007257]

#### 3. CNN + LSTM

In [0]:
# Hybrid classifier: embedding -> Conv1D -> max pooling -> LSTM.
model_conv3 = Sequential()
# input_length follows the padded sequence length computed during preprocessing
# (`maxlen`) instead of a hard-coded 229, so the model keeps matching the data
# if the dataset or tokenisation changes.
model_conv3.add(Embedding(max_features, output_dim=30, input_length=maxlen))
model_conv3.add(Dropout(0.2))
model_conv3.add(Conv1D(128, 5, activation='relu'))   # 128 filters, kernel width 5
model_conv3.add(MaxPooling1D(4))                     # shorten the sequence 4x before the LSTM
model_conv3.add(LSTM(100))
#model_conv.add(GlobalMaxPooling1D())

model_conv3.add(Dense(14, activation='softmax'))     # one output per author (14 authors)

In [0]:
# Use the canonical RMSprop class rather than the lowercase `rmsprop` alias,
# which is not available in every Keras release.
# lr=0.001 is the same value Keras uses by default; it is spelled out here so
# it is easy to tune.
opt = optimizers.RMSprop(lr=0.001)
# Integer labels are used as-is, hence the sparse variant of the cross-entropy loss.
model_conv3.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["acc"])

In [35]:
# Train for 16 epochs in mini-batches of 128. The test split is passed as
# validation data, so Keras reports metrics on it after every epoch.
history3 = model_conv3.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=16,
    batch_size=128,
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [36]:
# [loss, accuracy] of the CNN + LSTM model on the held-out test split.
model_conv3.evaluate(x=x_test, y=y_test)




[2.3283257150650023, 0.4355]

In [37]:
# [loss, accuracy] of the CNN + LSTM model on the training split, for comparison with the test result.
model_conv3.evaluate(x=x_train, y=y_train)



[0.22807032822750406, 0.9424928116089017]

#### 4. Simple Pooling Model

In [0]:
# ref (Keras fastText-style IMDB example): https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [0]:
# Simple pooling classifier: embed each token, average the embeddings over the
# whole document, then classify from that single averaged vector.
model4 = Sequential([
    Embedding(input_dim=max_features, output_dim=30),  # token id -> 30-d vector
    GlobalAveragePooling1D(),                          # mean of all word embeddings in the document
    Dropout(0.5),
    Dense(14, activation="softmax"),                   # one output per author (14 authors)
])

In [0]:
# Adam with a learning rate of 0.01 (larger than Keras' default) to speed up optimization.
opt = optimizers.Adam(lr=0.01)
model4.compile(
    loss="sparse_categorical_crossentropy",  # labels are integer author ids
    optimizer=opt,
    metrics=["acc"],
)

In [40]:
# Train for 16 epochs in mini-batches of 256. The test split is passed as
# validation data, so Keras reports metrics on it after every epoch.
history4 = model4.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=16,
    batch_size=256,
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [41]:
# evaluate() returns [loss, accuracy]; report the accuracy on the test split.
model4_test_metrics = model4.evaluate(x=x_test, y=y_test)
print('accuracy', model4_test_metrics[1])

accuracy 0.63


In [42]:
# Accuracy on the training split, for comparison with the test accuracy.
model4_train_metrics = model4.evaluate(x=x_train, y=y_train)
print('accuracy', model4_train_metrics[1])

accuracy 0.8828603575521446
