### Preprocess Text Data and Modelling 
### ANLY 590 Project


***

#### Prep

In [1]:
# set working directory
import os
import sys

# Project root. The AUTHOR_ID_PROJECT_DIR environment variable overrides it so the
# notebook can run on another machine; otherwise the original location is used.
path = os.environ.get(
    'AUTHOR_ID_PROJECT_DIR',
    '/Users/kgedney/Documents/georgetown/anly590/author-id-project',
)
# relative paths used below (e.g. the CSV file) are resolved against this directory
os.chdir(path)

In [3]:
import numpy as np
import pandas as pd

In [4]:
# load filtered data (relative path, resolved against the current working directory)
df = pd.read_csv(filepath_or_buffer='filtered_data.csv')

#### Text Cleaning

ref: https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

In [33]:
# remove URLs and replace as '<url>'
import re

# Vectorised replacement on the 'body' column itself instead of a Python-level
# row-by-row DataFrame.apply. Anything starting with "http" up to the next
# whitespace becomes the literal token '<url>'. Missing bodies stay missing
# rather than raising inside re.sub.
df['body_no_urls'] = df['body'].str.replace(r"http\S+", "<url>", regex=True)

In [37]:
# tokenize
from nltk.tokenize import word_tokenize

# Apply the tokenizer to the single text column (Series.apply) rather than to
# whole rows (DataFrame.apply, axis=1): each cell ends up holding the list of
# tokens for that post, without building a row object per record and without
# pandas trying to infer a frame shape from the returned lists.
df['tokenized_nltk'] = df['body_no_urls'].apply(word_tokenize)

In [None]:
# remove NERs

In [None]:
import spacy

# The bare 'en' shortcut link only exists in older spaCy releases (removed in v3).
# Ask for the English model package by name first and fall back to the legacy
# shortcut so existing installs keep working.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

In [None]:
def get_ners(input_doc):
    """Return the entities spaCy detects in ``input_doc``, rendered as a string.

    The text is run through the notebook-level ``nlp`` pipeline and the
    resulting ``doc.ents`` collection is converted with ``str()``.

    Parameters
    ----------
    input_doc : text passed straight to ``nlp``.

    Returns
    -------
    str
        ``str(doc.ents)`` for the parsed document.
    """
    parsed = nlp(input_doc)
    return str(parsed.ents)

#### Modelling

In [176]:
# import packages
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import keras
import tensorflow as tf
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.layers import LSTM, Embedding
from keras.layers.core import Dense, Dropout

In [57]:
# create class assignments: each distinct author gets an integer category code
df['author_id'] = df['author'].astype('category').cat.codes

#### Baseline Model: Linear SVM

In [177]:
# inputs: post text with URLs replaced; targets: integer author codes
x = df['body_no_urls'].values
y = df['author_id'].values

# 80/20 split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=22,
)

# fit the TF-IDF vocabulary and weights on the training split only,
# then apply the same fitted transform to both splits
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(x_train)
x_train_vec = tfidf_vec.transform(x_train)
x_test_vec = tfidf_vec.transform(x_test)
x_train_vec.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(58129, 53161)

In [178]:
from sklearn.svm import LinearSVC

# train the linear SVM on the TF-IDF training matrix
clf = LinearSVC()
clf.fit(x_train_vec, y_train)

# per-class decision scores and hard label predictions for the test split
predicted_score = clf.decision_function(x_test_vec)
predicted = clf.predict(x_test_vec)

In [179]:
# share of test posts whose predicted author matches the true author
print('accuracy', metrics.accuracy_score(y_true=y_test, y_pred=predicted))

accuracy 0.37789857565540497


#### Recurrent Neural Network: LSTM

In [108]:
# data preprocess: token lists as inputs, integer author codes as targets
x = df['tokenized_nltk'].values
y = df['author_id'].values

# create sequences: map tokens to integer ids, vocabulary capped via num_words
max_features = 2000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x)
x_sequences = tokenizer.texts_to_sequences(x)

# pad every sequence to the length of the longest one
maxlen = max(map(len, x_sequences))
print(maxlen)
x_sequences = sequence.pad_sequences(x_sequences, maxlen=maxlen)

In [111]:
# split test and train (80/20, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x_sequences,
    y,
    test_size=0.20,
    random_state=22,
)

In [117]:
# Layer stack passed to the constructor in order: embedding -> LSTM -> softmax.
model = Sequential([
    # embedding layer; mask_zero=True so the 0 padding index is masked
    Embedding(input_dim=max_features, output_dim=128, mask_zero=True),
    # rnn layer
    LSTM(128),
    # (dropout currently disabled: Dropout(0.2))
    # output layer, 100-categorical classification
    Dense(100, activation="softmax"),
])

In [118]:
# sparse loss: targets are integer class ids, not one-hot vectors
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["acc"],
)

In [119]:
# one pass over the training split; the test split is used for validation metrics
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=1,
    batch_size=128,
)

Train on 58129 samples, validate on 14533 samples
Epoch 1/1
 4608/58129 [=>............................] - ETA: 3:20 - loss: 4.6005 - acc: 0.0117

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x12af0d9b0>>
Traceback (most recent call last):
  File "/Users/kgedney/anaconda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1408, in __del__
    def __del__(self):
KeyboardInterrupt


 5632/58129 [=>............................] - ETA: 3:18 - loss: 4.5987 - acc: 0.0133

KeyboardInterrupt: 

In [None]:
# index 1 of the evaluate() result is reported as the accuracy
print('accuracy', model.evaluate(x=x_test, y=y_test)[1])

In [116]:
# guess majority class baseline: share of training rows belonging to the most
# frequent class, computed from the label counts instead of hard-coding class 57
class_counts = np.unique(y_train, return_counts=True)[1]
class_counts.max() / class_counts.sum()

0.012558275559531387

In [None]:
def top_k_accuracy(y_true, scores, k=5):
    """Fraction of samples whose true class is among the k highest-scoring classes.

    Parameters
    ----------
    y_true : 1-D array-like of integer class ids.
    scores : 2-D array-like, one row per sample and one column per class id
        (e.g. softmax probabilities or decision-function scores).
    k : number of top-scoring classes that count as a hit.

    Returns
    -------
    float
        Mean of the per-sample hit indicator.
    """
    scores = np.asarray(scores)
    # argsort is ascending, so the last k columns hold the k best class ids
    top_k = np.argsort(scores, axis=1)[:, -k:]
    hits = (top_k == np.asarray(y_true).reshape(-1, 1)).any(axis=1)
    return hits.mean()


# Top-5 accuracy of the LSTM on the held-out split. The Keras
# top_k_categorical_accuracy helper expects one-hot targets and a score matrix;
# here the targets are integer ids, so the check is done directly in NumPy on
# the model's predicted class probabilities.
top_k_accuracy(y_test, model.predict(x_test), k=5)