In [133]:
import pandas as pd
import json
from pandas.io.json import json_normalize
import sys
import math
import os
import numpy as np
import nltk
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
nltk.download('stopwords')

# local code
sys.path.insert(1, "./code/")
from Utils import Utils # student's library
from Eval import Eval # student's library
from Extract import Extract # student's library


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peterkong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [135]:
# sanity checks
#train_df.id.nunique()
# 19579

#train_df.author.unique()
# array(['EAP', 'HPL', 'MWS'], dtype=object)

In [136]:
Utils.check_for_nulls(train_df)

{'author': 0, 'id': 0, 'text': 0}

In [137]:
# splitting data

# regular data
#     train: 19580 * .9 rows
#     test:  8393 rows
#     val:   19580 * .1 rows


if os.path.isfile('data/traindata.pickle'):
    traindata = pd.read_pickle('data/traindata.pickle')
    valdata   = pd.read_pickle('data/valdata.pickle')
    testdata  = pd.read_pickle('data/testdata.pickle')
else: 
    VAL_IDX  = math.ceil(len(train_df) * .8)
    TEST_IDX = math.ceil(len(train_df) * .9)

    traindata = train_df[:VAL_IDX]
    valdata   = train_df[VAL_IDX:TEST_IDX]
    testdata  = train_df[TEST_IDX:]

    print(VAL_IDX, TEST_IDX)

    traindata.to_pickle('data/traindata.pickle')
    valdata.to_pickle('data/valdata.pickle')
    testdata.to_pickle('data/testdata.pickle')

In [138]:
print("traindata: {}, valdata: {}, testdata: {}".format(len(traindata), len(valdata), len(testdata)))

traindata: 15664, valdata: 1958, testdata: 1957


# Feature engineering

In [139]:
# labels
Y_train = list(traindata.author )
Y_val = list(valdata.author)
Y_test = list(testdata.author)

In [146]:
# grammatical feature engineering 
# we want to include stopwords here

def gen_gram_feats(name, seq_no, train, val, test):
    train_df = Extract.gram_feats(train.text, None, seq_no)

    # need to remember so that val/test process
    # does not add additional columns
    gram_feat_list = list(train_df.columns)

    val_df = Extract.gram_feats(val.text, gram_feat_list, seq_no)
    test_df = Extract.gram_feats(test.text, gram_feat_list, seq_no)

    # removes a singleton feature
    for df in [train_df, val_df, test_df]:
        if 'SYM_count' in list(df.columns):
            df.drop('SYM_count', axis=1, inplace=True)
        
    train_df.to_pickle('data/train_' + name + '_df.pickle')
    val_df.to_pickle('data/val_' + name + '_df.pickle')
    test_df.to_pickle('data/test_' + name + '_df.pickle')       


if os.path.isfile('data/train_gram_df.pickle'):
    print("reading gram feats from pickle")
    train_gram_df = pd.read_pickle('data/train_gram_df.pickle')
    val_gram_df   = pd.read_pickle('data/val_gram_df.pickle')
    test_gram_df  = pd.read_pickle('data/test_gram_df.pickle')
    
    train_gram_seq_df = pd.read_pickle('data/train_gram_seq_df.pickle')
    val_gram_seq_df   = pd.read_pickle('data/val_gram_seq_df.pickle')
    test_gram_seq_df  = pd.read_pickle('data/test_gram_seq_df.pickle')
else:
    print("writing gram feats pickles")
    gen_gram_feats('gram', None, traindata, valdata, testdata)
    gen_gram_feats('gram_seq', 7, traindata, valdata, testdata)


reading gram feats from pickle


In [149]:
print(train_gram_df.shape)
print(val_gram_df.shape)
print(test_gram_df.shape)

print(train_gram_seq_df.shape)
print(val_gram_seq_df.shape)
print(test_gram_seq_df.shape)

assert(train_gram_df.shape == (15664, 23))
assert(train_gram_seq_df.shape == (15664, 1622))
#set(GRAM_FEAT_LIST) - set(list(val_gram_feats_df.columns))

(15664, 23)
(1958, 23)
(1957, 23)
(15664, 1622)
(1958, 1622)
(1957, 1622)


In [151]:
train_gram_seq_df.head()

Unnamed: 0,Unnamed: 1,ADJ_count,ADP_count,ADV_count,CCONJ,CCONJ_CCONJ_CCONJ_VERB_NOUN,CCONJ_CCONJ_NOUN,CCONJ_CCONJ_NOUN_CCONJ_NOUN_CCONJ_NOUN,CCONJ_CCONJ_NOUN_CCONJ_NOUN_NOUN_VERB,CCONJ_CCONJ_NOUN_CCONJ_NOUN_VERB_VERB,...,X_count,adj_noun_ratio,adv_verb_ratio,bang_count,colon_count,ellipse_count,lparen_count,quote_count,semicolon_count,sent_len
0,,4,6.0,3,,,,,,,...,,2.25,2.666667,0,0,0,0,0,2,231
1,,1,2.0,2,,,,,,,...,,2.0,1.5,0,0,0,0,0,0,71
2,,7,6.0,1,,,,,,,...,,1.428571,4.0,0,0,0,0,0,0,200
3,,6,6.0,2,,,,,,,...,,1.0,3.0,0,0,0,0,0,0,206
4,,4,3.0,4,,,,,,,...,,1.5,1.25,0,0,0,0,0,1,174


In [142]:
# textual feature engineering
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=STOPWORDS, max_features=1500)

train_text_feats = vectorizer.fit_transform(traindata.text)
val_text_feats = vectorizer.transform(valdata.text) 
test_text_feats = vectorizer.transform(testdata.text) 

cols = ["text_" + str(x) for x in range(train_text_feats.shape[1])]

train_text_feats_df = pd.DataFrame(train_text_feats.todense(), index=None, columns=cols)
val_text_feats_df = pd.DataFrame(val_text_feats.todense(), index=None, columns=cols)
test_text_feats_df = pd.DataFrame(test_text_feats.todense(), index=None, columns=cols)

print(train_text_feats_df.shape)
print(val_text_feats_df.shape)
print(test_text_feats_df.shape)


(15664, 1500)
(1958, 1500)
(1957, 1500)


In [143]:
# persist to disk
# if not os.path.isfile('data/train_text_feats_df.pickle'):
#     train_text_feats_df.to_pickle('data/train_text_feats_df.pickle')
#     val_text_feats_df.to_pickle('data/val_text_feats_df.pickle')
#     test_text_feats_df.to_pickle('data/test_text_feats_df.pickle')  

In [144]:
train_text_feats_df.head()

Unnamed: 0,text_0,text_1,text_2,text_3,text_4,text_5,text_6,text_7,text_8,text_9,...,text_1490,text_1491,text_1492,text_1493,text_1494,text_1495,text_1496,text_1497,text_1498,text_1499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.225799,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
# gensim feature engineering
import numpy as np
import gensim
GENSIM = True

if GENSIM:

    #https://radimrehurek.com/gensim/models/doc2vec.html
    from gensim.test.utils import common_texts
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    #TaggedDocument does not filter or stem

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(X_val.text))]
    model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

    train_gensim = np.array([model.infer_vector(x) for x in list(X_train.text)])
    val_gensim = np.array([model.infer_vector(x) for x in list(X_val.text)])
    test_gensim = np.array([model.infer_vector(x) for x in list(X_test.text)])

    # numpy to pandas
    cols = ["gensim_" + str(x) for x in range(len(train_gensim[0]))]

    train_gensim_df = pd.DataFrame(train_gensim, index=None, columns=cols)
    val_gensim_df = pd.DataFrame(val_gensim, index=None, columns=cols)
    test_gensim_df = pd.DataFrame(test_gensim, index=None, columns=cols)
    
# gensim didn't help. so we're settling on tfidf extual features for now, and will explore neural models


AttributeError: 'DataFrame' object has no attribute 'text'

# Exploration

In [None]:
# lex = Utils.build_lexicon(traindata.text, STOPWORDS)
# len(lex)

# 22847 different tokens in full lexicon

In [None]:
mws_df = train_df[train_df.author == 'MWS']
hpl_df = train_df[train_df.author == 'HPL']
eap_df = train_df[train_df.author == 'EAP']

cutoff = min([mws_df.shape[0], hpl_df.shape[0], eap_df.shape[0]])

# equalize corpus sizes to avoid bias during exploration
mws_df = mws_df[:cutoff]
hpl_df = hpl_df[:cutoff]
eap_df = eap_df[:cutoff]

mws_lexicon = Utils.build_lexicon(mws_df.text, STOPWORDS)
hpl_lexicon = Utils.build_lexicon(hpl_df.text, STOPWORDS)
eap_lexicon = Utils.build_lexicon(eap_df.text, STOPWORDS)

# sanity check
assert(cutoff * 3 == len(mws_df) + len(hpl_df) + len(eap_df))

# add grammatical features (for exploration this time, not training)
mws_gram_feats_df = Extract.gram_feats(mws_df.text, None, None)
hpl_gram_feats_df = Extract.gram_feats(hpl_df.text, None, None)
eap_gram_feats_df = Extract.gram_feats(eap_df.text, None, None)

mws_gram_feats_df.describe()

In [None]:
# looks like sentence length values are consistently higher by at least a degree of magnitude
# so we'll take the log
# This is not done via the standardize() method

# for df in [mws_gram_feats_df, hpl_gram_feats_df, eap_gram_feats_df]:
#     df['sent_len'] = df['sent_len'].apply(lambda x: math.log(x))
#     df.rename(inplace=True, columns={'sent_len': 'log_sent_len'})


In [None]:
# NER exploration

# avoiding NER for three reasons:
# features very sparse
# features seem content-specific, so may contribute to misprediction
# should we add data from the same authors about other topics

NER = False
if NER:
    import spacy
    spacy_mdl = spacy.load('en_core_web_sm')

    def sent_to_ents(sent, spacy):
        sent = spacy(sent)
        ents = []
        for ent in sent.ents:
            ents.append(ent.text + ':' + ent.label_)
        return ents

    entities = []
    for i in range(1500):
        sent = valdata.iloc[i].text
        ents = sent_to_ents(sent, spacy_mdl)
        entities.append(ents)
 

# example entities list HERE
# many sentences don't have any entities, like below
#'In whatever way the shifting is managed, it is of course concealed at every step from observation.'

In [None]:
if NER:
    import statistics as stat
    entity_freq = [len(x) for x in entities]


    print("stats for NER within a sample group: \n")
    print("min: {} \nmax: {} \nmean: {} \nstdev: {}" \
          .format(min(entity_freq), max(entity_freq), stat.mean(entity_freq), stat.stdev(entity_freq)))



# Exploration - visualization

In [None]:
# data viz

def plot_word_freq(lexicon, name, quantity=20):
    plt.rcdefaults()
    fig, ax = plt.subplots()

    elems = [x[0] for x in lexicon[:quantity]]
    y_pos = np.arange(quantity)
    vals = [x[1] for x in lexicon[:quantity]]

    ax.barh(y_pos, vals, align='center',
            color='green', ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(elems)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Corpus-wide frequency')
    ax.set_title(name + ' - Word Frequencies')

    plt.show()
 
authors = {'MWS': mws_lexicon, 'HPL': hpl_lexicon, 'EAP': eap_lexicon}

pd.set_option('display.max_colwidth', -1)
print("Example MWS sentence: \n{}\n".format(mws_df.text[:1].to_string()))
print("Example HPL sentence: \n{}\n".format(hpl_df.text[:1].to_string()))
print("Example EAP sentence: \n{}\n".format(eap_df.text[:1].to_string()))
pd.set_option('display.max_colwidth', 80)

# for key in authors: 
#     plot_word_freq(authors[key], key)


In [None]:
def plot_box(df, subset):
    features = {
        'tag_features': [
             'ADJ_count',
             'ADP_count',
             'ADV_count',
             'CCONJ_count',
             'DET_count',
             'NOUN_count',
             'PRON_count',
             'VERB_count'],
        'punc_features': [
            'bang_count',
            'colon_count',
            'ellipse_count',
            'lparen_count',
            'quote_count',
            'semicolon_count'],
        'ratio_features': [
             'adj_noun_ratio',
             'adv_verb_ratio',
             'log_sent_len']
    }
    
    fig, ax = plt.subplots()
    boxplot = df.boxplot(column=features[subset], \
        showfliers=False, fontsize=6, figsize=None)
    plt.show()

In [None]:
#plot_box(mws_gram_feats_df, 'ratio_features')

In [None]:
#plot_box(hpl_gram_feats_df, 'ratio_features')

In [None]:
#plot_box(eap_gram_feats_df, 'ratio_features')

In [None]:
#plot_box(mws_gram_feats_df, 'tag_features')

In [None]:
#plot_box(hpl_gram_feats_df, 'tag_features')

In [None]:
#plot_box(eap_gram_feats_df, 'tag_features')

In [None]:
#plot_box(mws_gram_feats_df, 'punc_features')

In [None]:
#plot_box(hpl_gram_feats_df, 'punc_features')

In [None]:
#plot_box(eap_gram_feats_df, 'punc_features')

In [None]:
eap_gram_feats_df.describe()

In [None]:
# strangely enough, grepping through the raw input indeed shows that no bang characters exist
# the boxplots indicate that the grammatical features indeed don't seem to have much 
# predictive power, so we'll try other features.

# Model selection, training, prediction

# pipelining

In [None]:
#train_gensim_df

    #traindata.text.to_frame() for gensim
    #val_gram_feats_df.fillna(0)
    
X_train = train_text_feats_df.fillna(0)
X_val = val_text_feats_df.fillna(0)
X_test = test_text_feats_df.fillna(0)

# this cell throws on non numerical columns

# X_train = transform(X_train).fillna(0)
# X_val = transform(X_val).fillna(0)
# X_test = transform(X_test).fillna(0)



start with gram, seq, tfidf, gensim

for each, remove and review score. 5 runs total

pipelines = {'all': {
    'train': [train_gram_seq_df, train_text_feats_df, train_gensim_df], 
    'val': [val_gram_seq_df, val_text_feats_df, val_gensim_df], 
    'test': [val_gram_seq_df, val_text_feats_df, val_gensim_df]}
            
}

def run_pipelines(model, pipelines,):
    for pipe in pipelines:
        init_df
        for df in dfs:
            init_df.join(df)
        #sanity checking
        # assert(list(X_train.columns) == list(X_val.columns))
        # assert(list(X_train.columns) == list(X_test.columns))
        train_test_nn(init_df)

## Linear

In [None]:
def train_n_run(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    lin_clf = LinearSVC()
    lin_clf.fit(X_train, Y_train) 
    
    preds = lin_clf.predict(X_val)
    accuracy = Eval.get_accuracy(preds, Y_val)
    print("Val Accuracy: ", accuracy)
    
    preds = lin_clf.predict(X_test)
    accuracy = Eval.get_accuracy(preds, Y_test)
    print("Test Accuracy: ", accuracy)
    


In [None]:
train_n_run(X_train, Y_train, X_val, Y_val, X_test, Y_test)

## Neural

In [None]:
# preprocessing for NN
encoder = sklearn.preprocessing.LabelEncoder()
encoder.fit(traindata.author)

Y_train = tf.keras.utils.to_categorical(encoder.transform(traindata.author))
Y_val = tf.keras.utils.to_categorical(encoder.transform(valdata.author))
Y_test = tf.keras.utils.to_categorical(encoder.transform(testdata.author))


In [None]:
# inspired by keras docs example: https://www.tensorflow.org/guide/keras#input_numpy_data

# simple NN - but from a BOW perspective.
# we hypothesize that signal is to be recovered from the sequence of features,
# so we'll try RNNs next

model = tf.keras.Sequential([
# Adds a densely-connected layer with 64 units to the model:
tf.keras.layers.Dense(100, activation='relu'),
# Add another:
tf.keras.layers.Dense(100, activation='relu'),
# Add a softmax layer with 10 output units:
tf.keras.layers.Dense(3, activation='softmax')])

# Configure a model for categorical classification.
model.compile(optimizer=tf.train.RMSPropOptimizer(0.01),
              loss=tf.keras.losses.categorical_crossentropy,
              metrics=[tf.keras.metrics.categorical_accuracy]
              )

model.fit(X_train, Y_train, epochs=10, batch_size=32, validation_data=(X_val, Y_val))

# returns loss, accuracy
result = model.predict(X_test, verbose=2 )

In [None]:
label_lookup = {}
for idx, label in enumerate(encoder.classes_):
    label_lookup[idx] = label
label_lookup

def best_label(row, lookup):
    mx = max(row)
    return lookup[list(row).index(mx)]

preds = [best_label(x, label_lookup) for x in result]

accuracy = Eval.get_accuracy(preds, list(testdata.author))
print("Accuracy: ", accuracy)

# Error Analysis

In [None]:
# error analysis
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools

conf_mat = confusion_matrix(Y_val, preds)


# NOTE: this function taken from: 
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f'
    thresh = 500
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


plt.figure()
print(conf_mat)
plot_confusion_matrix(conf_mat, classes=lin_clf.classes_,
                      title="Confusion matrix")
plt.show()

In [None]:
# [good place to insert val vs. test metrics]

In [None]:
import time
timenow = time.asctime( time.localtime(time.time()) )
print("Finished at: ", timenow)