In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize
import sys
import math
import os
import numpy as np
import nltk
import matplotlib.pyplot as plt
nltk.download('stopwords')

# local code
sys.path.insert(1, "./code/")
from Utils import Utils # student's library
from Eval import Eval # student's library
from Extract import Extract # student's library


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peterkong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the train and test CSVs from ./data into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [3]:
# sanity checks
#train_df.id.nunique()
# 19579

#train_df.author.unique()
# array(['EAP', 'HPL', 'MWS'], dtype=object)

In [4]:
# Null check on the training frame via the local Utils helper; the recorded
# output is a dict mapping each column name to a count (all zero here).
Utils.check_for_nulls(train_df)

{'author': 0, 'id': 0, 'text': 0}

In [5]:
# First 20 rows of the training frame, kept as a small sample for quick experiments.
short_df = train_df.iloc[:20]
short_df.head(1)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP


In [17]:
# splitting data

# All three splits are carved out of train.csv (19579 rows), in file order:
#     train: first 80%  -> 15664 rows
#     val:   next 10%   ->  1958 rows
#     test:  final 10%  ->  1957 rows
# test_df (test.csv) is not part of this split.

SPLIT_PICKLES = ('data/traindata.pickle', 'data/valdata.pickle', 'data/testdata.pickle')

# Reuse the cached splits only when all three pickles are present; a partial
# cache is rebuilt instead of failing on the first missing file.
if all(os.path.isfile(path) for path in SPLIT_PICKLES):
    traindata = pd.read_pickle('data/traindata.pickle')
    valdata   = pd.read_pickle('data/valdata.pickle')
    testdata  = pd.read_pickle('data/testdata.pickle')
else:
    # Row offsets at which the validation and test slices begin.
    VAL_IDX  = math.ceil(len(train_df) * .8)
    TEST_IDX = math.ceil(len(train_df) * .9)

    traindata = train_df[:VAL_IDX]
    valdata   = train_df[VAL_IDX:TEST_IDX]
    testdata  = train_df[TEST_IDX:]

    print(VAL_IDX, TEST_IDX)

    traindata.to_pickle('data/traindata.pickle')
    valdata.to_pickle('data/valdata.pickle')
    testdata.to_pickle('data/testdata.pickle')

15664 17622


In [18]:
# Report the row count of each of the three splits.
print("traindata: {}, valdata: {}, testdata: {}".format(len(traindata), len(valdata), len(testdata)))

traindata: 15664, valdata: 1958, testdata: 1957


In [8]:
# grammatical feature engineering 
# we want to include stopwords here

# Cache files for the three grammatical-feature frames.
train_gram_path = 'data/train_gram_feats_df.pickle'
val_gram_path   = 'data/val_gram_feats_df.pickle'
test_gram_path  = 'data/test_gram_feats_df.pickle'

# The guard must test the same file names that are read and written below.
# The earlier check looked for 'data/train_gram_feats.pickle' (no '_df'), so
# the cache was never hit and the features were recomputed on every run.
if all(os.path.isfile(path) for path in (train_gram_path, val_gram_path, test_gram_path)):
    print("reading gram feats from pickle")
    train_gram_feats_df = pd.read_pickle(train_gram_path)
    val_gram_feats_df   = pd.read_pickle(val_gram_path)
    test_gram_feats_df  = pd.read_pickle(test_gram_path)
else:
    seq_no = None
    train_gram_feats_df = Extract.gram_feats(traindata.text, None, seq_no)

    # need to remember so that val/test process
    # does not add additional columns
    GRAM_FEAT_LIST = list(train_gram_feats_df.columns)

    val_gram_feats_df = Extract.gram_feats(valdata.text, GRAM_FEAT_LIST, seq_no)
    test_gram_feats_df = Extract.gram_feats(testdata.text, GRAM_FEAT_LIST, seq_no)

    # there are 21 columns excluding sequence columns 
    # 7ary sequence columns can generate up to 2187
    
    train_gram_feats_df.to_pickle(train_gram_path)
    val_gram_feats_df.to_pickle(val_gram_path)
    test_gram_feats_df.to_pickle(test_gram_path)


In [9]:
# Drop the singleton 'SYM_count' feature, in place, from every split that has it.
gram_splits = (train_gram_feats_df, val_gram_feats_df, test_gram_feats_df)
for feats in gram_splits:
    if 'SYM_count' in feats.columns:
        feats.drop(columns='SYM_count', inplace=True)

for feats in gram_splits:
    print(feats.shape)

#set(GRAM_FEAT_LIST) - set(list(val_gram_feats_df.columns))

(15664, 23)
(1958, 23)
(1957, 23)


In [10]:
# (rows, columns) of the training grammatical-feature frame.
train_gram_feats_df.shape

(15664, 23)

In [11]:
# textual feature engineering
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfVectorizer

# scikit-learn accepts stop_words as 'english', a list, or None; recent
# releases reject a set, so pass a list. STOPWORDS itself stays a set for
# the other cells that use it.
vectorizer = TfidfVectorizer(stop_words=sorted(STOPWORDS), max_features=100)

# The vocabulary / IDF weights are fitted on the training split only ...
train_text_feats = vectorizer.fit_transform(traindata.text)
Y_train = traindata.author 

# ... and merely applied to the validation and test splits.
val_text_feats = vectorizer.transform(valdata.text) 
Y_val = list(valdata.author)

test_text_feats = vectorizer.transform(testdata.text) 
Y_test = list(testdata.author)



In [20]:
 # Check which container type .todense() returns for the test TF-IDF features.
 type(test_text_feats.todense())

numpy.matrixlib.defmatrix.matrix

In [12]:
#convert text feats to pandas
text_splits = (train_text_feats, val_text_feats, test_text_feats)
for sparse_feats in text_splits:
    print(sparse_feats.shape)

# Positional column names: text_0, text_1, ... one per TF-IDF feature.
cols = ["text_{}".format(i) for i in range(train_text_feats.shape[1])]

train_text_feats_df, val_text_feats_df, test_text_feats_df = (
    pd.DataFrame(sparse_feats.todense(), index=None, columns=cols)
    for sparse_feats in text_splits
)

for dense_frame in (train_text_feats_df, val_text_feats_df, test_text_feats_df):
    print(dense_frame.shape)


(15664, 100)
(1958, 100)
(1957, 100)
(15664, 100)
(1958, 100)
(1957, 100)


In [16]:
# persist to disk
# All three TF-IDF frames are written when the train pickle is not on disk yet.
if not os.path.isfile('data/train_text_feats_df.pickle'):
    for frame, target in (
        (train_text_feats_df, 'data/train_text_feats_df.pickle'),
        (val_text_feats_df, 'data/val_text_feats_df.pickle'),
        (test_text_feats_df, 'data/test_text_feats_df.pickle'),
    ):
        frame.to_pickle(target)

In [15]:
# concatenating features

    #traindata.text.to_frame() for gensim
    #val_gram_feats_df.fillna(0)
# Only the TF-IDF frames feed the model for now; NaNs in val/test become 0.
X_train = train_text_feats_df
X_val, X_test = (
    feats.fillna(0) for feats in (val_text_feats_df, test_text_feats_df)
)

for split in (X_train, X_val, X_test):
    print(split.shape)

(15664, 1)
(1958, 1)
(1957, 1)


In [16]:
#df = pd.DataFrame({'foo':[1,3,99], 'bar':[2,4,7]})

def transform(df):
    """Standardize each numeric column of ``df`` to zero mean and unit std.

    Every numeric column is replaced by ``(column - mean) / std`` using
    pandas' default sample standard deviation. Non-numeric columns (e.g. raw
    text) are left untouched instead of raising.

    Args:
        df: DataFrame to standardize. It is modified in place.

    Returns:
        The same ``df`` object, to allow chaining such as
        ``transform(X).fillna(0)``.

    Note:
        A column whose standard deviation is 0 or NaN (constant or
        single-row column) becomes all-NaN; callers are expected to follow
        up with ``fillna``.
    """
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            # Text / object columns cannot be z-scored; skip them.
            continue
        avg = df[col].mean()
        stdv = df[col].std()
        # Vectorized form of the per-element (x - avg) / stdv.
        df[col] = (df[col] - avg) / stdv

    return df

In [17]:
# this cell throws on non numerical columns

# X_train = transform(X_train).fillna(0)
# X_val = transform(X_val).fillna(0)
# X_test = transform(X_test).fillna(0)

# print(X_train.shape)
# print(X_val.shape)
# print(X_test.shape)

TypeError: could not convert string to float: 'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [None]:
#sanity checking
# Validation and test features must have exactly the training columns, in the same order.
train_columns = list(X_train.columns)
assert train_columns == list(X_val.columns)
assert train_columns == list(X_test.columns)

In [None]:
#list(w2v.keys())[9990:9999]
# [b'iconic',
#  b'erp',
#  b'crest',
#  b'radius',
#  b'spiral',
#  b'nyse',
#  b'lotion',
#  b'oriental',
#  b'admire']

In [None]:
# a_key = list(w2v.keys())[9999]
# a_key.decode("utf-8")

In [None]:
#type(w2v[str.encode('owl')])

In [None]:
# linear svc
from sklearn.svm import LinearSVC

# Fit a default-parameter linear SVM on the training features, then predict
# labels for the validation features (fit() returns the estimator itself).
lin_clf = LinearSVC()
preds = lin_clf.fit(X_train, Y_train).predict(X_val)
preds

In [None]:
# evaluation

# Score the validation predictions against the validation labels with the
# local Eval helper, then print the result.
accuracy = Eval.get_accuracy(preds, Y_val)
print("Accuracy: ", accuracy)

In [None]:
# import importlib
# import Extract
# import Utils
# importlib.reload(Extract)
# importlib.reload(Utils)
# from Utils import Utils
from Extract import Extract

In [None]:
# exploration

# Partition the full training frame by author label.
by_author = {
    label: train_df[train_df.author == label]
    for label in ('MWS', 'HPL', 'EAP')
}

# Size of the smallest author corpus.
cutoff = min(len(frame) for frame in by_author.values())

# equalize corpus sizes to avoid bias during exploration
mws_df = by_author['MWS'].iloc[:cutoff]
hpl_df = by_author['HPL'].iloc[:cutoff]
eap_df = by_author['EAP'].iloc[:cutoff]

mws_lexicon = Utils.build_lexicon(mws_df.text, STOPWORDS)
hpl_lexicon = Utils.build_lexicon(hpl_df.text, STOPWORDS)
eap_lexicon = Utils.build_lexicon(eap_df.text, STOPWORDS)

# sanity check
assert len(mws_df) + len(hpl_df) + len(eap_df) == cutoff * 3

# add grammatical features (for exploration this time, not training)
mws_gram_feats_df = Extract.gram_feats(mws_df.text, None, None)
hpl_gram_feats_df = Extract.gram_feats(hpl_df.text, None, None)
eap_gram_feats_df = Extract.gram_feats(eap_df.text, None, None)

mws_gram_feats_df.describe()

In [None]:
# Sentence-length values are consistently higher than the other features by at
# least an order of magnitude, so we replace them with their natural log.
for df in [mws_gram_feats_df, hpl_gram_feats_df, eap_gram_feats_df]:
    # The guard keeps the cell re-runnable: after the first pass 'sent_len' has
    # been renamed, and an unguarded second run raises KeyError.
    if 'sent_len' in df.columns:
        df['sent_len'] = df['sent_len'].apply(math.log)
        df.rename(inplace=True, columns={'sent_len': 'log_sent_len'})


In [None]:
# data viz

def plot_word_freq(lexicon, name, quantity=20):
    """Draw a horizontal bar chart of the first ``quantity`` lexicon entries.

    Args:
        lexicon: Sliceable sequence of pairs; element 0 of each pair is used
            as the bar label and element 1 as the bar length. Presumably
            (word, count) pairs sorted by descending count - confirm against
            Utils.build_lexicon.
        name: Prefix for the plot title.
        quantity: Maximum number of entries to plot. When the lexicon holds
            fewer entries, only those are plotted.

    Side effects:
        Resets matplotlib's rc parameters to their defaults and shows the
        figure.
    """
    plt.rcdefaults()
    fig, ax = plt.subplots()

    top_entries = lexicon[:quantity]
    elems = [entry[0] for entry in top_entries]
    vals = [entry[1] for entry in top_entries]
    # One bar position per entry actually available; using `quantity` here
    # makes barh fail with a shape mismatch for short lexicons.
    y_pos = np.arange(len(top_entries))

    ax.barh(y_pos, vals, align='center',
            color='green', ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(elems)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Corpus-wide frequency')
    ax.set_title(name + ' - Word Frequencies')

    plt.show()
 
authors = {'MWS': mws_lexicon, 'HPL': hpl_lexicon, 'EAP': eap_lexicon}

# Print one full, untruncated sentence per author. None is the documented
# "no limit" value for display.max_colwidth; the legacy -1 sentinel is
# deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
print("Example MWS sentence: \n{}\n".format(mws_df.text[:1].to_string()))
print("Example HPL sentence: \n{}\n".format(hpl_df.text[:1].to_string()))
print("Example EAP sentence: \n{}\n".format(eap_df.text[:1].to_string()))
# Back to a bounded column width for the rest of the notebook.
pd.set_option('display.max_colwidth', 80)

# for key in authors: 
#     plot_word_freq(authors[key], key)


In [None]:
def plot_box(df, subset):
    """Show a box plot (outliers hidden) for one named group of feature columns.

    df: frame containing the columns of the requested group.
    subset: 'tag_features', 'punc_features' or 'ratio_features'; any other
        key raises KeyError.
    """
    tag_cols = [
        'ADJ_count', 'ADP_count', 'ADV_count', 'CCONJ_count',
        'DET_count', 'NOUN_count', 'PRON_count', 'VERB_count',
    ]
    punc_cols = [
        'bang_count', 'colon_count', 'ellipse_count',
        'lparen_count', 'quote_count', 'semicolon_count',
    ]
    ratio_cols = ['adj_noun_ratio', 'adv_verb_ratio', 'log_sent_len']

    groups = {
        'tag_features': tag_cols,
        'punc_features': punc_cols,
        'ratio_features': ratio_cols,
    }

    # New figure first; the box plot is then drawn on its (current) axes.
    plt.subplots()
    df.boxplot(column=groups[subset], showfliers=False, fontsize=6, figsize=None)
    plt.show()

In [None]:
#plot_box(mws_gram_feats_df, 'ratio_features')

In [None]:
#plot_box(hpl_gram_feats_df, 'ratio_features')

In [None]:
#plot_box(eap_gram_feats_df, 'ratio_features')

In [None]:
#plot_box(mws_gram_feats_df, 'tag_features')

In [None]:
#plot_box(hpl_gram_feats_df, 'tag_features')

In [None]:
#plot_box(eap_gram_feats_df, 'tag_features')

In [None]:
#plot_box(mws_gram_feats_df, 'punc_features')

In [None]:
#plot_box(hpl_gram_feats_df, 'punc_features')

In [None]:
#plot_box(eap_gram_feats_df, 'punc_features')

In [None]:
# Summary statistics of the EAP grammatical-feature frame.
eap_gram_feats_df.describe()

In [None]:
# strangely enough, grepping through the raw input indeed shows that no bang characters exist
# the boxplots indicate that the grammatical features indeed don't seem to have much 
# predictive power, so we'll try other features.

In [None]:
# gensim feature engineering

import numpy as np
import gensim

# http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

# this glove code is wrong type for later work
# with open("data/glove.42B.300d.txt", "rb") as lines:
#     w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
#            for line in lines}


# let X be a list of tokenized texts (i.e. list of lists of tokens)
# sentences param is token lists

# model = gensim.models.Word2Vec(short_sents, size=100)
# w2v = dict(zip(model.wv.index2word, model.wv.vectors))
# type(w2v)

In [None]:
#https://radimrehurek.com/gensim/models/doc2vec.html
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#TaggedDocument does not filter or stem

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(X_val.text))]
model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

In [None]:
#sample = Utils.tokenize(X_train.iloc[0].text)

# vector = model.infer_vector(sample)
# vector

train_gensim = np.array([model.infer_vector(x) for x in list(X_train.text)])
val_gensim = np.array([model.infer_vector(x) for x in list(X_val.text)])
#X_test = np.array([model.infer_vector(x) for x in list(X_test.text)])

In [None]:
# numpy to pandas
# One column per embedding dimension: gensim_0, gensim_1, ...
cols = ["gensim_{}".format(dim) for dim in range(len(train_gensim[0]))]

train_gensim_df, val_gensim_df = (
    pd.DataFrame(vectors, index=None, columns=cols)
    for vectors in (train_gensim, val_gensim)
)

In [None]:
# Fit a fresh default linear SVM on the Doc2Vec features and score it on the
# validation split (fit() returns the estimator, so the calls can be chained).
lin_clf = LinearSVC()
preds = lin_clf.fit(train_gensim_df, Y_train).predict(val_gensim_df)

accuracy = Eval.get_accuracy(preds, Y_val)
print("Accuracy: ", accuracy)

In [None]:
# gensim didn't help, so we're settling on tf-idf textual features for now and will explore neural models

In [None]:
import time
# asctime() with no argument formats the current local time.
timenow = time.asctime()
print("Finished at: ", timenow)