In [38]:
import pandas as pd
import json
from pandas.io.json import json_normalize
import sys
import math
import os
import numpy as np
import nltk
nltk.download('stopwords')

# local code
sys.path.insert(1, "./code/")
from Utils import Utils # student's library
from Eval import Eval # student's library
from Extract import Extract # student's library


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peterkong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [3]:
# sanity checks
#train_df.id.nunique()
# 19579

#train_df.author.unique()
# array(['EAP', 'HPL', 'MWS'], dtype=object)

In [4]:
Utils.check_for_nulls(train_df)

{'author': 0, 'id': 0, 'text': 0}

In [5]:
short_df = train_df[:20]
short_df.head(1)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP


In [6]:
# splitting data

# regular data
#     train: 19580 * .9 rows
#     test:  8393 rows
#     val:   19580 * .1 rows


if os.path.isfile('data/traindata.pickle'):
    traindata = pd.read_pickle('data/traindata.pickle')
    valdata   = pd.read_pickle('data/valdata.pickle')
    testdata  = pd.read_pickle('data/testdata.pickle')
else: 
    VAL_IDX  = math.ceil(len(train_df) * .8)
    TEST_IDX = math.ceil(len(train_df) * .9)

    traindata = train_df[:VAL_IDX]
    valdata   = train_df[VAL_IDX:TEST_IDX]
    testdata  = train_df[TEST_IDX:]

    print(VAL_IDX, TEST_IDX)

    traindata.to_pickle('data/traindata.pickle')
    valdata.to_pickle('data/valdata.pickle')
    testdata.to_pickle('data/testdata.pickle')

In [68]:
# grammatical feature engineering 
# we want to include stopwords here

if os.path.isfile('data/train_gram_feats.pickle'):
    train_gram_feats_df = Extract.gram_feats(traindata.text)

    # need to remember so that val/test process
    # does not add additional columns
    GRAM_FEAT_LIST = list(train_gram_feats_df.columns)

    val_gram_feats_df = Extract.gram_feats(testdata.text, GRAM_FEAT_LIST)
    test_gram_feats_df = Extract.gram_feats(valdata.text, GRAM_FEAT_LIST)

    # there are 21 columns excluding sequence columns 
    # 7ary sequence columns can generate up to 2187
    
    train_gram_feats_df.to_pickle('data/train_gram_feats.pickle')
    val_gram_feats_df.to_pickle('data/val_gram_feats.pickle')
    test_gram_feats_df.to_pickle('data/test_gram_feats.pickle')

else:
    train_gram_feats_df = pd.read_pickle('data/train_gram_feats_df.pickle')
    val_gram_feats_df = pd.read_pickle('data/val_gram_feats_df.pickle')
    test_gram_feats_df = pd.read_pickle('data/test_gram_feats_df.pickle')


ValueError: labels ['SYM_count'] not contained in axis

In [70]:
# removes a singleton feature
for df in train_gram_feats_df, val_gram_feats_df, test_gram_feats_df:
    if 'SYM_count' in list(df.columns):
        df.drop('SYM_count', axis=1, inplace=True)
        
print(train_gram_feats_df.shape)
print(val_gram_feats_df.shape)
print(test_gram_feats_df.shape)

set(GRAM_FEAT_LIST) - set(list(val_gram_feats_df.columns))

(15664, 23)
(1957, 23)
(1958, 23)


{'SYM_count'}

In [49]:
# textual feature engineering
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=STOPWORDS, max_features=100)

train_text_feats = vectorizer.fit_transform(traindata.text)
Y_train = traindata.author 

val_text_feats = vectorizer.transform(valdata.text) 
Y_val = list(valdata.author)

test_text_feats = vectorizer.transform(testdata.text) 
Y_test = list(testdata.author)



In [62]:
#convert to pandas
print(train_text_feats.shape)
print(val_text_feats.shape)
print(test_text_feats.shape)

cols = ["text_" + str(x) for x in range(train_text_feats.shape[1])]

train_text_feats_df = pd.DataFrame(train_text_feats.todense(), index=None, columns=cols)
val_text_feats_df = pd.DataFrame(val_text_feats.todense(), index=None, columns=cols)
test_text_feats_df = pd.DataFrame(test_text_feats.todense(), index=None, columns=cols)

(15664, 100)
(1958, 100)
(1957, 100)


In [71]:
#train_df.text

In [72]:
# concatenating features

X_train = train_gram_feats_df.join(train_text_feats_df)
X_val = val_gram_feats_df.join(val_text_feats_df)
X_test = test_gram_feats_df.join(test_text_feats_df)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)


(15664, 123)
(1957, 123)
(1958, 123)


In [None]:
import numpy as np
import gensim

# http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

# this glove code is wrong type for later work
# with open("data/glove.42B.300d.txt", "rb") as lines:
#     w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
#            for line in lines}


# let X be a list of tokenized texts (i.e. list of lists of tokens)
# sentences param is token lists

# model = gensim.models.Word2Vec(short_sents, size=100)
# w2v = dict(zip(model.wv.index2word, model.wv.vectors))
# type(w2v)

In [None]:
#list(w2v.keys())[9990:9999]
# [b'iconic',
#  b'erp',
#  b'crest',
#  b'radius',
#  b'spiral',
#  b'nyse',
#  b'lotion',
#  b'oriental',
#  b'admire']

In [None]:
# a_key = list(w2v.keys())[9999]
# a_key.decode("utf-8")

In [None]:
#type(w2v[str.encode('owl')])

In [None]:
#https://radimrehurek.com/gensim/models/doc2vec.html
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#TaggedDocument does not filter or stem

# documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(short_sents)]
# model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# vector = model.infer_vector(["system", "response"])
# vector

In [None]:
# order of ops:
# just grammar features
# just tfidf vectorizer
# both

# kmeans

In [35]:
# linear svc
from sklearn.svm import LinearSVC

lin_clf = LinearSVC()
lin_clf.fit(X_train, Y_train) 

preds = lin_clf.predict(X_val)
preds

array(['EAP', 'HPL', 'EAP', ..., 'MWS', 'MWS', 'EAP'], dtype=object)

In [36]:
# evaluation

accuracy = Eval.get_accuracy(preds, Y_val)
print("Accuracy: ", accuracy)

Accuracy:  0.8212461695607763


In [None]:
X_train[0]

In [32]:
# import importlib
# from Extract import Extract
#importlib.reload(Extract)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peterkong/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import time
timenow = time.asctime( time.localtime(time.time()) )
print("Finished at: ", timenow)

In [43]:
arr = np.ones((3,3))
arr[1,2] = 9
arr

array([[1., 1., 1.],
       [1., 1., 9.],
       [1., 1., 1.]])

In [45]:
cols = ["text_" + str(x) for x in range(arr.shape[1])]
df = pd.DataFrame(arr, index=None, columns=cols)
df

Unnamed: 0,text_0,text_1,text_2
0,1.0,1.0,1.0
1,1.0,1.0,9.0
2,1.0,1.0,1.0


In [56]:
p1 = pd.DataFrame({'f1': [1,2], 'f2': [1,2]})
p2 = pd.DataFrame({'f3': [1,2], 'f4': [9,2]})

p3 = p1.join(p2)
p3.head()

Unnamed: 0,f1,f2,f3,f4
0,1,1,1,9
1,2,2,2,2
