In [1]:
import numpy as np
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn import model_selection

def tok_and_tag(x):
    """Tokenize the string ``x`` and return its list of (token, POS tag) pairs."""
    tokens = word_tokenize(x)
    return pos_tag(tokens)


# Directory holding the input CSV files.
PATH = '../input/'
# Non-feature columns of the training frame.
cols_to_drop = ['id', 'text', 'author']

print('Loading data...')
train_csv = PATH + 'train.csv'
train = pd.read_csv(train_csv)

Loading data...


In [2]:
# POS-tag every sentence once up front; the tagged form is what the
# Markov transition features below are built from.
tagged_sentences = train['text'].apply(tok_and_tag)
train['tags'] = tagged_sentences
train.head()

Unnamed: 0,id,text,author,tags
0,id26305,"This process, however, afforded me no means of...",EAP,"[(This, DT), (process, NN), (,, ,), (however, ..."
1,id17569,It never once occurred to me that the fumbling...,HPL,"[(It, PRP), (never, RB), (once, RB), (occurred..."
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[(In, IN), (his, PRP$), (left, JJ), (hand, NN)..."
3,id27763,How lovely is spring As we looked from Windsor...,MWS,"[(How, WRB), (lovely, RB), (is, VBZ), (spring,..."
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[(Finding, VBG), (nothing, NN), (else, RB), (,..."


In [3]:
def update_tag_dictionary(tags, d, d2):
    """Accumulate POS-tag transition counts for one tagged sentence.

    Parameters
    ----------
    tags : list of (token, tag) pairs
        Tagged sentence; only the second element of each pair is used.
    d : mapping with a zero default (e.g. ``defaultdict(int)``)
        ``d[(previous_tag, tag)]`` is incremented once per transition.
    d2 : mapping with a zero default
        ``d2[previous_tag]`` is incremented once per transition leaving
        ``previous_tag``, i.e. it holds the normalising totals for ``d``.

    The first tag is paired with the sentinel previous tag ``'start'``.
    An empty ``tags`` list leaves both mappings untouched (the earlier
    index-based loop raised ``IndexError`` in that case).

    Returns
    -------
    None -- ``d`` and ``d2`` are updated in place.
    """
    previous = 'start'
    for pair in tags:
        current = pair[1]
        d[(previous, current)] += 1
        d2[previous] += 1
        previous = current

## Work in log space: multiplying many small probabilities underflows to 0.
def get_markov_prob(tags, d, ALL):
    """Return the log-probability of a tag sequence under a bigram tag model.

    Parameters
    ----------
    tags : list of (token, tag) pairs
        Tagged sentence; only the second element of each pair is used.
    d : dict-like mapping (previous_tag, tag) -> probability
        The model being scored.
    ALL : dict-like mapping (previous_tag, tag) -> probability
        Backoff model, used for any transition whose probability in ``d``
        is missing or zero.

    Returns
    -------
    float
        Sum of the log transition probabilities, the first tag being
        preceded by the sentinel ``'start'``.  An empty ``tags`` list
        yields ``0.0`` (the earlier index-based loop raised ``IndexError``).

    Notes
    -----
    Lookups use ``.get`` so that passing ``defaultdict`` models does not
    insert a zero entry for every unseen transition.
    If a transition is absent from both ``d`` and ``ALL`` the result is
    ``-inf`` (``np.log(0)``, with a NumPy RuntimeWarning); callers that
    subtract two such scores will get ``nan``.
    """
    log_prob = 0.0
    previous = 'start'
    for pair in tags:
        current = pair[1]
        key = (previous, current)
        # Prefer the model's own estimate; fall back to the global one.
        prob = d.get(key, 0) or ALL.get(key, 0)
        log_prob = log_prob + np.log(prob)
        previous = current
    return log_prob

In [4]:
from collections import defaultdict

def transition_probs(tag_lists):
    """Estimate P(tag | previous tag) from an iterable of tagged sentences.

    Returns a ``defaultdict(int)`` mapping (previous_tag, tag) to the
    relative frequency of that transition among all transitions leaving
    ``previous_tag``; unseen transitions read as 0.
    """
    pair_counts, prev_counts = defaultdict(lambda: 0), defaultdict(lambda: 0.0)
    for tags in tag_lists:
        update_tag_dictionary(tags, pair_counts, prev_counts)
    return defaultdict(int, {k: v / prev_counts[k[0]] for k, v in pair_counts.items()})


AUTHORS = ('EAP', 'HPL', 'MWS')

# Author-independent transition model over the whole training set; used as
# the backoff for unseen transitions and as the baseline that each
# author-specific score is measured against.
ALL = transition_probs(train['tags'])

# random_state only has an effect together with shuffle=True, and recent
# scikit-learn versions raise a ValueError when it is passed with
# shuffle=False, so it is omitted; the (unshuffled) folds are unchanged.
kf = model_selection.KFold(n_splits=3, shuffle=False)
for dev_index, val_index in kf.split(train):
    # kf.split yields positions: read with .iloc, and translate to index
    # labels before writing back with .loc (.ix was removed from pandas).
    dev = train.iloc[dev_index]
    val_labels = train.index[val_index]
    val_tags = train.loc[val_labels, 'tags']

    # The baseline does not depend on the author, so score it once per fold.
    baseline = val_tags.apply(lambda x: get_markov_prob(x, ALL, ALL))

    for author in AUTHORS:
        # Fit on the development fold only, so each validation row is scored
        # by a model that never saw its own author label.
        author_probs = transition_probs(dev.loc[dev['author'] == author, 'tags'])
        author_score = val_tags.apply(lambda x: get_markov_prob(x, author_probs, ALL))
        train.loc[val_labels, 'markov_prob_pos_' + author] = author_score - baseline

  del sys.path[0]
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [6]:
# Inspect the first rows together with the new per-author Markov features.
train.head(n=10)

Unnamed: 0,id,text,author,tags,markov_prob_pos_EAP,markov_prob_pos_HPL,markov_prob_pos_MWS
0,id26305,"This process, however, afforded me no means of...",EAP,"[(This, DT), (process, NN), (,, ,), (however, ...",-0.43524,-2.635048,0.398003
1,id17569,It never once occurred to me that the fumbling...,HPL,"[(It, PRP), (never, RB), (once, RB), (occurred...",-0.216245,-0.918306,0.758205
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,"[(In, IN), (his, PRP$), (left, JJ), (hand, NN)...",0.850827,-2.012366,-2.508112
3,id27763,How lovely is spring As we looked from Windsor...,MWS,"[(How, WRB), (lovely, RB), (is, VBZ), (spring,...",-3.803572,2.204441,-0.074885
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,"[(Finding, VBG), (nothing, NN), (else, RB), (,...",0.562495,-1.836441,-1.247158
5,id22965,"A youth passed in solitude, my best years spen...",MWS,"[(A, DT), (youth, NN), (passed, VBN), (in, IN)...",-5.341822,-8.664172,5.931581
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP,"[(The, DT), (astronomer, NN), (,, ,), (perhaps...",1.339918,-1.57938,-1.535304
7,id13515,The surcingle hung in ribands from my body.,EAP,"[(The, DT), (surcingle, NN), (hung, NN), (in, ...",-0.068862,-0.140097,-0.053983
8,id19322,I knew that you could not say to yourself 'ste...,EAP,"[(I, PRP), (knew, VBD), (that, IN), (you, PRP)...",-3.215652,-0.277007,-1.312699
9,id00912,I confess that neither the structure of langua...,MWS,"[(I, PRP), (confess, VBP), (that, IN), (neithe...",-1.478349,0.27745,0.707077
