In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer

### Loading Data

In [3]:
# with open('new_train/hard.cor') as f:
#     for line in f.readlines():
#         m = re.search("<s>(.*)<\/s>", line)
#         if m:
#             m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
#             if m2:
#                 meaning = m2.group(1)
#                 word = m2.group(2)
#                 sentence = m.group(1)
#                 sentence = sentence.replace(m2.group(0), '')
#                 print word, meaning
#                 print sentence
#                 print '----'
                

Create a Dictionary for each type of HARD

In [4]:
with open('new_train/line.cor') as f:
    d = {'HARD1':[], 'HARD2':[], 'HARD3':[]}
    
    for line in f.readlines():
        m = re.search("<s>(.*)<\/s>", line)
        if m:
            m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                sentence = sentence.replace('<s>', '')
                sentence = sentence.replace('</s>', '')
                sentence = sentence.replace('<p>', '')
                sentence = sentence.replace('</p>', '')
                sentence = sentence.replace('<@>', '')
                sentence = sentence.replace(m2.group(0), '')
                d[meaning].append(sentence)

In [5]:
print len(d['HARD1'])
print len(d['HARD2'])
print len(d['HARD3'])

3455
502
376


Create a documents of all sentences, regardless of meaning

In [6]:
train_documents = [sentence for value in d.values() for sentence in value]
# [item for sublist in l for item in sublist]

In [7]:
len(train_documents)

4333

In [8]:
type(train_documents[0])

str

## Lemmatizing

In [9]:
import gensim.utils

In [10]:
def lem(documents):
    lem_documents = []
    for doc in documents:
        no_tag_words = [w[:-3] for w in gensim.utils.lemmatize(doc)]
        lem_documents.append(' '.join(no_tag_words)) 
    return lem_documents

In [11]:
lem_train_documents = lem(train_documents)

In [12]:
lem_train_documents[0]

'lose popular support someone have kill defeat do'

### TF-IDF

In [13]:
def Tfidf(documents):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3)).fit(documents)
    vectors = vectorizer.transform(documents)
    return vectors, vectorizer

In [14]:
vectors, vectorizer = Tfidf(lem_train_documents)

In [15]:
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
vectors

<4333x73935 sparse matrix of type '<type 'numpy.float64'>'
	with 106933 stored elements in Compressed Sparse Row format>

## Multinomial NB

In [17]:
from sklearn.naive_bayes import MultinomialNB

### Resample

In [18]:
# Limit HARD1 to only 700 samples
hard1 = list(np.random.choice(d['HARD1'], 700, replace=False))
hard2 = d['HARD2']
hard3 = d['HARD3']
resamp = hard1 + hard2 + hard3

In [19]:
len(hard1), len(hard2), len(hard3)

(700, 502, 376)

In [20]:
len(resamp)

1578

In [21]:
vectors, vectorizer = Tfidf(resamp)

In [22]:
vectors

<1578x35504 sparse matrix of type '<type 'numpy.float64'>'
	with 45970 stored elements in Compressed Sparse Row format>

In [23]:
y1 = np.repeat('HARD1', len(hard1))
y2 = np.repeat('HARD2', len(hard2))
y3 = np.repeat('HARD3', len(hard3))

y = np.concatenate((y1, y2, y3), axis=0)

In [24]:
NB = MultinomialNB()
NB.fit(vectors,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Predict

In [25]:
s = "I work hard"
vector = vectorizer.transform([sentence])
NB.predict(vector)

array(['HARD3'], 
      dtype='|S5')

In [26]:
def predict_NB(sentence, vectorizer, nb):
    vector = vectorizer.transform([sentence]) # use the vectorizer from training
    return nb.predict(vector)

In [27]:
predict_NB(s, vectorizer, NB)

array(['HARD2'], 
      dtype='|S5')

#### Train test split

In [28]:
# from sklearn.cross_validation import train_test_split, cross_val_score

In [29]:
# train_df1 = df1.sample(frac=0.6)
# train_df2 = df2.sample(frac=0.6)
# train_df3 = df3.sample(frac=0.6)

In [30]:
# X1 = pd.Series(d['HARD1'])
# y1 = np.repeat('HARD1', len(d['HARD1']))

# X2 = pd.Series(d['HARD2'])
# y2 = np.repeat('HARD2', len(d['HARD2']))

# X3 = pd.Series(d['HARD3'])
# y3 = np.repeat('HARD3', len(d['HARD3']))

#### Scoring

In [31]:
pred_score_hard1, pred_score_hard2, pred_score_hard3 = 0, 0, 0

for sent in d['HARD1']:

    if predict_NB(sent, vectorizer, NB) == 'HARD1':
        pred_score_hard1 += 1
print "HARD1 score: ", pred_score_hard1, '/', len(d['HARD1'])

for sent in d['HARD2']:
    if predict_NB(sent, vectorizer, NB) == 'HARD2':
        pred_score_hard2 += 1
print "HARD2 score: ", pred_score_hard2, '/', len(d['HARD2'])

for sent in d['HARD3']:
    if predict_NB(sent, vectorizer, NB) == 'HARD3':
        pred_score_hard3 += 1
print "HARD3 score: ", pred_score_hard3, '/', len(d['HARD3'])

HARD1 score:  3345 / 3455
HARD2 score:  499 / 502
HARD3 score:  368 / 376


#### Cross validation

In [32]:
from sklearn.cross_validation import cross_val_score

In [33]:
cross_val_score(NB, vectors, y, scoring='accuracy')

array([ 0.51325758,  0.5047619 ,  0.51238095])

In [34]:
cross_val_score(NB, vectors, y, scoring='precision')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 0.75738055,  0.75253834,  0.7558569 ])

In [35]:
cross_val_score(NB, vectors, y, scoring='recall')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 0.51325758,  0.5047619 ,  0.51238095])