In [14]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer

### Loading Data

Create a dictionary that maps each sense of *hard* (HARD1, HARD2, HARD3) to the list of sentences tagged with that sense

In [15]:
# `d` maps each sense label of "hard" to the list of sentences annotated with
# that sense (the tagged target word itself is removed from the sentence).
d = {'HARD1': [], 'HARD2': [], 'HARD3': []}

# Raw-string patterns, compiled once. They match exactly what the previous
# non-raw "<\/s>" / "<\/>" literals matched, without relying on the invalid
# string escape "\/".
sentence_re = re.compile(r'<s>(.*)</s>')
tag_re = re.compile(r'<tag "(.*)">(.*)</>')

with open('new_train/hard.cor') as f:
    # Iterate the file object directly so the corpus is streamed line by line
    # instead of being loaded whole with readlines().
    for line in f:
        m = sentence_re.search(line)
        if not m:
            continue  # line holds no <s>...</s> sentence
        m2 = tag_re.search(line)
        if not m2:
            continue  # sentence without a tagged target word
        meaning = m2.group(1)
        # Cut the complete <tag "...">...</> markup out of the sentence text.
        sentence = m.group(1).replace(m2.group(0), '')
        # A label other than HARD1/HARD2/HARD3 raises KeyError here.
        d[meaning].append(sentence)

In [16]:
# Number of collected sentences per sense, one count per line.
# A parenthesised single-argument print behaves the same on Python 2 and 3.
for sense in ('HARD1', 'HARD2', 'HARD3'):
    print(len(d[sense]))

3455
502
376


Create a single list of all sentences, regardless of meaning

In [17]:
# Flatten the per-sense sentence lists into one corpus. Senses appear in the
# order in which `d.values()` yields them.
train_documents = []
for sense_sentences in d.values():
    train_documents.extend(sense_sentences)

In [18]:
len(train_documents)

4333

In [19]:
type(train_documents[0])

str

## Lemmatizing

In [20]:
import gensim.utils

In [21]:
def lem(documents):
    """Lemmatize every document and return the lemmas as space-joined strings.

    Each document is passed to ``gensim.utils.lemmatize``; the last three
    characters of every returned token are dropped and the remaining pieces
    are joined with single spaces.
    (Presumably those three characters are a ``/XX`` part-of-speech suffix;
    confirm against the gensim version in use.)

    Parameters
    ----------
    documents : iterable of str
        Texts to lemmatize.

    Returns
    -------
    list of str
        One lemmatized string per input document, in input order.
    """
    def drop_suffix(token):
        # Remove the trailing three characters of a lemmatizer token.
        return token[:-3]

    return [
        ' '.join(drop_suffix(token) for token in gensim.utils.lemmatize(text))
        for text in documents
    ]

In [22]:
# Lemmatize the whole corpus: one lemmatized string per sentence in
# `train_documents`, in the same order.
lem_train_documents = lem(train_documents)

In [23]:
lem_train_documents[0]

'lose popular support someone have kill defeat do'

### TF-IDF

In [24]:
def Tfidf(documents, stop_words='english', ngram_range=(1, 3)):
    """Fit a TF-IDF vectorizer on ``documents`` and vectorize them.

    Parameters
    ----------
    documents : iterable of str
        Corpus used both to learn the vocabulary/IDF weights and to build
        the returned matrix.
    stop_words : str, list or None, optional
        Forwarded to ``TfidfVectorizer``. Defaults to ``'english'``.
    ngram_range : tuple of (int, int), optional
        Forwarded to ``TfidfVectorizer``. Defaults to ``(1, 3)``.

    Returns
    -------
    vectors
        The TF-IDF document-term matrix for ``documents``.
    vectorizer : TfidfVectorizer
        The fitted vectorizer, to be reused to transform new text.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the matrix in one pass;
    # a separate fit() followed by transform() analyses the corpus twice.
    vectors = vectorizer.fit_transform(documents)
    return vectors, vectorizer

In [25]:
# TF-IDF over the lemmatized corpus: `vectors` is the document-term matrix,
# `vectorizer` the fitted TfidfVectorizer.
vectors, vectorizer = Tfidf(lem_train_documents)

In [26]:
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [27]:
vectors

<4333x73935 sparse matrix of type '<type 'numpy.float64'>'
	with 106933 stored elements in Compressed Sparse Row format>

### Resample

In [49]:
# Every sentence of every sense is kept. A down-sampling variant for HARD1,
#   list(np.random.choice(d['HARD1'], 700, replace=False)),
# is currently disabled.
hard1, hard2, hard3 = d['HARD1'], d['HARD2'], d['HARD3']
# The concatenation order HARD1 -> HARD2 -> HARD3 fixes the row order of the
# corpus built from `resamp`.
resamp = hard1 + hard2 + hard3

In [50]:
len(hard1), len(hard2), len(hard3)

(3455, 502, 376)

In [51]:
len(resamp)

4333

In [52]:
# Re-fit TF-IDF on `resamp`. This rebinds `vectors` and `vectorizer`,
# replacing the ones built from `lem_train_documents`.
# NOTE(review): `resamp` holds the raw sentences from `d`, not the lemmatized
# ones - confirm that skipping lemmatization here is intended.
vectors, vectorizer = Tfidf(resamp)

In [53]:
vectors

<4333x81873 sparse matrix of type '<type 'numpy.float64'>'
	with 114606 stored elements in Compressed Sparse Row format>

In [54]:
# One label per sentence, grouped HARD1 -> HARD2 -> HARD3 to line up with the
# order in which `resamp` concatenates the three sentence lists.
y1, y2, y3 = (
    np.repeat(sense, len(group))
    for sense, group in (('HARD1', hard1), ('HARD2', hard2), ('HARD3', hard3))
)

y = np.concatenate([y1, y2, y3])

In [55]:
y

array(['HARD1', 'HARD1', 'HARD1', ..., 'HARD3', 'HARD3', 'HARD3'], 
      dtype='|S5')

## LDA

In [56]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Fit LDA on the densified TF-IDF matrix (presumably because LDA.fit does not
# take sparse input - confirm).
# NOTE(review): at the 4333 x 81873 float64 shape shown for `vectors`, the
# dense copy is roughly 2.8 GB.
lda = LDA()
lda.fit(X=vectors.toarray(), y=y)

In [None]:
def predict_LDA(sentence, vectorizer, nb):
    """Predict the label of one sentence with a fitted model.

    Parameters
    ----------
    sentence : str
        Text to classify.
    vectorizer
        Fitted vectorizer from training; its ``transform`` is applied to a
        one-element list holding ``sentence``.
    nb
        Fitted classifier exposing ``predict`` (the LDA model, despite the
        parameter name).

    Returns
    -------
    The result of ``nb.predict`` for the single vectorized sentence.
    """
    single_document_batch = [sentence]
    return nb.predict(vectorizer.transform(single_document_batch))

In [None]:
# Per-sense hit counts on the same sentences the model was fitted on, so these
# are training-set scores, not an estimate of generalization.
def _count_correct(sentences, label):
    """Return how many of `sentences` the fitted `lda` classifies as `label`.

    All sentences are vectorized and predicted in one batch instead of one
    transform/predict call per sentence; the per-row results are the same.
    """
    if not sentences:
        return 0
    predictions = lda.predict(vectorizer.transform(sentences))
    return int(np.sum(predictions == label))


scores = {}
for label in ('HARD1', 'HARD2', 'HARD3'):
    scores[label] = _count_correct(d[label], label)
    # Single-argument print works on Python 2 and 3; the format reproduces the
    # spacing of the former `print a, b, c, d` statement.
    print("%s score:  %d / %d" % (label, scores[label], len(d[label])))

# Keep the original result names available for later cells.
pred_score_hard1 = scores['HARD1']
pred_score_hard2 = scores['HARD2']
pred_score_hard3 = scores['HARD3']

### Cross validation

In [None]:
# `sklearn.cross_validation` was superseded by `sklearn.model_selection` and
# later removed; prefer the new location and fall back for old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

In [None]:
# Cross-validated accuracy of the LDA model on the dense TF-IDF features.
cross_val_score(estimator=lda, X=vectors.toarray(), y=y, scoring='accuracy')

In [None]:
# `y` has three classes, so the binary-only 'precision' scorer cannot be used;
# an explicit average is required. 'precision_weighted' averages per-class
# precision weighted by class support ('precision_macro' would weight the
# three senses equally instead).
cross_val_score(lda, vectors.toarray(), y, scoring='precision_weighted')

In [None]:
# `y` has three classes, so the binary-only 'recall' scorer cannot be used;
# an explicit average is required. 'recall_weighted' averages per-class
# recall weighted by class support ('recall_macro' would weight the three
# senses equally instead).
cross_val_score(lda, vectors.toarray(), y, scoring='recall_weighted')