In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer

### Loading Data

In [2]:
# with open('new_train/hard.cor') as f:
#     for line in f.readlines():
#         m = re.search("<s>(.*)<\/s>", line)
#         if m:
#             m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
#             if m2:
#                 meaning = m2.group(1)
#                 word = m2.group(2)
#                 sentence = m.group(1)
#                 sentence = sentence.replace(m2.group(0), '')
#                 print word, meaning
#                 print sentence
#                 print '----'
                

Create a Dictionary for each type of HARD

In [3]:
with open('new_train/hard.cor') as f:
    d = {'HARD1':[], 'HARD2':[], 'HARD3':[]}
    
    for line in f.readlines():
        m = re.search("<s>(.*)<\/s>", line)
        if m:
            m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                sentence = sentence.replace(m2.group(0), '')
                d[meaning].append(sentence)

In [4]:
print len(d['HARD1'])
print len(d['HARD2'])
print len(d['HARD3'])

3455
502
376


Create a documents of all sentences, regardless of meaning

In [5]:
train_documents = [sentence for value in d.values() for sentence in value]
# [item for sublist in l for item in sublist]

In [6]:
len(train_documents)

4333

In [7]:
type(train_documents[0])

str

## Lemmatizing

In [8]:
import gensim.utils

In [9]:
def lem(documents):
    lem_documents = []
    for doc in documents:
        no_tag_words = [w[:-3] for w in gensim.utils.lemmatize(doc)]
        lem_documents.append(' '.join(no_tag_words)) 
    return lem_documents

In [10]:
lem_train_documents = lem(train_documents)

In [11]:
lem_train_documents[0]

'lose popular support someone have kill defeat do'

### TF-IDF

In [12]:
def Tfidf(documents):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3)).fit(documents)
    vectors = vectorizer.transform(documents)
    return vectors, vectorizer

In [13]:
vectors, vectorizer = Tfidf(lem_train_documents)

In [14]:
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
vectors

<4333x73935 sparse matrix of type '<type 'numpy.float64'>'
	with 106933 stored elements in Compressed Sparse Row format>

### KMeans

In [135]:
from sklearn.cluster import KMeans

In [136]:
km = KMeans(n_clusters = 3, n_init=100)
km.fit(vectors)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=100,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [137]:
km.labels_

array([0, 0, 0, ..., 0, 2, 0], dtype=int32)

In [138]:
np.bincount(km.labels_)

array([3317,  917,   99])

## Create Data Frame (maybe don't need this section)

In [None]:
# df1 = pd.DataFrame(d['HARD1'], np.repeat('HARD1', len(d['HARD1'])), columns=['text']).\
#     reset_index().rename(columns = {'index': 'meaning'})
# df2 = pd.DataFrame(d['HARD2'], np.repeat('HARD2', len(d['HARD2'])), columns=['text']).\
#     reset_index().rename(columns = {'index': 'meaning'})
# df3 = pd.DataFrame(d['HARD3'], np.repeat('HARD3', len(d['HARD3'])), columns=['text']).\
#     reset_index().rename(columns = {'index': 'meaning'})


In [None]:
# frames = [df1, df2, df3]
# df = pd.concat(frames)

In [None]:
# df.head()

In [None]:
# print len(d['HARD1']), len(d['HARD2']), len(d['HARD3'])

The labels look correct. Let's do train test split

## Scoring

### Score by comparing km.labels_ to true labels (with down sample, this method cannot run anymore)

In [139]:
np.bincount(km.labels_)

array([3317,  917,   99])

In [140]:
actual = km.labels_ * 0;
actual
actual[3455:3957] = 1
actual[3957:4333] = 2

In [141]:
pd.crosstab(km.labels_, actual)

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2609,408,300
1,801,89,27
2,45,5,49


train_documents is in the same order of the dictionary (already checked). Now calculate the error

In [None]:
score_hard1, score_hard2, score_hard3 = 0, 0, 0

for i in xrange(3455):
    if km.labels_[i] == 0:
        score_hard1 += 1
for j in xrange(3455, 3957):
    if km.labels_[j] == 1:
        score_hard2 += 1
for k in xrange(3957,4333):
    if km.labels_[k] == 2:
        score_hard3 += 1
        
print "HARD1 score: ", score_hard1, '/', len(d['HARD1'])
print "HARD2 score: ", score_hard2, '/', len(d['HARD2'])
print "HARD3 score: ", score_hard3, '/', len(d['HARD3'])

### Score by predictions

### Resample

In [116]:
# Limit HARD1 to only 700 samples
hard1 = list(np.random.choice(d['HARD1'], 500, replace=False))
hard2 = d['HARD2']
hard3 = d['HARD3']
resamp = hard1 + hard2 + hard3

In [117]:
len(hard1), len(hard2), len(hard3)

(500, 502, 376)

In [118]:
len(resamp)

1378

In [122]:
def train(train_documents):
    lem_train_documents = lem(train_documents)
    vectors, vectorizer = Tfidf(lem_train_documents)
    km = KMeans(n_clusters = 3, n_init=10)
    km.fit(vectors)
    return km, vectorizer

In [123]:
model, vtr = train(resamp)

In [124]:
np.bincount(model.labels_)

array([1218,   56,  104])

### Resample

In [80]:
# Limit HARD1 to only 700 samples
hard1 = list(np.random.choice(d['HARD1'], 700, replace=False))
hard2 = d['HARD2']
hard3 = d['HARD3']
resamp = hard1 + hard2 + hard3

In [81]:
len(hard1), len(hard2), len(hard3)

(700, 502, 376)

In [82]:
len(resamp)

1578

In [83]:
vectors, vectorizer = Tfidf(resamp)

In [84]:
vectors

<1578x35928 sparse matrix of type '<type 'numpy.float64'>'
	with 46598 stored elements in Compressed Sparse Row format>

In [85]:
y1 = np.repeat('HARD1', len(hard1))
y2 = np.repeat('HARD2', len(hard2))
y3 = np.repeat('HARD3', len(hard3))

y = np.concatenate((y1, y2, y3), axis=0)

In [86]:
NB = MultinomialNB()
NB.fit(vectors,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Predict

In [87]:
s = "I work hard"
vector = vectorizer.transform([sentence])
NB.predict(vector)

array(['HARD3'], 
      dtype='|S5')

In [88]:
def predict_NB(sentence, vectorizer, nb):
    vector = vectorizer.transform([sentence]) # use the vectorizer from training
    return nb.predict(vector)

In [89]:
predict_NB(s, vectorizer, NB)

array(['HARD2'], 
      dtype='|S5')

#### Train test split

In [None]:
# from sklearn.cross_validation import train_test_split, cross_val_score

In [None]:
# train_df1 = df1.sample(frac=0.6)
# train_df2 = df2.sample(frac=0.6)
# train_df3 = df3.sample(frac=0.6)

In [None]:
# X1 = pd.Series(d['HARD1'])
# y1 = np.repeat('HARD1', len(d['HARD1']))

# X2 = pd.Series(d['HARD2'])
# y2 = np.repeat('HARD2', len(d['HARD2']))

# X3 = pd.Series(d['HARD3'])
# y3 = np.repeat('HARD3', len(d['HARD3']))

#### Scoring

In [90]:
pred_score_hard1, pred_score_hard2, pred_score_hard3 = 0, 0, 0

for sent in d['HARD1']:

    if predict_NB(sent, vectorizer, NB) == 'HARD1':
        pred_score_hard1 += 1
print "HARD1 score: ", pred_score_hard1, '/', len(d['HARD1'])

for sent in d['HARD2']:
    if predict_NB(sent, vectorizer, NB) == 'HARD2':
        pred_score_hard2 += 1
print "HARD2 score: ", pred_score_hard2, '/', len(d['HARD2'])

for sent in d['HARD3']:
    if predict_NB(sent, vectorizer, NB) == 'HARD3':
        pred_score_hard3 += 1
print "HARD3 score: ", pred_score_hard3, '/', len(d['HARD3'])

HARD1 score:  3360 / 3455
HARD2 score:  499 / 502
HARD3 score:  366 / 376


#### Cross validation

In [92]:
from sklearn.cross_validation import cross_val_score

In [96]:
cross_val_score(NB, vectors, y, scoring='accuracy')

array([ 0.50568182,  0.5047619 ,  0.50666667])

In [97]:
cross_val_score(NB, vectors, y, scoring='precision')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 0.70977907,  0.76594224,  0.73441427])

In [98]:
cross_val_score(NB, vectors, y, scoring='recall')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 0.50568182,  0.5047619 ,  0.50666667])

## LDA

In [99]:
from sklearn.lda import LDA



In [101]:
lda = LDA()
lda.fit(vectors.toarray(),y)

MemoryError: 

In [None]:
def predict_LDA(sentence, vectorizer, nb):
    vector = vectorizer.transform([sentence]) # use the vectorizer from training
    return nb.predict(vector)