In [57]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD

### Loading Data

In [58]:
with open('new_train/hard.cor') as f:
    for line in f.readlines():
        m = re.search("<s>(.*)<\/s>", line)
        if m:
            m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                sentence = sentence.replace(m2.group(0), '')
                print word, meaning
                print sentence
                print '----'
                

Create a Dictionary for each type of HARD

In [59]:
with open('new_train/hard.cor') as f:
    d = {'HARD1':[], 'HARD2':[], 'HARD3':[]}
    
    for line in f.readlines():
        m = re.search("<s>(.*)<\/s>", line)
        if m:
            m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                sentence = sentence.replace(m2.group(0), '')
                d[meaning].append(sentence)

In [60]:
print len(d['HARD1'])
print len(d['HARD2'])
print len(d['HARD3'])

3455
502
376


Create a documents of all sentences, regardless of meaning

In [61]:
train_documents = [sentence for value in d.values() for sentence in value]
# [item for sublist in l for item in sublist]

In [62]:
len(train_documents)

4333

In [63]:
type(train_documents[0])

str

## Lemmatizing

In [64]:
import gensim.utils

In [65]:
def lem(documents):
    lem_documents = []
    for doc in documents:
        no_tag_words = [w[:-3] for w in gensim.utils.lemmatize(doc)]
        lem_documents.append(' '.join(no_tag_words)) 
    return lem_documents

In [66]:
lem_train_documents = lem(train_documents)

In [67]:
lem_train_documents[0]

'lose popular support someone have kill defeat do'

### TF-IDF

In [68]:
def Tfidf(documents):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3)).fit(documents)
    vectors = vectorizer.transform(documents)
    return vectors, vectorizer

In [69]:
vectors, vectorizer = Tfidf(lem_train_documents)

In [70]:
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [71]:
vectors

<4333x73935 sparse matrix of type '<type 'numpy.float64'>'
	with 106933 stored elements in Compressed Sparse Row format>

### Agglomerative Clustering (slow, try later)

In [86]:
hr = AgglomerativeClustering(n_clusters = 3,\
                             affinity = 'cosine', linkage = 'average')


In [87]:
hr.fit(vectors.toarray())

MemoryError: 

In [None]:
hr.labels_

In [None]:
np.bincount(hr.labels_)

### LSA (not needed)

In [16]:
terms = vectorizer.get_feature_names()

In [19]:
lsa = TruncatedSVD(n_components=3, n_iter=100)

In [20]:
lsa.fit(vectors)

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=100,
       random_state=None, tol=0.0)

In [22]:
lsa.components_[0]

array([  3.57578530e-04,   3.57578530e-04,   1.75950254e-03, ...,
         6.64178733e-05,   9.81887023e-04,   9.81887023e-04])

In [27]:
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x:x[1], reverse=True)[:10]
    print
    print "Definition {}".format(i)
    print
    for term in sortedTerms:
        print term[0],term[1]


Definition 0

say 0.858520318023
time 0.207678846075
make 0.130699933761
work 0.122372523466
believe 0.112140637236
person 0.0957758618771
thing 0.0913846805106
know 0.0847157219489
year 0.0741899806109
just 0.0681002575173

Definition 1

time 0.59315204836
work 0.297873739131
believe 0.236515828503
make 0.211861636655
know 0.161117209931
year 0.16005761853
person 0.12537488547
look 0.0954599211502
just 0.091039995218
tell 0.0765940247125

Definition 2

time 0.672428635773
say 0.104738750404
say say 0.0472741539846
ababa 0.0206744521531
child 0.0134730648086
state 0.0132905975697
coach 0.00806845794342
say exactly 0.00730645675311
deal 0.00720455653182
turn 0.00638606652702


### KMeans

In [72]:
from sklearn.cluster import KMeans

In [73]:
km = KMeans(n_clusters = 3, n_init=100)
km.fit(vectors)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=100,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [74]:
km.labels_

array([0, 1, 0, ..., 0, 0, 0], dtype=int32)

In [75]:
np.bincount(km.labels_)

array([3107,  379,  847])

### Predict

In [80]:
sentence = 'This rock is hard'

In [224]:
def predict(sentence):
    vector = vectorizer.transform([sentence]) # use the vectorizer from training
    return km.predict(vector)

In [225]:
predict(sentence)

array([0], dtype=int32)

## Create Data Frame (maybe don't need this section)

In [192]:
df1 = pd.DataFrame(d['HARD1'], np.repeat('HARD1', len(d['HARD1'])), columns=['text']).\
    reset_index().rename(columns = {'index': 'meaning'})
df2 = pd.DataFrame(d['HARD2'], np.repeat('HARD2', len(d['HARD2'])), columns=['text']).\
    reset_index().rename(columns = {'index': 'meaning'})
df3 = pd.DataFrame(d['HARD3'], np.repeat('HARD3', len(d['HARD3'])), columns=['text']).\
    reset_index().rename(columns = {'index': 'meaning'})


In [193]:
frames = [df1, df2, df3]
df = pd.concat(frames)

In [194]:
df.head()

Unnamed: 0,meaning,text
0,HARD1,"`` He may lose all popular support , but som..."
1,HARD1,Clever White House `` spin doctors '' are hav...
2,HARD1,I find it to believe that the Sacramento R...
3,HARD1,Now when you get bad credit data or are confu...
4,HARD1,'A great share of responsibility for this nat...


In [195]:
print len(d['HARD1']), len(d['HARD2']), len(d['HARD3'])

3455 502 376


The labels look correct. Let's do train test split

## Train test split (with same ratio in each class)

In [216]:
from sklearn.cross_validation import train_test_split, cross_val_score

In [205]:
# train_df1 = df1.sample(frac=0.6)
# train_df2 = df2.sample(frac=0.6)
# train_df3 = df3.sample(frac=0.6)

In [215]:
X1 = pd.Series(d['HARD1'])
y1 = np.repeat('HARD1', len(d['HARD1']))

X2 = pd.Series(d['HARD2'])
y2 = np.repeat('HARD2', len(d['HARD2']))

X3 = pd.Series(d['HARD3'])
y3 = np.repeat('HARD3', len(d['HARD3']))

## Scoring

In [231]:
score_hard1, score_hard2, score_hard3 = 0, 0, 0

for sent in d['HARD1']:
    if predict(sent) == 0:
        score_hard1 += 1
print "HARD1 score: ", score_hard1, '/', len(d['HARD1'])
print

for sent in d['HARD2']:
    if predict(sent) == 2:
        score_hard2 += 1
print "HARD2 score: ", score_hard2, '/', len(d['HARD2'])
print

for sent in d['HARD3']:
    if predict(sent) == 1:
        score_hard3 += 1
print "HARD3 score: ", score_hard3, '/', len(d['HARD3'])
print

HARD1 score:  2998 / 3455

HARD2 score:  12 / 502

HARD3 score:  12 / 376



In [237]:
predict("")

array([0], dtype=int32)

## Try other words to see if more balanced ("serve", "interest")

In [249]:
with open('new_train/interest.cor') as f:
    for line in f.readlines():
        m = re.search("<s>(.*)<\/s>", line)
        if m:
            m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                sentence = sentence.replace(m2.group(0), '')
                print word, meaning
                print sentence
                print '----'
                

interest interest_6
 yields  on  money-market mutual funds  continued to slide , amid  signs  that  portfolio managers  expect  further declines  in     rates  . 
----
interest interest_6
 longer maturities  are thought to indicate  declining    rates  because  they  permit  portfolio managers  to retain relatively  higher rates  for  a longer period  . 
----
interest interest_6
 nevertheless , said  brenda malizia negus  ,  editor  of  money fund report  , yields `` may  blip  up again before  they   blip  down '' because of  recent rises  in  short-term    rates  . 
----
interest interest_5
 j.p. bolduc  ,  vice chairman  of  w.r. grace  &  co.  ,  which  holds  a 83.4 %     in  this energy-services company  , was elected  a director  . 
----
interests interest_5
 finmeccanica  is  an italian state-owned holding company  with      in  the mechanical engineering industry  . 
----
interest interest_6
 in  august  ,  the commission  ruled that between  $ 190 million  and  $ 195 million 

In [252]:
with open('new_train/interest.cor') as f:
    d = {}
    for i in xrange(6):
        key = 'interest_{}'.format(i+1)
        d[key] = []
    for line in f.readlines():
        m = re.search("<s>(.*)<\/s>", line)
        if m:
            m2 = re.search("<tag \"(.*)\">(.*)<\/>", line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                sentence = sentence.replace(m2.group(0), '')
                d[meaning].append(sentence)

In [253]:
for key in d.keys():
    print len(d[key])

500
178
1252
361
66
11
