In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD

### Loading Data

In [2]:
# Preview the corpus: for every line that carries both an <s>...</s> sentence
# and a <tag "SENSE">word</> annotation, print the tagged word, its sense
# label, and the sentence with the annotation removed.
with open('new_train/hard.cor') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        # Raw strings: "\/" is not a valid escape in a plain string literal,
        # and "/" needs no escaping inside a regular expression.
        m = re.search(r'<s>(.*)</s>', line)
        if m:
            m2 = re.search(r'<tag "(.*)">(.*)</>', line)
            if m2:
                meaning = m2.group(1)
                word = m2.group(2)
                sentence = m.group(1)
                # Drop the whole <tag ...>word</> span so the target word
                # itself does not appear in the printed text.
                sentence = sentence.replace(m2.group(0), '')
                # Single-argument print(...) gives the same output under the
                # Python 2 print statement and the Python 3 print function.
                print('%s %s' % (word, meaning))
                print(sentence)
                print('----')
                

HARD HARD1
 `` He may lose all popular support ,  but someone has to kill him to defeat him and that 's    to do. '' 
----
HARD HARD1
 Clever White House `` spin doctors '' are having a    time helping President Bush explain away the economic bashing that low-and middle-income workers are taking these days . 
----
HARD HARD1
 I find it    to believe that the Sacramento River will ever be quite the same ,  although I certainly wish that I'm wrong . 
----
HARD HARD1
 Now when you get bad credit data or are confused with another person ,  the    part in correcting the mistake is not even knowing where it is recorded ,  let alone having access . 
----
HARDER HARD1
 'A great share of responsibility for this national tragedy unquestionably lies with the president of the country. '-- Eduard Shevardnadze ,  former foreign minister ;  'We are so deep in this crisis that all this business about leaving the party ,  not leaving the party -- that will never get us out. '-- Natasha ,  a Moscow book

Create a dictionary that maps each sense of HARD (`HARD1`, `HARD2`, `HARD3`) to the list of its sentences

In [3]:
# Group the corpus sentences by sense label. `d` maps each sense
# ('HARD1', 'HARD2', 'HARD3') to the list of its sentences, each with the
# <tag ...>word</> annotation removed.
with open('new_train/hard.cor') as f:
    d = {'HARD1': [], 'HARD2': [], 'HARD3': []}

    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        # Raw strings: "\/" is not a valid escape in a plain string literal,
        # and "/" needs no escaping inside a regular expression.
        m = re.search(r'<s>(.*)</s>', line)
        if m:
            m2 = re.search(r'<tag "(.*)">(.*)</>', line)
            if m2:
                meaning = m2.group(1)
                sentence = m.group(1)
                sentence = sentence.replace(m2.group(0), '')
                # A sense label other than the three keys above raises
                # KeyError, so unexpected labels are not silently dropped.
                d[meaning].append(sentence)

In [4]:
# Number of sentences collected for each sense, one count per line.
# Single-argument print(...) behaves identically under Python 2 and Python 3.
for sense in ('HARD1', 'HARD2', 'HARD3'):
    print(len(d[sense]))

3455
502
376


Create a single list of documents containing all sentences, regardless of meaning

In [5]:
# Flatten the per-sense sentence lists into one flat list of documents.
# The order of the senses follows d.values().
train_documents = []
for sense_sentences in d.values():
    train_documents.extend(sense_sentences)

In [6]:
# Total number of sentences across all senses.
len(train_documents)

4333

In [7]:
# Check the element type of the flattened document list.
type(train_documents[0])

str

## Lemmatizing

In [8]:
import gensim.utils

In [9]:
def lem(documents):
    """Lemmatize every document and return the results as plain strings.

    Each document is passed to ``gensim.utils.lemmatize``. The last three
    characters of every returned token are dropped (presumably a "/XX"
    part-of-speech suffix -- confirm against the gensim documentation), and
    the remaining lemmas are joined with single spaces.

    Parameters
    ----------
    documents : iterable of str
        Raw text documents.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    return [
        ' '.join(token[:-3] for token in gensim.utils.lemmatize(text))
        for text in documents
    ]

In [11]:
# Lemmatize the full corpus with lem(); the result feeds the TF-IDF step.
lem_train_documents = lem(train_documents)

In [12]:
# Inspect the first lemmatized document.
lem_train_documents[0]

'lose popular support someone have kill defeat do'

### TF-IDF

In [13]:
def Tfidf(documents, ngram_range=(1, 3), stop_words='english'):
    """Fit a TF-IDF vectorizer on ``documents`` and vectorize them.

    Parameters
    ----------
    documents : iterable of str
        Text documents to learn the vocabulary from and to transform.
    ngram_range : tuple of (int, int), optional
        Passed through to ``TfidfVectorizer``; defaults to ``(1, 3)``.
    stop_words : str, list or None, optional
        Passed through to ``TfidfVectorizer``; defaults to ``'english'``.

    Returns
    -------
    vectors
        The matrix returned by the vectorizer for ``documents``.
    vectorizer : TfidfVectorizer
        The fitted vectorizer, to be reused on new text.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the matrix in one pass
    # instead of tokenising the corpus twice with fit() then transform().
    vectors = vectorizer.fit_transform(documents)
    return vectors, vectorizer

In [14]:
# Vectorize the lemmatized corpus; the fitted vectorizer is kept so that
# predict() can transform new sentences with the same vocabulary.
vectors, vectorizer = Tfidf(lem_train_documents)

In [15]:
# Display the fitted vectorizer's configuration.
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# Display the matrix summary (shape, dtype, number of stored elements).
vectors

<4333x73935 sparse matrix of type '<type 'numpy.float64'>'
	with 106933 stored elements in Compressed Sparse Row format>

### KMeans

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the TF-IDF vectors into three groups, one per expected sense.
# random_state pins the centroid initialisation: without it the clustering,
# and in particular the numbering of the clusters, can change on every run,
# which would invalidate the cluster ids hard-coded in the Scoring section.
km = KMeans(n_clusters=3, n_init=100, random_state=0)
km.fit(vectors)

In [None]:
# Cluster index assigned to each training document.
km.labels_

In [None]:
# Number of training documents that fell into each cluster.
np.bincount(km.labels_)

### Predict

In [None]:
# Example input for predict().
sentence = 'This rock is hard'

In [None]:
def predict(sentence):
    """Return the KMeans cluster predicted for a single raw sentence.

    The sentence goes through the same preprocessing as the training data:
    it is lemmatized with ``lem`` and then transformed with the vectorizer
    fitted during training. The vectorizer's vocabulary was learned from
    lemmatized text, so feeding it the raw sentence would look up surface
    forms that the vocabulary does not contain.

    Parameters
    ----------
    sentence : str
        Raw, un-lemmatized sentence.

    Returns
    -------
    The result of ``km.predict`` for the one-document matrix (a
    single-element array of cluster indices).
    """
    lemmatized = lem([sentence])
    vector = vectorizer.transform(lemmatized)  # use the vectorizer from training
    return km.predict(vector)

In [None]:
# Cluster predicted for the example sentence.
predict(sentence)

## Create Data Frame (maybe don't need this section)

In [None]:
def _sense_frame(sense):
    """Build a two-column frame ('meaning', 'text') for one sense in `d`."""
    texts = d[sense]
    # Use the sense label as a temporary index, then turn that index into an
    # ordinary column named 'meaning'.
    labelled = pd.DataFrame(texts, np.repeat(sense, len(texts)), columns=['text'])
    return labelled.reset_index().rename(columns={'index': 'meaning'})


df1 = _sense_frame('HARD1')
df2 = _sense_frame('HARD2')
df3 = _sense_frame('HARD3')


In [None]:
# Stack the per-sense frames into one labelled data set.
frames = [df1, df2, df3]
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without
# it each frame's own 0-based index is carried over, so the same label
# would refer to several rows.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Sentence counts per sense on one line, for comparison with the frame above.
# Single-argument print(...) behaves identically under Python 2 and Python 3.
print('%d %d %d' % (len(d['HARD1']), len(d['HARD2']), len(d['HARD3'])))

The labels look correct. Let's do a train/test split.

## Train test split (with same ratio in each class)

In [None]:
# sklearn.cross_validation was deprecated and later removed from
# scikit-learn; the same helpers live in sklearn.model_selection. Fall back
# to the old module only on releases that predate model_selection.
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_score

In [None]:
# train_df1 = df1.sample(frac=0.6)
# train_df2 = df2.sample(frac=0.6)
# train_df3 = df3.sample(frac=0.6)

In [None]:
# Per-sense features (the sentences, as a Series) and targets (the sense
# label repeated once per sentence).
X1, X2, X3 = (pd.Series(d[sense]) for sense in ('HARD1', 'HARD2', 'HARD3'))
y1, y2, y3 = (
    np.repeat(sense, len(d[sense])) for sense in ('HARD1', 'HARD2', 'HARD3')
)

## Scoring

In [None]:
# For each sense, count how many of its sentences predict() places in the
# cluster associated with that sense.
# NOTE(review): KMeans numbers its clusters arbitrarily, so this mapping is
# only meaningful for the particular fit it was read from -- re-check it
# whenever the model is refitted.
expected_cluster = {'HARD1': 0, 'HARD2': 2, 'HARD3': 1}
scores = {}

for sense in ('HARD1', 'HARD2', 'HARD3'):
    hits = 0
    for sent in d[sense]:
        if predict(sent) == expected_cluster[sense]:
            hits += 1
    scores[sense] = hits
    # Single-argument print(...) keeps the output identical under Python 2
    # and Python 3, including the blank separator line after each score.
    print('%s score:  %d / %d' % (sense, hits, len(d[sense])))
    print('')

# Keep the original per-sense names available for any later cells.
score_hard1, score_hard2, score_hard3 = (
    scores['HARD1'], scores['HARD2'], scores['HARD3']
)