In [143]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer

### Loading Data

#### "HARD"

Build one dictionary that maps each sense of "hard" (HARD1, HARD2, HARD3) to the list of sentences tagged with that sense.

In [127]:
# Parse the sense-tagged "hard" corpus into d = {sense label: [sentences]}.
# A usable line wraps one sentence in <s>...</s> and marks the target word as
# <tag "SENSE">word</>. Lines missing either piece are skipped.
d = {'HARD1': [], 'HARD2': [], 'HARD3': []}

with open('new_train/hard.cor') as f:
    for line in f:  # iterate lazily instead of loading the file with readlines()
        # Raw-string patterns: same regexes as before, without relying on
        # Python passing the unknown escape "\/" through unchanged.
        m = re.search(r'<s>(.*)<\/s>', line)
        if not m:
            continue
        m2 = re.search(r'<tag "(.*)">(.*)<\/>', line)
        if not m2:
            continue

        meaning = m2.group(1)
        sentence = m.group(1)
        # Strip leftover structural markers first, then the whole tag. The tag
        # encloses the target word, so the word is removed from the sentence too.
        for marker in ('<s>', '</s>', '<p>', '</p>', '<@>', m2.group(0)):
            sentence = sentence.replace(marker, '')
        # An unknown sense label raises KeyError here, as before.
        d[meaning].append(sentence)

In [128]:
print len(d['HARD1'])
print len(d['HARD2'])
print len(d['HARD3'])

3455
502
376


#### "INTEREST"

In [144]:
# Parse the sense-tagged "interest" corpus into d = {sense label: [sentences]},
# with keys 'interest_1' .. 'interest_6'.
# NOTE(review): this rebinds `d`, so the HARD dictionary built earlier is no
# longer reachable; cells that index d['HARD1'] etc. only work if they are
# executed before this one.
d = {'interest_{}'.format(n): [] for n in xrange(1, 7)}

with open('new_train/interest.cor') as f:
    for line in f:  # iterate lazily instead of loading the file with readlines()
        # Raw-string patterns: same regexes as before, without relying on
        # Python passing the unknown escape "\/" through unchanged.
        m = re.search(r'<s>(.*)<\/s>', line)
        if not m:
            continue
        m2 = re.search(r'<tag "(.*)">(.*)<\/>', line)
        if not m2:
            continue

        meaning = m2.group(1)
        # Remove the whole <tag ...>word</> span; the target word goes with it.
        sentence = m.group(1).replace(m2.group(0), '')
        # An unknown sense label raises KeyError here, as before.
        d[meaning].append(sentence)

In [145]:
for key in d.keys():
    print key, len(d[key])

interest_5 500
interest_4 178
interest_6 1252
interest_1 361
interest_3 66
interest_2 11


In [146]:
# Flatten the per-sense sentence lists into one list of sentences.
# NOTE(review): the next cell computes the same list again; one of the two is redundant.
train_documents = [sentence for value in d.values() for sentence in value]

Create a single list containing all sentences, regardless of meaning.

In [147]:
# Concatenate every sense's sentences into a single flat list.
# Comprehension equivalent: [item for sublist in l for item in sublist]
train_documents = []
for sentences in d.values():
    train_documents.extend(sentences)

In [148]:
# Total number of sentences across all senses.
len(train_documents)

2368

In [149]:
# Inspect the Python type of a single document.
type(train_documents[0])

str

## Lemmatizing

In [150]:
import gensim.utils

In [151]:
def lem(documents):
    """Lemmatize each document and return one space-joined string per document.

    Every token yielded by ``gensim.utils.lemmatize`` has its last three
    characters removed before joining (presumably the ``/XX`` part-of-speech
    suffix -- confirm against the gensim version in use).

    documents: iterable of raw text documents.
    Returns a list with one lemmatized string per input document, in order.
    """
    def strip_tag(token):
        # Drop the trailing three characters of a lemmatizer token.
        return token[:-3]

    return [
        ' '.join([strip_tag(token) for token in gensim.utils.lemmatize(doc)])
        for doc in documents
    ]

In [152]:
# Lemmatize the whole training corpus (one output string per sentence).
lem_train_documents = lem(train_documents)

In [153]:
# Peek at the first lemmatized sentence.
lem_train_documents[0]

'bolduc vice chairman grace co hold energy service company be elect director'

### TF-IDF

In [154]:
def Tfidf(documents):
    """Fit a TF-IDF model on ``documents`` and vectorize those same documents.

    The vectorizer is configured with the built-in English stop-word list and
    word n-grams of length one to three.

    Returns a ``(vectors, vectorizer)`` tuple: the transformed documents and
    the fitted ``TfidfVectorizer``, which is needed to transform new text.
    """
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    tfidf.fit(documents)
    return tfidf.transform(documents), tfidf

In [155]:
# TF-IDF features for the lemmatized corpus.
# NOTE(review): `vectors` and `vectorizer` are rebound further down from the
# un-lemmatized `resamp` list, and the classifier is fitted on those later
# values -- the lemmatized features computed here are not what it trains on.
vectors, vectorizer = Tfidf(lem_train_documents)

In [156]:
# Show the fitted vectorizer's parameters.
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [157]:
# Show the shape and sparsity of the TF-IDF matrix.
vectors

<2368x50531 sparse matrix of type '<type 'numpy.float64'>'
	with 83625 stored elements in Compressed Sparse Row format>

## Multinomial NB

In [158]:
from sklearn.naive_bayes import MultinomialNB

### Resample (don't need)

In [159]:
# Collect the sentences of each "interest" sense and concatenate them in sense
# order (1..6). Labels built for these sentences must follow the same order.
# Disabled earlier experiment: down-sampling HARD1 to 700 sentences with
#   list(np.random.choice(d['HARD1'], 700, replace=False))
i1, i2, i3, i4, i5, i6 = [d['interest_{}'.format(n)] for n in xrange(1, 7)]

resamp = i1 + i2 + i3 + i4 + i5 + i6

In [160]:
# train_documents == resamp
print len(train_documents), len(resamp)
print train_documents[100]
print '-----'
print resamp[100]

2368 2368
 ``  it  's horrible to say , but  it  's unfortunate that  earthquake was n't  in  phoenix  --  it  might have knocked out  some  of  our empty buildings  , '' said  c.w. jackson  ,  a prominent arizona businessman  with      in  real estate  , banking and  many other businesses  . 
-----
 he  predicts  a downward move  in  dollar-mark trade  and  a less dramatic slip  in  dollar-yen  , noting that  there  continues to be  a large pool  of  japanese investor     in  u.s. securities  ,  which  could provide  a solid base  for  the dollar  at around  140 yen  . 


In [161]:
# Number of sentences in the sense-ordered list.
len(resamp)

2368

In [162]:
# Re-fit TF-IDF on the sense-ordered sentences. `resamp` holds the sentences as
# parsed (not passed through lem), and this rebinds `vectors` and `vectorizer`.
vectors, vectorizer = Tfidf(resamp)

In [163]:
# Show the shape and sparsity of the re-fitted TF-IDF matrix.
vectors

<2368x58122 sparse matrix of type '<type 'numpy.float64'>'
	with 92159 stored elements in Compressed Sparse Row format>

In [164]:
# One label per sentence, following the same sense order (i1..i6) in which the
# sentences were concatenated: 'INTEREST1' repeated len(i1) times, and so on.
sense_groups = (i1, i2, i3, i4, i5, i6)
y = np.concatenate(
    [np.repeat('INTEREST{}'.format(k), len(group))
     for k, group in enumerate(sense_groups, 1)],
    axis=0)

In [165]:
# Multinomial Naive Bayes with smoothing parameter alpha = 0.12, fitted on the
# current TF-IDF matrix and the per-sentence sense labels.
NB_ALPHA = 0.12
NB = MultinomialNB(alpha=NB_ALPHA)
NB.fit(vectors, y)

MultinomialNB(alpha=0.12, class_prior=None, fit_prior=True)

#### Predict

In [170]:
# Classify one ad-hoc sentence: vectorize it with the training vectorizer,
# then ask the fitted model for its label.
s = "I work hard"
NB.predict(vectorizer.transform([s]))

array(['INTEREST6'], 
      dtype='|S9')

In [167]:
def predict_NB(sentence, vectorizer, nb):
    """Predict the label of a single sentence.

    sentence:   raw text to classify.
    vectorizer: fitted vectorizer; must be the one used at training time so the
                feature columns line up with what the model saw.
    nb:         fitted classifier exposing ``predict``.

    Returns whatever ``nb.predict`` returns for the one-row input.
    """
    return nb.predict(vectorizer.transform([sentence]))

In [168]:
# Same prediction as the ad-hoc cell, this time through the helper.
predict_NB(s, vectorizer, NB)

array(['INTEREST6'], 
      dtype='|S9')

#### Scoring

In [67]:
# Hard data
pred_score_hard1, pred_score_hard2, pred_score_hard3 = 0, 0, 0

for sent in d['HARD1']:

    if predict_NB(sent, vectorizer, NB) == 'HARD1':
        pred_score_hard1 += 1
print "HARD1 score: ", pred_score_hard1, '/', len(d['HARD1'])

for sent in d['HARD2']:
    if predict_NB(sent, vectorizer, NB) == 'HARD2':
        pred_score_hard2 += 1
print "HARD2 score: ", pred_score_hard2, '/', len(d['HARD2'])

for sent in d['HARD3']:
    if predict_NB(sent, vectorizer, NB) == 'HARD3':
        pred_score_hard3 += 1
print "HARD3 score: ", pred_score_hard3, '/', len(d['HARD3'])

HARD1 score:  3454 / 3455
HARD2 score:  495 / 502
HARD3 score:  376 / 376


In [169]:
# Interest data
pred_score_hard1, pred_score_hard2, pred_score_hard3 = 0, 0, 0

for sent in d['interest_1']:

    if predict_NB(sent, vectorizer, NB) == 'interest_1':
        pred_score_hard1 += 1
print "interest_1 score: ", pred_score_hard1, '/', len(d['interest_1'])

# for sent in d['HARD2']:
#     if predict_NB(sent, vectorizer, NB) == 'HARD2':
#         pred_score_hard2 += 1
# print "HARD2 score: ", pred_score_hard2, '/', len(d['HARD2'])

# for sent in d['HARD3']:
#     if predict_NB(sent, vectorizer, NB) == 'HARD3':
#         pred_score_hard3 += 1
# print "HARD3 score: ", pred_score_hard3, '/', len(d['HARD3'])

interest_1 score:  0 / 361


#### Cross validation

In [68]:
# `sklearn.cross_validation` was superseded by `sklearn.model_selection` and
# later removed; prefer the new location and fall back for old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

In [69]:
# Cross-validated accuracy with the library's default fold setup
# (the recorded output shows three fold scores).
cross_val_score(NB, vectors, y, scoring='accuracy')

array([ 0.77316736,  0.78947368,  0.78031878])

In [70]:
# The target has more than two classes, so the averaging must be named: the
# bare 'precision' scorer is defined for binary targets and is rejected for
# multiclass data by later scikit-learn releases. Support-weighted averaging
# is requested explicitly.
cross_val_score(NB, vectors, y, scoring='precision_weighted')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 0.77787688,  0.78712895,  0.782662  ])

In [71]:
# The target has more than two classes, so the averaging must be named: the
# bare 'recall' scorer is defined for binary targets and is rejected for
# multiclass data by later scikit-learn releases. Support-weighted averaging
# is requested explicitly (weighted recall coincides with accuracy, matching
# the values recorded for the accuracy run).
cross_val_score(NB, vectors, y, scoring='recall_weighted')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


array([ 0.77316736,  0.78947368,  0.78031878])