In [1]:
import sklearn
import pickle
import random

In [2]:
categories = ['ANGER', 'FEAR', 'JOY', 'LOVE', 'SADNESS', 'SURPRISE']

In [3]:
with open('raw.pickle', 'r') as f:
    data = pickle.load(f)

In [4]:
data_newformat = {'data':[], 'target_names':[], 'target':[]}
data_test = {'data':[], 'target_names':[], 'target':[]}
for i in range(len(data['info'])):
    if random.uniform(0,1) <0.2:
        data_test['target'].append(data['info'][i]['label'])
        data_test['target_names'].append(categories[data['info'][i]['label']])
        data_test['data'].append(str(data['texts'][i]))
    else:
        data_newformat['target'].append(data['info'][i]['label'])
        data_newformat['target_names'].append(categories[data['info'][i]['label']])
        data_newformat['data'].append(str(data['texts'][i]))

In [None]:
data_newformat

In [5]:
twenty_train = data_newformat

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train['data'])
X_train_counts[0]

<1x26918 sparse matrix of type '<type 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(48008, 26918)

In [8]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train['target'])

In [9]:
docs_new = ['I am afraid of you', 'I want to go to school', 'I want you', 'that is very interesting, I am coming with you']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [10]:
predicted=clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print("%r => %s" % (doc, twenty_train['target_names'][category]))

'I am afraid of you' => SADNESS
'I want to go to school' => SADNESS
'I want you' => SADNESS
'that is very interesting, I am coming with you' => FEAR


In [11]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [12]:
text_clf.fit(twenty_train['data'], twenty_train['target'])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [13]:
import numpy as np
doc_test = data_test['data']
predicted = text_clf.predict(doc_test)
np.mean(predicted == data_test['target'])

0.8624916611074049

In [14]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None))
                    ])
text_clf.fit(twenty_train['data'], twenty_train['target'])
predicted = text_clf.predict(doc_test)
np.mean(predicted == data_test['target'])

0.9142761841227485

In [18]:
doc_test

['im not feeling joyful i know there are some songs that i can listen to that help change my emotional disposition in a way that merely reading words on a page cannot',
 'i get the overflow of hormones that cause me to feel annoyed and get mad over every little thing i see and there are times when i feel sad just looking at the color that looks the same as my exs underwear just a stupid example it never really happen though',
 'i get a view of berkeley s picturesque elmwood neighborhood a bagel and a mocha and the place is big enough that i never feel like i m taking up valuable real estate i can loiter as long as i want',
 'i feel anxious because i m too incompetent to find them',
 'i love to walk until the end of the road to find the best food for my stomach hehe i feel so impressed with all them because it is not easy to keep on surviving in this area of business',
 'i love that its near amazing beaches and on a little patch of land a bit like an island with one long road in and out

In [30]:
docs_new = ['....We are a long way from conclusion on North Korea, maybe things will work out, and maybe they won’t - only time will tell....But the work I am doing now should have been done a long time ago!']
predicted=text_clf.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print("%r => %s" % (doc, twenty_train['target_names'][category]))


'....We are a long way from conclusion on North Korea, maybe things will work out, and maybe they won\xe2\x80\x99t - only time will tell....But the work I am doing now should have been done a long time ago!' => JOY


In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'vector__ngram_range':[(1,1), (1,2)],
              'tfidf__use_idf':(True, False),
              'clf__alpha':(1e-2, 1e-3),}

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

In [None]:
gs_clf=gs_clf.fit(twenty_train['data'][:400], twenty_train['target'][:400])