In [1]:
import sklearn

In [7]:
data = []
targets = []
emotions = ['ANGER', 'FEAR', 'JOY', 'LOVE', 'SADNESS', 'SURPRISE']

for emotion in emotions:
    with open('NLP_Training_Data/{0}'.format(emotion)) as f:
        content = f.readlines()
    for x in content:
        data.append(x.strip())
        targets.append(emotions.index(emotion))
    print('[{0}]Number of entry: {1} '.format(emotion, len(content)))

[ANGER]Number of entry: 57317 
[FEAR]Number of entry: 47712 
[JOY]Number of entry: 141067 
[LOVE]Number of entry: 34554 
[SADNESS]Number of entry: 121188 
[SURPRISE]Number of entry: 14972 
['i have been hearing rumours that you have not been allowing people in your room he said this sentence really shot my heart i failed to find the right words', 'i was at a friends place for lunch and she fell ill and vomited', 'the smell of garlic in rushhour bus', 'my father died my natural father contacted me a year after my father died and this angered me more as i thought he should be dead and not my father', 'years ago i served in the army once a collegue denounced me because of a delict', 'letter words on a tv programme roche while we were entertaining conservative relations who wanted to see the first episode', 'a group of youngsters dressed in fads talked foul language on a bus they also insulted the pedestrians on the road and were impolite to the passengers of the bus', 'a higher status col

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, targets, shuffle=True, test_size=0.2,random_state=11)

['i realized how mean ive been to you when your feeling for me was so sincere', 'i have to admit i feel pretty much like that every time i read someones blog', 'i feel ok i feel good', 'i do want to share with you my personal discovery and though it is very small and probably insignificant i feel tramendous joy as i finally resolved the issue that was bothering me for quite some time', 'i am not a very psychic kind of person but i could definitely feel an unpleasant energy coming off this man', 'i feel isolated find company and if i am tired take a break', 'i emailed kendra smith who i know is going to be a fabulous trainer and though her roster was already overflowing she must have heard my urgency to feel vital again and agreed to meet me for a series of sessions leading up to the retreat', 'i know you cant just ged rid of your feelings but seriously i dont see your parents supporting you dating a guy who s their age', 'i feel the warmth of the amazed smile because', 'i left feeling 

In [35]:
for i in range(6):
    print(emotions[i], ':', y_train.count(i))

ANGER : 45865
FEAR : 38073
JOY : 112971
LOVE : 27689
SADNESS : 96835
SURPRISE : 12015


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts[0]

<1x67786 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(333448, 67786)

In [16]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [30]:
docs_new = ['i left feeling highly impressed and motivated']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [31]:
predicted=clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print("%r => %s" % (doc, emotions[category]))

'i left feeling highly impressed and motivated' => JOY


In [36]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [37]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [39]:
import numpy as np
doc_test = X_test
predicted = text_clf.predict(doc_test)
np.mean(predicted == y_test)

0.73998944363139085

In [40]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None))
                    ])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(doc_test)
np.mean(predicted == y_test)

0.87929752165255148

In [56]:
docs_new =['what? I cannot believe that']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted=text_clf.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print("%r => %s" % (doc, emotions[category]))

'what? I cannot believe that' => JOY


In [58]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range':[(1,1), (1,2)],
              'tfidf__use_idf':(True, False),
              'clf__alpha':(1e-2, 1e-3),}

In [59]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [None]:
gs_clf=gs_clf.fit(X_train, y_train)

In [61]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [62]:
gs_clf.best_score_

0.90000000000000002