In [61]:
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('2018-EI-oc-En-sadness-dev.txt', sep="\t", header=None, skiprows=1)
dataset.columns = ['date', 'text', 'emotion', 'level']

print(dataset)

date                                               text  \
0    2018-En-01406  We have the opportunity to move away from the ...   
1    2018-En-00716  @badpostyoongi I know for a fact they'll eithe...   
2    2018-En-02394  @LeahRemini  A 'religion' that pushes people t...   
3    2018-En-00825  You could have over a hundred million follower...   
4    2018-En-03817  @elebelfiore Isn't it always about control and...   
..             ...                                                ...   
392  2018-En-00994  @JeffBezos @amazon Who can I talk to about bei...   
393  2018-En-03770       Brown envelopes can induce panic. #posttruth   
394  2018-En-02331  I wont rt things that might offend your faves ...   
395  2018-En-04002  and after i got home in such a horrible mood m...   
396  2018-En-02112                    hit by a sudden wave of sadness   

     emotion                                          level  
0    sadness                  0: no sadness can be inferred  
1    sadness 

In [62]:
# Cleaning the text

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

corpus = []
ps = PorterStemmer()

for index, row in dataset.iterrows():
    tweet = row['text']
    tweet = tweet.split()
    # stemming and stop word removal
    tweet = ' '.join([ps.stem(w) for w in tweet if not w in set(stopwords.words('english'))])
    corpus.append(tweet)

print(corpus)

 #solareclipse2017 #weird #annoy', '@livedart r.i.p friend, cant get head round this, #devast x', 'If go nurse, learn nice patient person, jesu 🙃', "work go kill 5 o'clock nowher near 😟", 'total scare upcom result .', "@iaindal @lbc the word wors 'moist' opinion 'scrape'. #ugh #shudder", '@timmyldn look dreadful... 👀', '#teamchristin bc tana done provok tweet shadi shit tri hard bitch beg fight', '@620wtmj serious @620wtmj !? thi news you?!? #sad \\n\\nwhi focu import issues??', '@nitashakaul @snehakaul2kaul beauti dear, thanks,everybodi know benefit india &amp; goi done terror attack', 'lot univers activ discourag even one part-tim job neg impact studies--', '@keithrothfu your silence=complicity. obvious problem russian interference. #sad #russianhack #treason', 'wanna sober u', "'we need something. someth must done!!!!!'\\n\\nyour anxieti amusing. noth done. despair.", "@theview joy comedian. she' bulli fat shame governor. great exampl set grandson.", "[ @thechicmystiqu ] — hurt badl

In [63]:
# Create bag-of-words model

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

vectorizer = CountVectorizer(max_features = 1500)
X = vectorizer.fit_transform(corpus).toarray()
y = []
for index, row in dataset.iterrows():
    y.append(int(row['level'][0]))
y = np.array(y)
print(vectorizer.get_feature_names())
print(X.shape, y.shape)

['00', '20', '25', '30', '50', '500', '620wtmj', 'absolut', 'act', 'actual', 'afraid', 'africa', 'ago', 'al', 'alarm', 'alas', 'aleesha', 'alex', 'alicet120', 'all', 'allman', 'allyiahsfac', 'almost', 'along', 'alonzo', 'alot', 'alreadi', 'already', 'alright', 'also', 'alway', 'am', 'amala', 'amaz', 'amazon', 'amelia', 'amen', 'american', 'amp', 'an', 'and', 'anger', 'angri', 'annoy', 'anoth', 'answer', 'anthem', 'anxieti', 'anyon', 'appar', 'appear', 'appreci', 'are', 'armi', 'around', 'arsehol', 'ass', 'at', 'aw', 'away', 'awful', 'back', 'bad', 'bc', 'be', 'beauti', 'becom', 'bed', 'believ', 'best', 'better', 'bewar', 'big', 'birthday', 'bitter', 'blah', 'bleak', 'blue', 'book', 'boy', 'bring', 'bro', 'broke', 'broken', 'brown', 'bt', 'build', 'bulli', 'bum', 'burn', 'burrito', 'burst', 'burstcoin', 'busi', 'buster_espn', 'but', 'butterscotch', 'buy', 'buyer', 'bxchpls03', 'by', 'caffeine', 'caleb', 'call', 'calls', 'callummay', 'came', 'can', 'cancel', 'cant', 'cantshakethi', 'capi

In [64]:
# Split dataset into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(317, 1500) (317,)
(80, 1500) (80,)


In [77]:
# Fit Naive Bayes to the training set

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [78]:
# Predict test set results

y_pred = classifier.predict(X_test)

print(y_pred)

[2 3 0 2 3 0 3 0 3 2 3 3 0 2 0 1 3 3 0 1 0 2 0 1 2 1 3 1 1 0 1 1 3 2 2 3 3
 1 2 2 2 2 0 1 1 3 1 2 1 0 2 0 1 2 0 2 3 0 0 1 0 0 2 1 3 3 0 3 0 2 3 0 0 0
 2 2 0 3 2 0]


In [70]:
# Generate metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# confusion matrix
print(confusion_matrix(y_test, y_pred))

# accuracy
print('Accuracy: ', accuracy_score(y_test, y_pred))

# precision
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))

# recall
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))

# f1
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[10 12  8  7]
 [ 7  1  8  6]
 [ 5  2  1  4]
 [ 2  1  4  2]]
Accuracy:  0.175
Precision:  0.175
Recall:  0.175
F1:  0.175


In [81]:
# SVM

from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[37  0  0  0]
 [22  0  0  0]
 [12  0  0  0]
 [ 8  0  1  0]]
Accuracy:  0.4625
Precision:  0.2166139240506329
Recall:  0.4625
F1:  0.29504310344827583
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[24  2 11  0]
 [14  2  4  2]
 [ 5  4  3  0]
 [ 6  0  2  1]]
Accuracy:  0.375
Precision:  0.355280612244898
Recall:  0.375
F1:  0.3416812015503875


In [83]:
# Perceptron

from sklearn.linear_model import Perceptron

classifier = Perceptron()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[19  7  9  2]
 [ 9  4  7  2]
 [ 6  1  2  3]
 [ 3  1  3  2]]
Accuracy:  0.3375
Precision:  0.3614010989010989
Recall:  0.3375
F1:  0.34353896103896103


In [84]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[25  5  7  0]
 [14  3  3  2]
 [ 4  2  5  1]
 [ 1  1  2  5]]
Accuracy:  0.475
Precision:  0.4522142379679145
Recall:  0.475
F1:  0.4533944356797637


In [91]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[33  1  2  1]
 [18  2  0  2]
 [ 8  0  2  2]
 [ 5  1  0  3]]
Accuracy:  0.5
Precision:  0.4931640625
Recall:  0.5
F1:  0.4217412974329108


In [93]:
# Simple test

import os
import numpy as np

tweet = input("Enter tweet: ")
tweet = re.sub('[^a-zA-Z]', ' ', rev).split()
tweet = ' '.join([ps.stem(w) for w in tweet])
X = vectorizer.transform([tweet]).toarray()

print(X.shape)
print(X)

print("Sentiment level: ", classifier.predict(X))

(1, 1500)
[[0 0 0 ... 0 0 0]]
Sentiment level:  [0]
