In [7]:
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('2018-EI-oc-En-sadness-dev.txt', sep="\t", header=None, skiprows=1)
dataset.columns = ['date', 'text', 'emotion', 'level']


In [8]:
# Cleaning the text

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import emoji

corpus = []
ps = PorterStemmer()

for index, row in dataset.iterrows():
    tweet = row['text']

    #handle users
    tweet = re.sub('@.*', '@user', tweet) 

    
    
    tweet = tweet.split()
    #tweet = nltk.word_tokenize(tweet)
    
    # stemming and stop word removal
    tweet = ' '.join([ps.stem(w) for w in tweet if not w in set(stopwords.words('english'))])
    
    #emojis
    tweet = emoji.demojize(tweet)
    
    corpus.append(tweet)

print(corpus)


booti', '@user', '@user', 'fuck small talk tell age start disappoint parent', "mom' work discount riot fest ticket $147/3 day i'm #sadden", '@user', "462 appear PL 200 goals!! 2.31 game per goal. that' nearli goal everi 2 game #crap :face_with_tears_of_joy::face_with_tears_of_joy: :face_with_rolling_eyes::face_with_rolling_eyes::thinking_face::thinking_face: #rooney", "Do peopl notic say 'you'r pretty' I make-up on. Is offense! \\n&amp; I take note never say I don't.", 'peplamb: stevenfurtick #comfort #mourn, To consol #mourn #zion, To give #beauti #ashes, the #oil #joy for…', 'alaina I 90 day snap streak. so?', 'unit airlin newark need kiosk peopl miss cut time. and hire ppl too. #horribl', 'If @user', '@user', "I feel exhaust ...\\ni tri good job, appar god see right. i'm weary! I exist! :worried_face::broken_heart::loudspeaker:\\n#exist #weari #misonorotta", '@user', '@user', '@user', '@user', '@user', 'the pessimist complain wind; optimist expect change; realist adjust sails.', '@u

In [9]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer=TfidfVectorizer(use_idf=True)
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(corpus)

print(tfidf_vectorizer_vectors)

(0, 278)	0.28159131428253475
  (0, 338)	0.3009830535592798
  (0, 946)	0.3009830535592798
  (0, 303)	0.3009830535592798
  (0, 1042)	0.3009830535592798
  (0, 224)	0.3009830535592798
  (0, 866)	0.3009830535592798
  (0, 508)	0.25716061207447155
  (0, 114)	0.3009830535592798
  (0, 820)	0.2678326563067407
  (0, 917)	0.3009830535592798
  (0, 1388)	0.2410685199167384
  (1, 1359)	1.0
  (2, 1359)	1.0
  (3, 1085)	0.22656138633216732
  (3, 207)	0.3279711119122229
  (3, 1379)	0.24958718646104283
  (3, 1343)	0.2802192711008592
  (3, 959)	0.27071771263949884
  (3, 502)	0.3279711119122229
  (3, 1213)	0.2557253657236868
  (3, 468)	0.3068405857337669
  (3, 791)	0.3279711119122229
  (3, 610)	0.3279711119122229
  (3, 259)	0.29184823881795485
  :	:
  (394, 441)	0.38190914014247707
  (394, 1429)	0.38190914014247707
  (394, 899)	0.35730349412529966
  (394, 135)	0.33984551044511535
  (394, 788)	0.33984551044511535
  (394, 1284)	0.32630404627259624
  (394, 1076)	0.35730349412529966
  (394, 148)	0.3398455104451

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

vectorizer = CountVectorizer(max_features = 1500)
#X = vectorizer.fit_transform(corpus).toarray()
X = tfidf_vectorizer_vectors.toarray()
y = []
for index, row in dataset.iterrows():
    y.append(int(row['level'][0]))
y = np.array(y)
#print(vectorizer.get_feature_names())
print(X.shape, y.shape)

(397, 1466) (397,)


In [11]:
# Split dataset into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(317, 1466) (317,)
(80, 1466) (80,)


In [19]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[12 13  6  6]
 [ 6  8  7  1]
 [ 2  7  3  0]
 [ 2  3  2  2]]
Accuracy:  0.3125
Precision:  0.37324046920821113
Recall:  0.3125
F1:  0.3261544611448673


In [21]:
# SVM

from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[37  0  0  0]
 [21  1  0  0]
 [12  0  0  0]
 [ 9  0  0  0]]
Accuracy:  0.475
Precision:  0.49161392405063287
Recall:  0.475
F1:  0.31895614692653673


In [22]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[37  0  0  0]
 [21  1  0  0]
 [12  0  0  0]
 [ 9  0  0  0]]
Accuracy:  0.475
Precision:  0.49161392405063287
Recall:  0.475
F1:  0.31895614692653673


# Perceptron

from sklearn.linear_model import Perceptron

classifier = Perceptron()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

In [23]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[31  2  2  2]
 [13  3  2  4]
 [ 9  1  1  1]
 [ 4  0  1  4]]
Accuracy:  0.4875
Precision:  0.4549441786283892
Recall:  0.4875
F1:  0.4256484295845998


In [16]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[36  0  1  0]
 [18  1  1  2]
 [12  0  0  0]
 [ 8  0  0  1]]
Accuracy:  0.475
Precision:  0.5375
Recall:  0.475
F1:  0.3426630434782608


In [24]:
# Simple test

import os
import numpy as np

tweet = input("Enter tweet: ")
tweet = re.sub('[^a-zA-Z]', ' ', rev).split()
tweet = ' '.join([ps.stem(w) for w in tweet])
X = vectorizer.transform([tweet]).toarray()

print(X.shape)
print(X)

print("Sentiment level: ", classifier.predict(X))

NameError: name 'rev' is not defined