In [137]:
import pandas as pd

# Load the tab-separated file; the first line of the file is skipped and
# the four columns are named explicitly afterwards.
dataset = pd.read_csv(
    '2018-EI-oc-En-sadness-dev.txt',
    sep="\t",
    header=None,
    skiprows=1,
)
dataset.columns = ['date', 'text', 'emotion', 'level']


In [138]:
# Cleaning the text

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Build the cleaned corpus: for every tweet, split on whitespace, drop
# English stop words and join the Porter stems back into a single string.
ps = PorterStemmer()

# Build the stop-word set once. Rebuilding it for every token re-reads the
# NLTK word list each time and dominates the runtime of this cell.
stop_words = set(stopwords.words('english'))

# The stop-word test is applied to the raw (unstemmed, case-preserved) token,
# exactly as before; only tokens that survive the filter are stemmed.
corpus = [
    ' '.join(ps.stem(w) for w in tweet.split() if w not in stop_words)
    for tweet in dataset['text']
]


In [140]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned corpus, then encode
# the same corpus as a sparse TF-IDF matrix (one row per tweet).
tfidf_vectorizer = TfidfVectorizer(use_idf=True).fit(corpus)
tfidf_vectorizer_vectors = tfidf_vectorizer.transform(corpus)

# Printing the sparse matrix lists its (row, column) -> weight entries.
print(tfidf_vectorizer_vectors)

(0, 438)	0.2877819856528303
  (0, 523)	0.3076000444894192
  (0, 1392)	0.3076000444894192
  (0, 472)	0.2877819856528303
  (0, 1535)	0.3076000444894192
  (0, 344)	0.3076000444894192
  (0, 1283)	0.3076000444894192
  (0, 761)	0.26281418431903225
  (0, 171)	0.2877819856528303
  (0, 1215)	0.2539027902990142
  (0, 1348)	0.3076000444894192
  (0, 2058)	0.22427648022225397
  (1, 180)	0.28446761885362387
  (1, 363)	0.28446761885362387
  (1, 340)	0.2661399361743082
  (1, 1922)	0.28446761885362387
  (1, 915)	0.25313623836902727
  (1, 583)	0.2661399361743082
  (1, 1086)	0.2218048578844307
  (1, 1901)	0.24304978673943
  (1, 655)	0.5689352377072477
  (1, 1032)	0.19650931363533206
  (1, 184)	0.28446761885362387
  (2, 851)	0.3158987057832549
  (2, 1903)	0.269576963752573
  :	:
  (394, 2114)	0.40472621005543613
  (394, 669)	0.37865050562274655
  (394, 202)	0.3457990031984914
  (394, 1169)	0.33407379446512053
  (394, 1588)	0.37865050562274655
  (394, 223)	0.33407379446512053
  (394, 1328)	0.34579900319849

In [141]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# NOTE: this CountVectorizer is only instantiated in this cell, never fitted;
# the feature matrix below comes from the TF-IDF vectors instead.
vectorizer = CountVectorizer(max_features=1500)

# Dense feature matrix built from the sparse TF-IDF representation.
X = tfidf_vectorizer_vectors.toarray()

# Target: the first character of each 'level' value, parsed as an integer.
y = np.array([int(level[0]) for level in dataset['level']])

print(X.shape, y.shape)

(397, 2162) (397,)


In [142]:
# Split dataset into training and test sets

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Report the shapes of the training split first, then the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(317, 2162) (317,)
(80, 2162) (80,)


In [147]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Fit a Gaussian Naive Bayes model on the training split and score it on the
# held-out test split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# The prediction must be computed in this cell: with this line disabled the
# metrics below either fail with NameError on a fresh kernel or silently
# score a stale y_pred left behind by another classifier's cell.
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

[[12 10  8  7]
 [ 8  3  9  2]
 [ 6  1  2  3]
 [ 4  0  3  2]]
Accuracy:  0.2375
Precision:  0.2736363636363636
Recall:  0.2375
F1:  0.24871725133921183


In [None]:
# SVM

from sklearn.svm import SVC

# Train a support vector classifier with default settings and predict the
# labels of the held-out test split.
classifier = SVC().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix and accuracy first, then the class-weighted metrics.
print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
for label, metric in (
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(label, metric(y_test, y_pred, average='weighted'))

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Train a logistic regression model with default settings and predict the
# labels of the held-out test split.
classifier = LogisticRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix and accuracy first, then the class-weighted metrics.
print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
for label, metric in (
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(label, metric(y_test, y_pred, average='weighted'))

In [None]:
# Perceptron

from sklearn.linear_model import Perceptron

# Train a perceptron with default settings and predict the labels of the
# held-out test split.
classifier = Perceptron().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix and accuracy first, then the class-weighted metrics.
print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
for label, metric in (
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(label, metric(y_test, y_pred, average='weighted'))

In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and score it on the held-out
# test split.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are reproducible across runs.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and score it on the held-out
# test split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets, and therefore the reported metrics, are
# reproducible across runs.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('F1: ', f1_score(y_test, y_pred, average='weighted'))

In [None]:
# Simple test

import os
import numpy as np

# Classify a single tweet typed by the user with the most recently fitted
# `classifier`.
tweet = input("Enter tweet: ")
# Replace every non-letter with a space, then stem each remaining token.
# (The substitution previously read an undefined name instead of `tweet`.)
tweet = re.sub('[^a-zA-Z]', ' ', tweet).split()
tweet = ' '.join([ps.stem(w) for w in tweet])
# Encode with the fitted TF-IDF vectorizer: the classifiers were trained on
# its features, whereas the CountVectorizer `vectorizer` was never fitted.
# A separate name is used so the training matrix `X` is not overwritten and
# earlier cells can still be re-run.
X_tweet = tfidf_vectorizer.transform([tweet]).toarray()

print(X_tweet.shape)
print(X_tweet)

print("Sentiment level: ", classifier.predict(X_tweet))