In [1]:
import pandas as pd
import contractions
import numpy as np
import os
import re
import nltk

In [2]:
emotion_dict = {'happy': 1, 'sadness': 2, 'anger': 3, 'fear': 4, 'love': 5, 'surprise': 6}
def read_data(url):
    train = pd.read_csv(url)
    for index, row in train.iterrows():
        emotion = row['Emotion']
        train.loc[index, 'Emotion'] = emotion_dict[emotion]
    return(train)

In [3]:
train = read_data("https://raw.githubusercontent.com/ishantjuyal/Emotions-Detection/main/Data/Emotion_final.csv")
train.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,2
1,i can go from feeling so hopeless to so damned...,2
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,5
4,i am feeling grouchy,3


In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [5]:
def preprocess(df):
    for index, row in df.iterrows():
        filter_sentence = ''
        sentence = row['Text']
        sentence = contractions.fix(sentence)

        # Cleaning the sentence with regex
        sentence = re.sub(r'[^\w\s]', '', sentence)

        # Tokenization
        words = nltk.word_tokenize(sentence)

        # Stopwords removal
        words = [w for w in words if not w in stop_words]
        
        for words in words:
            filter_sentence = filter_sentence  + ' ' + str(lemmatizer.lemmatize(words)).lower()
        
        df.loc[index, 'Text'] = filter_sentence
    df = df[['Text', 'Emotion']]
    return(df)

In [6]:
train.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,2
1,i can go from feeling so hopeless to so damned...,2
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,5
4,i am feeling grouchy,3


In [7]:
train = preprocess(train)
train.head()

Unnamed: 0,Text,Emotion
0,feel humiliated,2
1,go feeling hopeless damned hopeful around som...,2
2,i grabbing minute post feel greedy wrong,3
3,ever feeling nostalgic fireplace know still p...,5
4,feeling grouchy,3


In [8]:
X = np.array(train['Text'])
Y = np.array(train['Emotion'])
Y = Y.astype('int')

In [9]:
from sklearn.pipeline import Pipeline
from sklearn import ensemble
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', ensemble.RandomForestClassifier()),
])

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [12]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

In [13]:
y_pred = pipeline.predict(X_test)

In [14]:
y_test = np.array(y_test)

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print("Accuracy score for training set:")
print(accuracy_score(np.array(y_train), pipeline.predict(X_train)))

print("\nAccuracy score for test set:")
print(accuracy_score(y_test, y_pred))

print("\nConfusion Matrix for test set:")
print(confusion_matrix(y_test, y_pred))

Accuracy score for training set:
0.9972621890837071

Accuracy score for test set:
0.880475302889096

Confusion Matrix for test set:
[[1285   35   19   10   30    5]
 [  45 1125   37   22    4    1]
 [  27   36  529   15    2    1]
 [  13   32   24  460    2   14]
 [  80    2    2    2  238    1]
 [  13    7    7   25    0  142]]


In [16]:
def preprocess_sentence(sentence):
    sentence = contractions.fix(sentence)
    sentence = re.sub(r'[^\w\s]', '', sentence)
    
    words = nltk.word_tokenize(sentence)
    
    words = [w for w in words if not w in stop_words]
    
    filter_sentence = ''
    for words in words:
        filter_sentence = filter_sentence  + ' ' + str(lemmatizer.lemmatize(words)).lower()
        
    return(filter_sentence)

In [17]:
def detect(text):
    text = preprocess_sentence(text)
    emotion_label = pipeline.predict([text])
    label_to_emotion = {1: "happy", 2: "sad", 3: "anger", 4: "fear", 5: "love", 6: "surprise"}
    return(label_to_emotion[emotion_label[0]])

In [18]:
detect("I feel tired, sore, and lonely. I just want somebody to hold")

'sad'