In [86]:
#import pandas library
import pandas as pd

#read the dataset with name "Emotion_classify_Data.csv" and store it in a variable df

df =  pd.read_csv('Emotion_classify_Data.csv')
#print the shape of dataframe
print(df.shape)

#print top 5 rows
df.head()

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


# imbalanced data handling using under sampling

In [87]:
#check the distribution of Emotion
df.Emotion.value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [88]:
min_value = 1937
df_anger = df[df['Emotion'] == 'anger'].sample(min_value)
df_joy = df[df['Emotion'] == 'joy'].sample(min_value)
df_fear = df[df['Emotion'] == 'fear'].sample(min_value)

In [89]:
df = pd.concat([df_anger,df_joy,df_fear], axis=0)

In [90]:
df.Emotion.value_counts()

Emotion
anger    1937
joy      1937
fear     1937
Name: count, dtype: int64

In [91]:
#Add the new column "Emotion_num" which gives a unique number to each of these Emotions
#joy --> 0, fear --> 1, anger --> 2
df['Emotion_num'] = df.Emotion.map({
    'anger':0,
    'joy':1,
    'fear':2,
    
})

#checking the results by printing top 5 rows
df.head(5)

Unnamed: 0,Comment,Emotion,Emotion_num
2360,i know i shouldn t feel offended but i do,anger,0
1594,i feel very dissatisfied with myself,anger,0
2822,i am feeling that bitter sweetness that comes ...,anger,0
1859,i mustered up energy to feel christmassy i rem...,anger,0
3588,i am feeling spiteful,anger,0


In [92]:
df.Emotion_num.value_counts()

Emotion_num
0    1937
1    1937
2    1937
Name: count, dtype: int64

# modeling without preprocessing text data.

In [93]:

#import train-test split

from sklearn.model_selection import train_test_split


#Do the 'train-test' splitting with test size of 20%
#Note: Give Random state 2022 and also do the stratify sampling

In [101]:
X = df.Comment
y = df.Emotion_num
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

In [102]:
#print the shapes of X_train and X_test
X_train.shape, X_test.shape

((4648,), (1163,))

# Attempt 1:
using the sklearn pipeline module create a classification pipeline to classify the Data.

# note: 

using CountVectorizer with only trigrams.

use RandomForest as the classifier.

print the classification report.

In [105]:
#1. create a pipeline object

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range = (3, 3))),                       #using the ngram_range parameter 
    ('random_forest', (RandomForestClassifier()))         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.78      0.56       407
           1       0.53      0.28      0.36       368
           2       0.64      0.38      0.48       388

    accuracy                           0.49      1163
   macro avg       0.53      0.48      0.47      1163
weighted avg       0.53      0.49      0.47      1163



# Attempt 2:
using the sklearn pipeline module create a classification pipeline to classify the data.

# note:
using CountVectorizer with both unigram and bigrams.

use Multinomial Naive Bayes as the classifier.

print the classification report.

In [107]:
#import MultinomialNB from sklearn

from sklearn.naive_bayes import MultinomialNB

#1. create a pipeline object

clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('Mult', MultinomialNB())
])


#2. fit with X_train and y_train

clf.fit(X_train,y_train)

#3. get the predictions for X_test and store it in y_pred

prediction2 = clf.predict(X_test)
#4. print the classfication report
print(classification_report(y_test, prediction2))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88       407
           1       0.87      0.85      0.86       368
           2       0.84      0.88      0.86       388

    accuracy                           0.87      1163
   macro avg       0.87      0.87      0.87      1163
weighted avg       0.87      0.87      0.87      1163



# Attempt 3:
using the sklearn pipeline module create a classification pipeline to classify the Data.

# note:
using CountVectorizer with both unigram and Bigrams.

use RandomForest as the classifier.

print the classification report.

In [108]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier())
])


#2. fit with X_train and y_train

clf.fit(X_train,y_train)

#3. get the predictions for X_test and store it in y_pred

prediction3 = clf.predict(X_test)

#4. print the classfication report
print(classification_report(y_test,prediction3))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90       407
           1       0.85      0.96      0.90       368
           2       0.95      0.91      0.93       388

    accuracy                           0.91      1163
   macro avg       0.92      0.91      0.91      1163
weighted avg       0.92      0.91      0.91      1163



 # Attempt 3:
 using the sklearn pipeline module create a classification pipeline to classify the Data.

# Note:
using TF-IDF vectorizer for Pre-processing the text.

use RandomForest as the classifier.

print the classification report.

In [110]:

#import TfidfVectorizer from sklearn

from sklearn.feature_extraction.text import TfidfVectorizer

#1. create a pipeline object

clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

#2. fit with X_train and y_train

clf.fit(X_train,y_train)

#3. get the predictions for X_test and store it in y_pred

prediction4 =clf.predict(X_test)
#4. print the classfication report
print(classification_report(y_test, prediction4))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92       407
           1       0.89      0.92      0.90       368
           2       0.94      0.94      0.94       388

    accuracy                           0.92      1163
   macro avg       0.92      0.92      0.92      1163
weighted avg       0.92      0.92      0.92      1163



# Use text pre-processing to remove stop words, punctuations and apply lemmatization
nn

In [111]:
import spacy

nlp = spacy.load('en_core_web_sm')
#use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [112]:
# create a new column "preprocessed_comment" and use the utility function above to get the clean data
df['preprocessed_comment'] = df.Comment.apply(preprocess)
# this will take some time, please be patient

In [113]:
df.head()

Unnamed: 0,Comment,Emotion,Emotion_num,preprocessed_comment
2360,i know i shouldn t feel offended but i do,anger,0,know shouldn t feel offended
1594,i feel very dissatisfied with myself,anger,0,feel dissatisfied
2822,i am feeling that bitter sweetness that comes ...,anger,0,feel bitter sweetness come deep recess soul
1859,i mustered up energy to feel christmassy i rem...,anger,0,muster energy feel christmassy remember feel k...
3588,i am feeling spiteful,anger,0,feel spiteful


# Build a model with preprocessed text

In [115]:
#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
#Note: Use the preprocessed_Comment
from sklearn.model_selection import train_test_split
X = df.preprocessed_comment
y = df.Emotion_num
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

# Let's check the scores with our best model till now
Randomforest

# Attempt 1:
using the sklearn pipeline module create a classification pipeline to classify the Data.

# Note:
using CountVectorizer with both unigrams and bigrams.

use RandomForest as the classifier.

print the classification report.

In [119]:
#1. create a pipeline object

clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier())
])


#2. fit with X_train and y_train

clf.fit(X_train,y_train)

#3. get the predictions for X_test and store it in y_pred

preprocessed_pred = clf.predict(X_test)

#4. print the classfication report
print(classification_report(y_test, preprocessed_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       407
           1       0.93      0.95      0.94       368
           2       0.96      0.93      0.94       388

    accuracy                           0.94      1163
   macro avg       0.94      0.94      0.94      1163
weighted avg       0.94      0.94      0.94      1163



# Attempt 2:
sing the sklearn pipeline module create a classification pipeline to classify the data.

# Note:

using TF-IDF vectorizer for pre-processing the text.

use RandomForest as the classifier.

print the classification report.

In [120]:

#1. create a pipeline object

clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

#2. fit with X_train and y_train

clf.fit(X_train,y_train)

#3. get the predictions for X_test and store it in y_pred

tf_idf_prediction = clf.predict(X_test)

#4. print the classfication report
print(classification_report(y_test, tf_idf_prediction))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       407
           1       0.93      0.93      0.93       368
           2       0.95      0.95      0.95       388

    accuracy                           0.93      1163
   macro avg       0.93      0.93      0.93      1163
weighted avg       0.93      0.93      0.93      1163

