In [2]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,classification_report
from nltk.tokenize import RegexpTokenizer

'''Reading the file'''
df_train = pd.read_csv('train.txt', names=['Text', 'Emotion'], sep=';')

'''It splits strings into tokens'''
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
'''used to convert a collection of text documents to a vector of term/token counts.'''
cv = CountVectorizer(stop_words='english', ngram_range=(1,1), tokenizer = token.tokenize)

'''It scale's the training data and also learn the scaling parameters of that data'''
text = cv.fit_transform(df_train['Text'])
'''splits datas'''
X_train, X_test, y_train, y_test = train_test_split(text,df_train['Emotion'], test_size=0.20, random_state=5)

'''Naive Bayes classifier'''
mnb = MultinomialNB()
'''Fitting data to model'''
mnb.fit(X_train, y_train)
'''Predicting values'''
ynb=mnb.predict(X_test)
ynb

array(['joy', 'sadness', 'fear', ..., 'love', 'joy', 'joy'], dtype='<U8')

In [3]:
print('###### confusion matrix Naive Bayes Classifier  ######')

cmnb = confusion_matrix(y_test, ynb)
print(cmnb)



###### confusion matrix Naive Bayes Classifier  ######
[[ 287   13   33    0   77    1]
 [  19  245   57    1   78    2]
 [   9    7 1009    8   55    0]
 [   6    2  127   91   42    0]
 [  12    8   41    1  854    1]
 [   1   18   51    1   29   14]]


In [4]:
print('###### classification report Naive Bayes Classifier ######')
print(classification_report(y_test, ynb))



###### classification report Naive Bayes Classifier ######
              precision    recall  f1-score   support

       anger       0.86      0.70      0.77       411
        fear       0.84      0.61      0.71       402
         joy       0.77      0.93      0.84      1088
        love       0.89      0.34      0.49       268
     sadness       0.75      0.93      0.83       917
    surprise       0.78      0.12      0.21       114

    accuracy                           0.78      3200
   macro avg       0.81      0.60      0.64      3200
weighted avg       0.79      0.78      0.76      3200



In [5]:
print('Naive Bayes Classifier ')
print('\n accuracy  :', accuracy_score(y_test, ynb))

recall = np.diag(cmnb) / np.sum(cmnb, axis = 1)
precision = np.diag(cmnb) / np.sum(cmnb, axis = 0)

print('\n recall  :',np.mean(recall))
print('\n precision :',np.mean(precision))

Naive Bayes Classifier 

 accuracy  : 0.78125

 recall  : 0.6047993742428704

 precision : 0.8138950548399304


In [6]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(max_iter=1000)
logisticRegr.fit(X_train, y_train)
ylr=logisticRegr.predict(X_test)

In [7]:
print('###### confusion matrix Logistic regression######')

cml = confusion_matrix(y_test, ylr)
print(cml)

###### confusion matrix Logistic regression######
[[ 356   11   13    1   30    0]
 [  17  329   20    3   27    6]
 [   5    9 1006   38   28    2]
 [   2    2   55  205    4    0]
 [  23   14   30    3  844    3]
 [   2   19   10    2    3   78]]


In [8]:
print('###### classification report Logistic regression######')
print(classification_report(y_test, ylr))

###### classification report Logistic regression######
              precision    recall  f1-score   support

       anger       0.88      0.87      0.87       411
        fear       0.86      0.82      0.84       402
         joy       0.89      0.92      0.91      1088
        love       0.81      0.76      0.79       268
     sadness       0.90      0.92      0.91       917
    surprise       0.88      0.68      0.77       114

    accuracy                           0.88      3200
   macro avg       0.87      0.83      0.85      3200
weighted avg       0.88      0.88      0.88      3200



In [9]:
print('Logistic regression')
print('\n accuracy  :', accuracy_score(y_test, ylr))

recall = np.diag(cml) / np.sum(cml, axis = 1)
precision = np.diag(cml) / np.sum(cml, axis = 0)

print('\n recall  :',np.mean(recall))
print('\n precision :',np.mean(precision))

Logistic regression

 accuracy  : 0.880625

 recall  : 0.8297914742944702

 precision : 0.8690857265090646


In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
yrf = clf.predict(X_test)
yrf

array(['joy', 'sadness', 'fear', ..., 'love', 'joy', 'joy'], dtype=object)

In [11]:
print('###### confusion matrix Random Forest######')

cmrf = confusion_matrix(y_test, yrf)
print(cmrf)

###### confusion matrix Random Forest######
[[367   6   9   1  27   1]
 [ 32 326   8   2  21  13]
 [ 10  12 970  54  40   2]
 [  2   1  37 223   5   0]
 [ 31  18  24   2 839   3]
 [  0  24   7   1   4  78]]


In [12]:
print('###### classification report Random Forest ######')
print(classification_report(y_test, yrf))

###### classification report Random Forest ######
              precision    recall  f1-score   support

       anger       0.83      0.89      0.86       411
        fear       0.84      0.81      0.83       402
         joy       0.92      0.89      0.91      1088
        love       0.79      0.83      0.81       268
     sadness       0.90      0.91      0.91       917
    surprise       0.80      0.68      0.74       114

    accuracy                           0.88      3200
   macro avg       0.85      0.84      0.84      3200
weighted avg       0.88      0.88      0.88      3200



In [13]:
print('Random Forest')
print('\n accuracy  :', accuracy_score(y_test, ylr))

recall = np.diag(cmrf) / np.sum(cmrf, axis = 1)
precision = np.diag(cmrf) / np.sum(cmrf, axis = 0)

print('\n recall  :',np.mean(recall))
print('\n precision :',np.mean(precision))

Random Forest

 accuracy  : 0.880625

 recall  : 0.8377789217621977

 precision : 0.846767063519387


In [18]:
from sklearn.tree import DecisionTreeClassifier
cdt = DecisionTreeClassifier()
cdt.fit(X_train, y_train)
ydt = cdt.predict(X_test)
ydt

array(['joy', 'sadness', 'fear', ..., 'love', 'joy', 'sadness'],
      dtype=object)

In [19]:
print('###### confusion matrix Decision tree######')

cmdt = confusion_matrix(y_test, ydt)
print(cmdt)

###### confusion matrix Random Forest######
[[368  10   5   3  24   1]
 [ 27 349   4   1  13   8]
 [ 13  17 881  58 116   3]
 [  2   2  44 218   2   0]
 [ 40  22  16   5 825   9]
 [  1  20  11   1   3  78]]


In [20]:
print('###### classification report Decision tree ######')
print(classification_report(y_test, ydt))

###### classification report Random Forest ######
              precision    recall  f1-score   support

       anger       0.82      0.90      0.85       411
        fear       0.83      0.87      0.85       402
         joy       0.92      0.81      0.86      1088
        love       0.76      0.81      0.79       268
     sadness       0.84      0.90      0.87       917
    surprise       0.79      0.68      0.73       114

    accuracy                           0.85      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.85      0.85      0.85      3200



In [21]:
print('Decision Tree')
print('\n accuracy  :', accuracy_score(y_test, ydt))

recall = np.diag(cmdt) / np.sum(cmdt, axis = 1)
precision = np.diag(cmdt) / np.sum(cmdt, axis = 0)

print('\n recall  :',np.mean(recall))
print('\n precision :',np.mean(precision))

Random Forest

 accuracy  : 0.8496875

 recall  : 0.8284325313945186

 precision : 0.8255090640943088
