In [55]:
import numpy as np 
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import pickle
from sklearn.metrics import classification_report
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization

# Reading Data 

In [56]:
df = pd.read_excel('PROJECT_DATA_CLEANED.xlsx')
df.head()

Unnamed: 0,Text,label
0,omg star parivar awards take whole new level ...,surprise
1,hahaha guys rushed dorm room scrambled put cl...,fear
2,guess would good time start paper need least ...,joy
3,skipping philosophy best idea ever,joy
4,wait eaten coconut lmao not remember not decade,fear


In [57]:
df['Text'] = df['Text'].astype(str)

In [58]:
df = df[df['Text'].str.strip().apply(len) > 0]

#Importing Lemmatizer and Lemmatization

In [59]:
lemmatizer = WordNetLemmatizer()

In [60]:
def preprcs_text(text):
    
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_words)

In [61]:
df['lemmatized'] = df['Text'].apply(preprcs_text)

In [62]:
df.head(20)

Unnamed: 0,Text,label,lemmatized
0,omg star parivar awards take whole new level ...,surprise,omg star parivar award take whole new level co...
1,hahaha guys rushed dorm room scrambled put cl...,fear,hahaha guy rushed dorm room scrambled put clothes
2,guess would good time start paper need least ...,joy,guess would good time start paper need least d...
3,skipping philosophy best idea ever,joy,skipping philosophy best idea ever
4,wait eaten coconut lmao not remember not decade,fear,wait eaten coconut lmao not remember not decade
5,ashanaye far meanest person walk planet love t...,anger,ashanaye far meanest person walk planet love t...
6,death paternal grandmother close died cancer,sadness,death paternal grandmother close died cancer
7,weeks hard work mi ute presentation,neutral,week hard work mi ute presentation
8,yes curtain dream,fear,yes curtain dream
9,happiness lies joy achievement thrill creative...,joy,happiness lie joy achievement thrill creative ...


In [63]:
df.drop(['Text'],axis=1,inplace=True)
df.head()

Unnamed: 0,label,lemmatized
0,surprise,omg star parivar award take whole new level co...
1,fear,hahaha guy rushed dorm room scrambled put clothes
2,joy,guess would good time start paper need least d...
3,joy,skipping philosophy best idea ever
4,fear,wait eaten coconut lmao not remember not decade


#Creating Tokenization with Vocabulary of 10K words and Out of Words token

In [64]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")

In [65]:
tokenizer.fit_on_texts(df['lemmatized'])

In [66]:
word_index = tokenizer.word_index
print(word_index)



#Checking count of words in Our Vocabulary 

In [67]:
word_counts = tokenizer.word_counts

In [68]:
print(word_counts)



In [69]:
type(word_counts)

collections.OrderedDict

In [70]:
word_counts = dict(word_counts)

In [71]:
print(word_counts)



#Saving Tokenizer

In [18]:
with open('tokenizer10k.pkl', 'wb') as handle:
     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

#Priniting vector of tokenized words

In [72]:
sequences = tokenizer.texts_to_sequences(df['lemmatized'])

In [73]:
sequences

[[558, 832, 1, 1763, 74, 174, 94, 625, 2864, 9378, 1],
 [1701, 89, 1324, 4079, 172, 1, 134, 882],
 [257, 10, 31, 7, 140, 425, 47, 342, 162, 170, 99],
 [9379, 4348, 77, 309, 76],
 [251, 2130, 5994, 1269, 2, 80, 2, 1853],
 [1, 345, 1, 68, 333, 1041, 15, 135],
 [225, 1, 973, 212, 193, 980],
 [92, 147, 42, 4349, 1, 1960],
 [350, 4827, 359],
 [355, 559, 228, 2587, 3981, 1119, 925, 5024, 3875],
 [1081, 630, 51, 262],
 [253, 58, 16, 3982, 584, 43, 3151, 2276, 58],
 [82, 498, 1, 23, 151, 109, 92, 113, 5995, 658, 535, 1351, 1830, 151, 109, 17],
 [374, 165, 1, 23, 863, 79],
 [402, 381, 5, 21, 2343],
 [353, 1067, 56, 61, 105],
 [841,
  92,
  141,
  19,
  395,
  110,
  19,
  112,
  468,
  304,
  146,
  133,
  386,
  115,
  45,
  1787,
  190,
  90,
  132,
  10,
  2,
  27,
  19,
  73,
  127],
 [4217, 1764, 77, 163, 2, 9380, 87, 58, 2865, 58, 2161],
 [25, 77, 1648, 4506, 136],
 [386, 167, 8, 3983, 2183, 269, 507, 1702, 562, 364],
 [374, 3561, 167, 543, 2209, 73, 369, 5996],
 [9381, 260, 1854, 824, 19

#Finiding maximum length of this sequence element

In [74]:
max_len = max(len(seq) for seq in sequences)
print(max_len)

46


In [75]:
max_pad = 50

#Padding All vector with pre-padding with 0

In [None]:
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [77]:
padded_sequences

array([[   0,    0,    0, ..., 2864, 9378,    1],
       [   0,    0,    0, ...,    1,  134,  882],
       [   0,    0,    0, ...,  162,  170,   99],
       ...,
       [   0,    0,    0, ..., 2294,  290, 5313],
       [   0,    0,    0, ..., 1045, 3228, 1370],
       [   0,    0,    0, ...,   53, 1055, 3928]])

In [78]:
padded_sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 1701,   89, 1324, 4079,  172,    1,
        134,  882])

In [79]:
len(padded_sequences)

93007

In [80]:
len(df)

93007

#Creating GLOVE embedding Matrix

In [81]:
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

#Creating own matrix

In [82]:
vocab_size = len(word_index)

In [83]:
print(vocab_size)

34167


In [84]:
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [85]:
embedding_matrix.shape

(34168, 100)

In [86]:
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.19103999  0.17601     0.36919999 ... -0.59680003  0.080843
   0.27866   ]
 ...
 [-0.18628    -0.29938     0.44789001 ... -0.29466999 -0.045813
  -0.29076001]
 [-0.14793    -0.0043382   0.07197    ... -0.037839   -0.028775
  -0.61922997]
 [-0.22397999  0.41304001 -0.31589001 ... -0.21483    -0.025332
   0.32852   ]]


In [87]:
word_index

{'<OOV>': 1,
 'not': 2,
 'feel': 3,
 'feeling': 4,
 'like': 5,
 'angry': 6,
 'time': 7,
 'people': 8,
 'one': 9,
 'would': 10,
 'get': 11,
 'day': 12,
 'know': 13,
 'life': 14,
 'love': 15,
 'really': 16,
 'year': 17,
 'no': 18,
 'friend': 19,
 'surprised': 20,
 'going': 21,
 'thing': 22,
 'make': 23,
 'go': 24,
 'got': 25,
 'could': 26,
 'see': 27,
 'still': 28,
 'want': 29,
 'back': 30,
 'good': 31,
 'shocked': 32,
 'think': 33,
 'even': 34,
 'way': 35,
 'discouraged': 36,
 'much': 37,
 'can': 38,
 'little': 39,
 'never': 40,
 'god': 41,
 'work': 42,
 'today': 43,
 'man': 44,
 'u': 45,
 'first': 46,
 'need': 47,
 'something': 48,
 'felt': 49,
 'right': 50,
 'world': 51,
 'home': 52,
 'made': 53,
 'say': 54,
 'come': 55,
 'always': 56,
 'someone': 57,
 'quot': 58,
 'great': 59,
 'night': 60,
 'look': 61,
 'school': 62,
 'well': 63,
 'thought': 64,
 'baby': 65,
 'video': 66,
 'amp': 67,
 'person': 68,
 'girl': 69,
 'every': 70,
 'many': 71,
 'said': 72,
 'last': 73,
 'take': 74,
 'give

#Encoded LAbels

In [88]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [89]:
df['encoded_labels'] = label_encoder.fit_transform(df['label'])

In [90]:
print(label_encoder.classes_)

['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']


In [91]:
num_classes = len(label_encoder.classes_)
num_classes

7

In [92]:
y = to_categorical(df['encoded_labels'], num_classes=num_classes)

#Splitting Data into Train and Test(validation)

In [93]:
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

In [94]:
y_val

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#Model Architecture 


In [95]:
model = Sequential([
    Embedding(input_dim=vocab_size+1, output_dim=100, weights=[embedding_matrix],   #Adding Embedding Layer 
              input_length=max_len, trainable=True),
    LSTM(128, return_sequences=False, activation='tanh'),BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),BatchNormalization(),  # Fully connected layer
    Dropout(0.3),
    Dense(64, activation='relu'), BatchNormalization(), # Fully connected layer
    Dropout(0.3),
    Dense(32, activation='relu'),BatchNormalization(),  # Fully connected layer
    Dropout(0.3),
    Dense(num_classes, activation='sigmoid')  #To get individual prabability of each class
])



In [96]:

model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])


In [97]:
model.summary()

In [98]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 37ms/step - accuracy: 0.3856 - loss: 1.7126 - val_accuracy: 0.6051 - val_loss: 1.0435
Epoch 2/10
[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 36ms/step - accuracy: 0.6547 - loss: 1.0007 - val_accuracy: 0.6941 - val_loss: 0.8684
Epoch 3/10
[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 36ms/step - accuracy: 0.7106 - loss: 0.8607 - val_accuracy: 0.7091 - val_loss: 0.8323
Epoch 4/10
[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 36ms/step - accuracy: 0.7437 - loss: 0.7668 - val_accuracy: 0.7146 - val_loss: 0.8147
Epoch 5/10
[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 36ms/step - accuracy: 0.7694 - loss: 0.7027 - val_accuracy: 0.7140 - val_loss: 0.8197
Epoch 6/10
[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 37ms/step - accuracy: 0.7890 - loss: 0.6435 - val_accuracy: 0.7196 - val_loss: 0.8170
Epoc

<keras.src.callbacks.history.History at 0x1554614bdd0>

In [46]:
model.save("lstm2m.h5")



In [99]:
train_loss, train_accuracy = model.evaluate(X_train, y_train)

[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.8965 - loss: 0.3184


In [100]:
y_pred=model.predict(X_val)

[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step


In [49]:
y_pred

array([[0.7751326 , 0.53785515, 0.8871247 , ..., 0.4171374 , 0.33824575,
        0.1636273 ],
       [0.42588374, 0.27947423, 0.431554  , ..., 0.2363996 , 0.26796553,
        0.9998168 ],
       [0.9500773 , 0.6303022 , 0.7215893 , ..., 0.18500374, 0.3688983 ,
        0.1361198 ],
       ...,
       [0.9997814 , 0.2764934 , 0.2269246 , ..., 0.10484591, 0.28358397,
        0.23788795],
       [0.98022896, 0.58107626, 0.58484685, ..., 0.16338006, 0.38149667,
        0.10920776],
       [0.16785504, 0.06991527, 0.286181  , ..., 0.5296452 , 0.5753131 ,
        0.2855981 ]], dtype=float32)

#Evaluating predicted values

In [50]:
test_label=[]
for val in y_val:
    
    classes = np.argmax(val)
    test_label.append(classes)

In [51]:
train_label=[]
for val in y_pred:
    
    classes = np.argmax(val)
    train_label.append(classes)

In [52]:
classes_list = label_encoder.classes_

In [53]:


report = classification_report(test_label,train_label,target_names=classes_list)

In [54]:
print(report)

              precision    recall  f1-score   support

       anger       0.83      0.86      0.85      3793
     disgust       0.66      0.55      0.60       781
        fear       0.53      0.61      0.57      2701
         joy       0.74      0.73      0.73      3330
     neutral       0.77      0.71      0.74      2937
     sadness       0.64      0.65      0.64      2563
    surprise       0.78      0.72      0.75      2497

    accuracy                           0.72     18602
   macro avg       0.71      0.69      0.70     18602
weighted avg       0.72      0.72      0.72     18602



In [104]:
import pandas as pd 
import numpy as np
from nltk.corpus import stopwords
import emoji
import re
from nltk.tokenize import word_tokenize
from langdetect import detect
import nltk
from sklearn import preprocessing
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
import contractions

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [105]:
# ['anger'-0 'disgust'-1 'fear'-2 'joy'-3 'neutral'-4 'sadness'-5 'surprise'-6]
lemmatizer = WordNetLemmatizer()
textc = "she was not very happy "

stopwords = stopwords.words('english')
swdr_neg = set(stopwords) - set(['no','nor','not','never','against'])

def remove_emojis(text):
    return emoji.replace_emoji(text, replace="")

def remove_num(comm):
    return re.sub(r"[0-9]", "", comm).strip()

def remove_tags(comm):
    return re.sub(r'[@#]\S+', '', comm).strip()

def remove_splch(comm):
    return re.sub(r'[^A-Za-z0-9\s]', ' ', comm)



def remove_stp(txt):
    st=""
    for val in list(txt.split(" ")):
        if val not in swdr_neg:
            st=st+val+" "
    return st

def preprcs_text(text):
    
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_words)


def preprocess_pipeline(text):
    text = contractions.fix(text)
    text = remove_emojis(text)
    text = remove_num(text)
    text = remove_tags(text)
    text = remove_splch(text)
    text = remove_stp(text)
    text = preprcs_text(text)
    return text

textc = preprocess_pipeline(textc)

seq= tokenizer.texts_to_sequences([textc])

padded_comm = pad_sequences(seq, maxlen=max_len, padding='post')

print(padded_comm)


comm_em = model.predict(padded_comm)

print(comm_em)
print(np.argmax(comm_em))

    

[[ 2 79  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[[0.34275177 0.10202202 0.76252735 0.37793076 0.8301382  0.34911248
  0.46998608]]
4


# Evaluating classes count for y_train

In [57]:
y_train
y_trained_classes = []
for val in y_train:
    maxy = np.argmax(val)
    y_trained_classes.append(maxy)

df_trained_class = pd.DataFrame(y_trained_classes,columns=["label"])
df_trained_class=pd.DataFrame(label_encoder.inverse_transform(df_trained_class['label']) ,columns=["label"])

In [58]:
y_train_pred = model.predict(X_train)

[1m2326/2326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step


In [59]:
y_trained_classes = []
for val in y_train_pred:
    maxy = np.argmax(val)
    y_trained_classes.append(maxy)

df_trained_pred_class = pd.DataFrame(y_trained_classes,columns=["label"])
df_trained_pred_class=pd.DataFrame(label_encoder.inverse_transform(df_trained_pred_class['label']) ,columns=["label"])

In [60]:
df_trained_class.value_counts()

label   
anger       14919
joy         13580
neutral     11561
fear        10834
sadness     10318
surprise    10123
disgust      3070
Name: count, dtype: int64

In [61]:
df_trained_pred_class.value_counts()

label   
anger       15418
joy         13316
fear        11782
neutral     11065
sadness     10524
surprise     9677
disgust      2623
Name: count, dtype: int64

#Evaluating Classes count for y_val

In [62]:
y_test_classes = []
for val in y_val:
    maxy = np.argmax(val)
    y_test_classes.append(maxy)

df_test_class = pd.DataFrame(y_test_classes,columns=["label"])
df_test_class=pd.DataFrame(label_encoder.inverse_transform(df_test_class['label']) ,columns=["label"])

In [63]:
y_test_pred_classes = []
for val in y_pred:
    maxy = np.argmax(val)
    y_test_pred_classes.append(maxy)

df_test_pred_class = pd.DataFrame(y_test_pred_classes,columns=["label"])
df_test_pred_class=pd.DataFrame(label_encoder.inverse_transform(df_test_pred_class['label']) ,columns=["label"])

In [64]:
df_test_class.value_counts()

label   
anger       3793
joy         3330
neutral     2937
fear        2701
sadness     2563
surprise    2497
disgust      781
Name: count, dtype: int64

In [65]:
df_test_pred_class.value_counts()

label   
anger       3922
joy         3295
fear        3097
neutral     2705
sadness     2624
surprise    2309
disgust      650
Name: count, dtype: int64