In [2]:
import pandas as pd 
import numpy as np
import nltk
from tensorflow.keras.utils import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from tensorflow.keras.layers import Embedding , LSTM , Dense , SimpleRNN , Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import string
# from textblob import TextBlob
import joblib

In [4]:
len(open("train.txt","r").readlines())

16000

In [5]:
len(open("test.txt","r").readlines())

2000

In [7]:
open("val.txt","r").readlines()[0]

'im feeling quite sad and sorry for myself but ill snap out of it soon;sadness\n'

In [9]:
## reading the data from the text files
train = open("train.txt","r").readlines()
test = open("test.txt","r").readlines()
val = open("val.txt","r").readlines()

In [10]:
full_data = train + test + val
len(full_data)

20000

In [11]:
## creating independent and dependent variables
x = []
y = []
for item in full_data:
    text , label = item.split(";")
    label = label.replace("\n","")
    x.append(text)
    y.append(label)

In [12]:
## text cleaning steps
# 1 - doing loweercase
# 2 - perform tokenization
# 3 - remove stop words
# 4 - perform stemming or lemma
# then wee will get clean data


def text_cleaning(sentences):
    clean_text = []
    for sent in sentences:
        sent = sent.lower()
        sent = nltk.word_tokenize(sent)
        sent = [word for word in sent if word not in stopwords.words("english")]
        ps = PorterStemmer()
        sent = [ps.stem(word) for word in sent]
        clean_text.append(" ".join(sent))
    return clean_text

In [13]:
cleaned_data = text_cleaning(x)

In [14]:
x[0]

'i didnt feel humiliated'

In [15]:
cleaned_data[0]

'didnt feel humili'

In [16]:
## tokenization 
tokenizer = Tokenizer(
    oov_token = "<nothing>"
)
tokenizer.fit_on_texts(cleaned_data)

In [17]:
tokenizer.word_index
joblib.dump(tokenizer.word_index,"tokenizer.lb")

['tokenizer.lb']

In [18]:
## to check the no of counts of words in tokenizer
tokenizer.word_counts

OrderedDict([('didnt', 334),
             ('feel', 21204),
             ('humili', 69),
             ('go', 1101),
             ('hopeless', 81),
             ('damn', 54),
             ('hope', 320),
             ('around', 382),
             ('someon', 335),
             ('care', 254),
             ('awak', 22),
             ('im', 3055),
             ('grab', 22),
             ('minut', 79),
             ('post', 234),
             ('greedi', 79),
             ('wrong', 152),
             ('ever', 260),
             ('nostalg', 63),
             ('fireplac', 3),
             ('know', 1192),
             ('still', 743),
             ('properti', 5),
             ('grouchi', 35),
             ('ive', 723),
             ('littl', 932),
             ('burden', 95),
             ('late', 167),
             ('wasnt', 119),
             ('sure', 310),
             ('take', 403),
             ('milligram', 1),
             ('time', 1215),
             ('recommend', 19),
             ('amoun

In [19]:
## to check the total no of sent 
tokenizer.document_count

20000

In [20]:
## tokenize the input sent as the seq
sequences = tokenizer.texts_to_sequences(cleaned_data)
sequences[0:5]

[[61, 2, 522],
 [10, 2, 419, 682, 67, 50, 60, 96, 1229],
 [4, 1230, 431, 107, 2, 432, 192],
 [92, 2, 592, 3696, 7, 21, 2844],
 [2, 918]]

In [21]:
## to equalise the lenghs of the input 
sequences = pad_sequences(sequences, maxlen=35,padding="post")
sequences[0:3]

array([[  61,    2,  522,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  10,    2,  419,  682,   67,   50,   60,   96, 1229,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   4, 1230,  431,  107,    2,  432,  192,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]])

In [22]:
## to check the unique labels and no of class 
unique_label = list(pd.Series(np.array(y)).unique())
no_of_class = len(unique_label)
unique_label , no_of_class

(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'], 6)

In [23]:
## label encoding 
label_dist ={
    'sadness':0 , 'anger':1,'love':2,'surprise':3,'fear':4,'joy':5
}
def label_encoder(labels):
    label = []
    for lab in labels:
        label.append(label_dist[lab])
    label = np.array(label)
    return label


In [24]:
label = label_encoder(y)
label[0:5]

array([0, 0, 1, 2, 1])

In [25]:
## Splittng the data into train and test 

X_train,X_test,y_train,y_test =train_test_split(sequences,label,test_size=0.2,random_state=42)



In [26]:
model = Sequential()
model.add(SimpleRNN(32,input_shape = (35,1),return_sequences=False))
model.add(Dense(no_of_class, activation="softmax"))

model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy",
    metrics = ["accuracy"]
)
model.summary()

  super().__init__(**kwargs)


In [27]:
model.fit(X_train, y_train,epochs=5,batch_size=32)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.3218 - loss: 1.6065
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3292 - loss: 1.5824
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3329 - loss: 1.5768
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3276 - loss: 1.5770
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3412 - loss: 1.5758


<keras.src.callbacks.history.History at 0x20c51f38950>

In [28]:
## expanding the dimensions 
np.expand_dims(sequences[0], axis=0)

array([[ 61,   2, 522,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [29]:
pred = model.predict(np.expand_dims(sequences[0], axis=0))
pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step


array([[0.28309166, 0.15092625, 0.0785109 , 0.03792012, 0.12433436,
        0.32521665]], dtype=float32)