In [24]:
!pip install nlp



In [25]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nlp
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Load the three dataset splits from CSV files in the working directory.
train = pd.read_csv('./training.csv')
validation = pd.read_csv('./validation.csv')
test = pd.read_csv('./test.csv')

In [27]:
# Split the training frame into its input texts and its integer labels.
train_text, train_labels = train['text'], train['label']

In [28]:
import plotly.express as px

# Bar chart of how many training rows carry each label (rarest class first).
px.bar(
    train_labels.value_counts(ascending=True),
    template='plotly_white',
)

In [29]:
# Peek at the first training example: its raw text and its integer label.
train_text[0], train_labels[0]

('i didnt feel humiliated', 0)

In [30]:
# Word-level tokenizer capped at 10000 word indices; words outside the kept
# vocabulary are mapped to the '<UNK>' out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token='<UNK>',
    split=' ',
    char_level=False,
)
# Build the vocabulary from the training texts only.
tokenizer.fit_on_texts(train_text)

In [31]:
# Show the raw text of the first training example.
train_text[0]

'i didnt feel humiliated'

In [32]:
# Encode the first training sentence (wrapped in a one-element list) into token ids.
tokenizer.texts_to_sequences([train_text[0]])

[[2, 139, 3, 679]]

In [33]:
# Default length every encoded sequence is padded/truncated to.
maxlen=50
def get_sequences(tokenizer, train_text, max_len=None):
    """Encode texts as fixed-length integer sequences.

    Each text is converted to token ids with ``tokenizer`` and then padded
    with zeros at the end, or truncated at the end, to a common length.

    Args:
        tokenizer: A fitted tokenizer exposing ``texts_to_sequences``.
        train_text: Iterable of strings to encode. Despite the name, this is
            used for any split (training, validation or test texts).
        max_len: Target sequence length. When ``None`` (the default) the
            module-level ``maxlen`` is used, looked up at call time, so
            existing two-argument calls behave exactly as before.

    Returns:
        The array produced by ``pad_sequences``: one row per input text,
        each row ``max_len`` token ids long.
    """
    if max_len is None:
        max_len = maxlen
    sequences = tokenizer.texts_to_sequences(train_text)
    # 'post' for both options: keep the start of each text, pad at the end.
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

In [34]:
# Encode and pad every training text with the fitted tokenizer.
padded_train_seq = get_sequences(tokenizer, train_text)

In [35]:
# Inspect the first padded sequence: token ids followed by zero padding.
padded_train_seq[0]

array([  2, 139,   3, 679,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [36]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Fixed seed so the randomly drawn/duplicated rows are the same on every run;
# without it the balanced subset changes each time the cell is executed.
RESAMPLE_SEED = 42

class_counts = Counter(train_labels)
print("Before undersampling:", class_counts)

# Set the target number of samples per class
target_samples = 1000

# Classes with fewer rows than the target are oversampled up to it ...
classes_to_resample = {
    cls: target_samples
    for cls, count in class_counts.items()
    if count < target_samples
}
# ... and classes with at least the target are undersampled down to it.
classes_to_keep = {
    cls: target_samples
    for cls, count in class_counts.items()
    if count >= target_samples
}
print(classes_to_resample)
print(classes_to_keep)

# Oversample minority classes
oversampler = RandomOverSampler(
    sampling_strategy=classes_to_resample,
    random_state=RESAMPLE_SEED,
)

# Undersample majority classes
undersampler = RandomUnderSampler(
    sampling_strategy=classes_to_keep,
    random_state=RESAMPLE_SEED,
)

# Step 1: bring the minority classes up to the target size.
train_text_resampled, train_labels_resampled = oversampler.fit_resample(
    padded_train_seq, train_labels
)
print(Counter(train_labels_resampled))

# Step 2: cut the majority classes down to the target size.
train_text_resampled, train_labels_resampled = undersampler.fit_resample(
    train_text_resampled, train_labels_resampled
)

# Check the class distribution after resampling
print(Counter(train_labels_resampled))


Before undersampling: Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 572})
{5: 1000}
{0: 1000, 3: 1000, 2: 1000, 4: 1000, 1: 1000}
Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 1000})
Counter({0: 1000, 1: 1000, 2: 1000, 3: 1000, 4: 1000, 5: 1000})


In [57]:
# Print the first resampled example first, so that the chart is the cell's
# final expression and is actually rendered (a figure created on a non-final
# line is built and then discarded).
print(train_text_resampled[0], train_labels_resampled[0])
px.bar(train_labels_resampled.value_counts(ascending=True),template='plotly_white')

[  2 269   5 383   8 882 393 179 246   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0] 0


In [38]:
# Show the resampled, padded token-id sequences.
print(train_text_resampled)

[[  2 269   5 ...   0   0   0]
 [  2  24 136 ...   0   0   0]
 [  2 231   3 ...   0   0   0]
 ...
 [  2  72   3 ...   0   0   0]
 [  2  47  15 ...   0   0   0]
 [  2 323   6 ...   0   0   0]]


In [39]:
# Show the labels that go with the resampled sequences.
print(train_labels_resampled)

0       0
1       0
2       0
3       0
4       0
       ..
5995    5
5996    5
5997    5
5998    5
5999    5
Name: label, Length: 6000, dtype: int64


In [40]:
# Emotion names, ordered so that each name's position is its integer label.
classes = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
# Forward (name -> id) and reverse (id -> name) lookups.
class_to_index = {name: idx for idx, name in enumerate(classes)}
index_to_class = {idx: name for name, idx in class_to_index.items()}


def names_to_ids(labels):
    """Map an iterable of class names to a NumPy array of integer ids.

    Names missing from ``class_to_index`` map to ``None`` (``dict.get``).
    """
    ids = []
    for label in labels:
        ids.append(class_to_index.get(label))
    return np.array(ids)

In [41]:
# Show the ordered list of emotion names.
print(classes)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [42]:
# Show the name -> integer id mapping.
class_to_index

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [43]:
# Show the integer id -> name mapping.
index_to_class

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [44]:
# Show the label of the first resampled example.
print(train_labels_resampled[0])

0


In [45]:
# Pull the validation texts and labels, then encode the texts with the same
# fitted tokenizer and padding used for the training set.
val_text, val_labels = validation['text'], validation['label']
val_seq = get_sequences(tokenizer, val_text)
print(val_seq)

[[ 17   8 157 ...   0   0   0]
 [  2   3  14 ...   0   0   0]
 [  2   3  14 ...   0   0   0]
 ...
 [  2   3  79 ...   0   0   0]
 [  2 395   3 ...   0   0   0]
 [  2   3  14 ...   0   0   0]]


In [46]:
import plotly.express as px

# Bar chart of how many validation rows carry each label (rarest class first).
# The figure must be the cell's final expression to be rendered; a trailing
# `print()` after it caused the chart to be built and silently discarded.
px.bar(val_labels.value_counts(ascending=True),template='plotly_white')

In [59]:
# Accuracy metric for integer (sparse) class labels.
acc = tf.keras.metrics.SparseCategoricalAccuracy(
    name="accuracy", dtype=None
)

# Embedding -> two stacked bidirectional LSTMs -> 6-way softmax.
model = tf.keras.models.Sequential([
    # 10000 matches the `num_words` the tokenizer was created with.
    tf.keras.layers.Embedding(10000, 16, input_length=maxlen),
    # First recurrent layer returns the full sequence so the second can consume it.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    # One output unit per emotion class.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    metrics=[acc]
)
model.summary()

# Trains on the full (imbalanced) training set. To train on the balanced
# subset instead, pass `train_text_resampled, train_labels_resampled`.
h = model.fit(
    padded_train_seq, train_labels,
    batch_size=20,
    validation_data=(val_seq, val_labels),
    epochs=30,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0.01,
            patience=5,
            # Without this the model keeps the weights from the last epoch
            # run, i.e. up to `patience` epochs past the best validation loss.
            restore_best_weights=True,
        )
    ],
)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 16)            160000    
                                                                 
 bidirectional_12 (Bidirect  (None, 50, 40)            5920      
 ional)                                                          
                                                                 
 bidirectional_13 (Bidirect  (None, 40)                9760      
 ional)                                                          
                                                                 
 dense_6 (Dense)             (None, 6)                 246       
                                                                 
Total params: 175926 (687.21 KB)
Trainable params: 175926 (687.21 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/30
Epo

KeyboardInterrupt: 

In [52]:
# Pull the test texts and labels, then encode the texts with the same fitted
# tokenizer and padding used for the training set.
test_text, test_labels = test['text'], test['label']
test_seq = get_sequences(tokenizer, test_text)

print(test_seq)

# Evaluate on the held-out test split; the result is the cell's output.
model.evaluate(test_seq, test_labels)

[[ 17   8 203 ...   0   0   0]
 [ 17   1  11 ...   0   0   0]
 [  2 145  80 ...   0   0   0]
 ...
 [  2   3   9 ...   0   0   0]
 [ 17   8  38 ...   0   0   0]
 [  2   3  36 ...   0   0   0]]


[0.5716076493263245, 0.8429999947547913]

In [49]:
# Persist the trained model as an HDF5 file in the working directory.
# NOTE(review): Keras warns that HDF5 is a legacy format and recommends the
# native `.keras` format; switching changes the artifact path, so confirm that
# nothing downstream loads `emotion_model.h5` before changing it.
model.save("./emotion_model.h5")


You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.

