# DATA PREP

In [None]:
!pip3 install ktrain



In [None]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the MELD CSV files
# stored under MyDrive can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def _load_split(csv_path):
    """Read one MELD split CSV (UTF-8 encoded) into a DataFrame."""
    return pd.read_csv(csv_path, encoding='utf-8')


# The three MELD splits: train, test and dev.
data_train = _load_split('/content/drive/MyDrive/MELD/train/train_sent_emo.csv')
data_test = _load_split('/content/drive/MyDrive/MELD/test_sent_emo.csv')
data_dev = _load_split('/content/drive/MyDrive/MELD/dev_sent_emo.csv')

# BASELINE MODEL

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Utterance texts for each MELD split.
X_train = data_train.Utterance.to_list()
X_dev = data_dev.Utterance.to_list()
X_test = data_test.Utterance.to_list()

y_train = data_train.Emotion.to_list()
y_dev = data_dev.Emotion.to_list()
y_test = data_test.Emotion.to_list()

# Emotion names and their integer ids; the id of a class is its position in class_names.
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral', 'disgust', 'surprise']
emotion_to_numeric = {'joy': 0, 'sadness': 1, 'fear': 2, 'anger': 3, 'neutral': 4, 'disgust': 5, 'surprise': 6}
y_train = [emotion_to_numeric[label] for label in y_train]
y_dev = [emotion_to_numeric[label] for label in y_dev]
y_test = [emotion_to_numeric[label] for label in y_test]


def fit_and_evaluate_nb(eval_texts, eval_labels, max_features):
    """Fit TF-IDF + MultinomialNB on the training split and score one evaluation split.

    Args:
        eval_texts: list of utterance strings to evaluate on.
        eval_labels: list of integer class ids aligned with ``eval_texts``.
        max_features: vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        Tuple ``(vectorizer, classifier, predictions, accuracy, report)`` where
        ``report`` is the text produced by ``classification_report``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    train_tfidf = vectorizer.fit_transform(X_train)
    eval_tfidf = vectorizer.transform(eval_texts)

    classifier = MultinomialNB()
    classifier.fit(train_tfidf, y_train)
    predictions = classifier.predict(eval_tfidf)

    accuracy = accuracy_score(eval_labels, predictions)
    report = classification_report(
        eval_labels,
        predictions,
        # Explicit ids keep the rows aligned with class_names even when a class
        # is absent from both the labels and the predictions.
        labels=list(range(len(class_names))),
        target_names=class_names,
        # A class that is never predicted gets precision 0 instead of a
        # misleading 1.00 (which would also inflate the macro average).
        zero_division=0,
    )
    return vectorizer, classifier, predictions, accuracy, report


# Sweep the vocabulary size and keep the dev-set accuracy and report of every setting.
max_features_values = [100, 300, 500, 1000, 3000, 7000, 12000, 20000]
accuracy_scores = []
classification_reports = []

for max_features in max_features_values:
    _, _, _, accuracy, classification_report_str = fit_and_evaluate_nb(X_dev, y_dev, max_features)
    accuracy_scores.append(accuracy)
    classification_reports.append(classification_report_str)

# Model selection is done on the dev split only.
max_accuracy_idx = np.argmax(accuracy_scores)
best_max_features = max_features_values[max_accuracy_idx]
best_accuracy = accuracy_scores[max_accuracy_idx]

print("Best max_features value:", best_max_features)
print("Best accuracy:", best_accuracy)
print("\nClassification Report for Best max_features value:")
print(classification_reports[max_accuracy_idx])

# Refit with the selected vocabulary size and report once on the held-out test split.
(vectorizer, classifier, y_pred_test,
 accuracy_test, classification_report_test) = fit_and_evaluate_nb(X_test, y_test, best_max_features)

print("\nTest Accuracy:", accuracy_test)
print("\nClassification Report for Test Data:")
print(classification_report_test)



Best max_features value: 300
Best accuracy: 0.4580703336339044

Classification Report for Best max_features value:
              precision    recall  f1-score   support

         joy       0.48      0.09      0.15       163
     sadness       1.00      0.05      0.09       111
        fear       1.00      0.00      0.00        40
       anger       0.50      0.01      0.01       153
     neutral       0.45      0.97      0.61       470
     disgust       1.00      0.00      0.00        22
    surprise       0.58      0.21      0.31       150

    accuracy                           0.46      1109
   macro avg       0.72      0.19      0.17      1109
weighted avg       0.56      0.46      0.33      1109


Test Accuracy: 0.5191570881226054

Classification Report for Test Data:
              precision    recall  f1-score   support

         joy       0.65      0.11      0.19       402
     sadness       0.80      0.02      0.04       208
        fear       1.00      0.00      0.00        5

# PREPROCESSING

In [None]:
# nltk provides the stop-word list, lemmatizer and stemmer used by text_preprocess.
!pip install nltk
# NOTE(review): the PyPI package `wordnet` appears to be unrelated to NLTK's
# WordNet corpus (that corpus is fetched with nltk.download('wordnet')), and its
# install log shows it pinning colorama==0.3.9. Confirm this install is needed.
!pip install wordnet

Collecting wordnet
  Downloading wordnet-0.0.1b2.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting colorama==0.3.9 (from wordnet)
  Downloading colorama-0.3.9-py2.py3-none-any.whl (20 kB)
Building wheels for collected packages: wordnet
  Building wheel for wordnet (setup.py) ... [?25l[?25hdone
  Created wheel for wordnet: filename=wordnet-0.0.1b2-py3-none-any.whl size=10501 sha256=eba5f71584dc98d40af5785ad616372e3edb95d84d064696319e053e9dac8fd3
  Stored in directory: /root/.cache/pip/wheels/c0/a1/e8/4649c8712033dcdbd1e64a0fc75216a5d1769665852c36b4f9
Successfully built wordnet
Installing collected packages: colorama, wordnet
Successfully installed colorama-0.3.9 wordnet-0.0.1b2


In [None]:
import nltk
import wordnet  # NOTE(review): not referenced by any visible cell; confirm it is needed
# Download the NLTK 'stopwords' and 'wordnet' data packages.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

def text_preprocess(ds: pd.Series) -> pd.Series:
    """Normalise utterances into space-joined, stemmed content words.

    For each utterance: replace every non-letter character with a space,
    lower-case, split on whitespace, drop one-character tokens and English
    stop words, then lemmatize (WordNet) and stem (Porter) what is left.

    Args:
        ds: Series of utterance strings. Any index is accepted. Values that
            are not strings (e.g. NaN) are mapped to the empty string.

    Returns:
        A new Series with the same index as ``ds`` holding the cleaned text.
        ``ds`` itself is not modified.
    """
    stop_words = set(stopwords.words('english'))
    # Built once and reused for every utterance.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    def clean(utterance) -> str:
        if not isinstance(utterance, str):
            # Missing values carry no text.
            return ''

        # Digits and punctuation are removed here, so no separate handling
        # of numeric tokens is required afterwards.
        tokens = re.sub('[^a-zA-Z]', ' ', utterance).lower().split()

        # Stop words must be filtered on the surface form: after lemmatizing
        # and stemming, words such as "was" turn into "wa" and would no longer
        # match the stop-word list.
        tokens = [tok for tok in tokens if len(tok) > 1 and tok not in stop_words]

        tokens = [stemmer.stem(lemmatizer.lemmatize(tok)) for tok in tokens]
        return ' '.join(tokens)

    # map() works position-independently, so a non-default index is fine.
    return ds.map(clean)

# Work on copies: a plain assignment would only alias data_train / data_test,
# and writing the cleaned column would then overwrite the raw utterances that
# the non-preprocessed experiments rely on.
data_train_processed = data_train.copy()
data_test_processed = data_test.copy()
data_train_processed['Utterance'] = text_preprocess(data_train['Utterance'])
data_test_processed['Utterance'] = text_preprocess(data_test['Utterance'])

In [None]:
# Oversample the three rarest emotions (sampling with replacement) so that each
# of them reaches `desired_samples` rows in the augmented training frame.
fear_data = data_train[data_train['Emotion'] == 'fear']
disgust_data = data_train[data_train['Emotion'] == 'disgust']
sadness_data = data_train[data_train['Emotion'] == 'sadness']

desired_samples = 1100
# Fixed seed so the augmented training set is the same on every run.
oversample_seed = 42

# Clamp at zero: a class that already has enough rows needs no extra samples
# (a negative n would make DataFrame.sample raise).
add_fear = max(desired_samples - len(fear_data), 0)
add_disgust = max(desired_samples - len(disgust_data), 0)
add_sadness = max(desired_samples - len(sadness_data), 0)

additional_fear_utterances = fear_data.sample(n=add_fear, replace=True, random_state=oversample_seed)
additional_disgust_utterances = disgust_data.sample(n=add_disgust, replace=True, random_state=oversample_seed)
additional_sadness_utterances = sadness_data.sample(n=add_sadness, replace=True, random_state=oversample_seed)

augmented_data_train = pd.concat([data_train, additional_fear_utterances,
                                  additional_disgust_utterances, additional_sadness_utterances],
                                 ignore_index=True)

print(augmented_data_train['Emotion'].value_counts())

Emotion
neutral     4710
joy         1743
surprise    1205
anger       1109
fear        1100
sadness     1100
disgust     1100
Name: count, dtype: int64


In [None]:
# Texts and string emotion labels for the regular (non-augmented) experiment.
X_train = data_train.Utterance.to_list()
X_test = data_test.Utterance.to_list()
X_val = data_dev.Utterance.to_list()

y_train = data_train.Emotion.to_list()
y_test = data_test.Emotion.to_list()
y_val = data_dev.Emotion.to_list()

data = data_train

# Report each split under its own name (the dev split is the validation set).
print('size of training set: %s' % (len(data_train['Utterance'])))
print('size of validation set: %s' % (len(data_dev['Utterance'])))
print('size of test set: %s' % (len(data_test['Utterance'])))
print(data.Emotion.value_counts())

data.head(5)

size of training set: 9989
size of validation set: 2610
Emotion
neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: count, dtype: int64


Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also wa point person compani transit kl gr system,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,must hand full,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,let talk littl bit duti,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,duti right,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"


# REGULAR BERT

In [None]:
# Run ktrain's BERT preprocessing on the train and test texts. Note that
# y_train / y_test are rebound to the preprocessed labels returned by ktrain.
train_pair, test_pair, preproc = text.texts_from_array(
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
(x_train, y_train), (x_test, y_test) = train_pair, test_pair

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en




task: text classification


In [None]:
# Build a BERT text classifier from the preprocessed training data and its preprocessor.
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350




done.


In [None]:
# Wrap the model in a ktrain learner with batch size 12.
# NOTE(review): the test split is passed as val_data; the dev split
# (X_val / y_val) is prepared earlier but never used. Confirm this is intended.
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=12)

In [None]:
# Train with the one-cycle policy: maximum learning rate 2e-5 for 3 epochs.
learner.fit_onecycle(2e-5, 3)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
 38/833 [>.............................] - ETA: 3:21:27 - loss: 2.2215 - accuracy: 0.1491

In [None]:
# Per-class report on the test split. The label names are taken from the
# preprocessor so they follow the class order ktrain actually encoded; with
# string labels that order may differ from the hand-written class_names list,
# which would attach the wrong name to each report row.
learner.validate(val_data=(x_test, y_test), class_names=preproc.get_classes())

In [None]:
# Bundle the trained model with its preprocessor so raw strings can be classified.
predictor = ktrain.get_predictor(learner.model, preproc)
# Display the predictor's class labels.
predictor.get_classes()

In [None]:
import time

message = 'I just broke my knee'

# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval (time.time() can jump with the wall clock).
start_time = time.perf_counter()
prediction = predictor.predict(message)

# The number in parentheses is the prediction latency in seconds.
print('predicted: {} ({:.2f})'.format(prediction, (time.perf_counter() - start_time)))

```
        82/82 [==============================] - 84s 969ms/step
                      precision    recall  f1-score   support

                joy       0.57      0.60      0.58       402
            sadness       0.41      0.27      0.33       208
               fear       0.22      0.16      0.19        50
              anger       0.49      0.43      0.46       345
            neutral       0.75      0.81      0.78      1256
            disgust       0.34      0.16      0.22        68
           surprise       0.53      0.62      0.57       281

           accuracy                           0.63      2610
          macro avg       0.48      0.44      0.45      2610
       weighted avg       0.62      0.63      0.62      2610

         array([[ 235,   10,    2,   24,   98,    0,   33],
                [  16,   59,    9,   18,   88,    4,   14],
                [   3,    5,    7,    9,   19,    0,    7],
                [  43,   16,    6,  148,   76,    5,   51],
                [  73,   49,   14,   40, 1027,    3,   50],
                [   3,    6,    2,   12,   29,    9,    7],
                [  31,    9,    2,   31,   42,    1,  165]])
```

# RANDOM SAMPLING BERT

In [None]:
# Texts and string emotion labels for the oversampled experiment: training uses
# the augmented frame, test and dev are unchanged.
X_train = augmented_data_train.Utterance.to_list()
X_test = data_test.Utterance.to_list()
X_val = data_dev.Utterance.to_list()

y_train = augmented_data_train.Emotion.to_list()
y_test = data_test.Emotion.to_list()
y_val = data_dev.Emotion.to_list()

# Summarise the frame that is actually used for training in this section.
data = augmented_data_train

print('size of training set: %s' % (len(augmented_data_train['Utterance'])))
print('size of validation set: %s' % (len(data_dev['Utterance'])))
print('size of test set: %s' % (len(data_test['Utterance'])))
print(data.Emotion.value_counts())

data.head(5)

In [None]:
# Run ktrain's BERT preprocessing on the oversampled train texts and the test
# texts. y_train / y_test are rebound to the preprocessed labels returned by ktrain.
train_pair, test_pair, preproc2 = text.texts_from_array(
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
(x_train, y_train), (x_test, y_test) = train_pair, test_pair

In [None]:
# Build a second BERT classifier, this time for the oversampled training data.
model2 = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc2)

In [None]:
# Wrap model2 in a ktrain learner with batch size 12.
# NOTE(review): the test split is passed as val_data; the dev split is not used.
learner2 = ktrain.get_learner(model2, train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=12)

In [None]:
# Train with the one-cycle policy: maximum learning rate 2e-5 for 3 epochs.
learner2.fit_onecycle(2e-5, 3)

In [None]:
# Per-class report on the test split. The label names are taken from the
# preprocessor so they follow the class order ktrain actually encoded; with
# string labels that order may differ from the hand-written class_names list.
learner2.validate(val_data=(x_test, y_test), class_names=preproc2.get_classes())

```
		82/82 [==============================] - 31s 262ms/step
		              precision    recall  f1-score   support

		         joy       0.58      0.58      0.58       402
		     sadness       0.38      0.28      0.33       208
		        fear       0.17      0.14      0.15        50
		       anger       0.52      0.43      0.47       345
		     neutral       0.74      0.82      0.78      1256
		     disgust       0.41      0.13      0.20        68
		    surprise       0.50      0.59      0.54       281

		    accuracy                           0.63      2610
		   macro avg       0.47      0.42      0.44      2610
		weighted avg       0.62      0.63      0.62      2610

		array([[ 235,   10,    2,   24,   98,    0,   33],
		       [  16,   59,    9,   18,   88,    4,   14],
		       [   3,    5,    7,    9,   19,    0,    7],
		       [  43,   16,    6,  148,   76,    5,   51],
		       [  73,   49,   14,   40, 1027,    3,   50],
		       [   3,    6,    2,   12,   29,    9,    7],
		       [  31,    9,    2,   31,   42,    1,  165]])

```



In [None]:
# Bundle the oversampled-data model with its preprocessor for raw-string prediction.
predictor2 = ktrain.get_predictor(learner2.model, preproc2)
# Display the predictor's class labels.
predictor2.get_classes()

In [None]:
import time

message = 'I just broke my knee'

# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval (time.time() can jump with the wall clock).
start_time = time.perf_counter()
prediction = predictor2.predict(message)

# The number in parentheses is the prediction latency in seconds.
print('predicted: {} ({:.2f})'.format(prediction, (time.perf_counter() - start_time)))

# PREPROCESSED BERT

In [None]:
# Texts and string emotion labels for the preprocessed-text experiment.
X_train = data_train_processed.Utterance.to_list()
X_test = data_test_processed.Utterance.to_list()
# The dev texts get the same cleaning as train/test so all splits are comparable.
X_val = text_preprocess(data_dev['Utterance']).to_list()

y_train = data_train_processed.Emotion.to_list()
y_test = data_test_processed.Emotion.to_list()
y_val = data_dev.Emotion.to_list()

# Summarise the frame that is actually used for training in this section.
data = data_train_processed

print('size of training set: %s' % (len(data_train_processed['Utterance'])))
print('size of validation set: %s' % (len(data_dev['Utterance'])))
print('size of test set: %s' % (len(data_test_processed['Utterance'])))
print(data.Emotion.value_counts())

data.head(5)

In [None]:
# Run ktrain's BERT preprocessing on the cleaned train and test texts.
# y_train / y_test are rebound to the preprocessed labels returned by ktrain.
train_pair, test_pair, preproc3 = text.texts_from_array(
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
(x_train, y_train), (x_test, y_test) = train_pair, test_pair

In [None]:
# Build a third BERT classifier, for the cleaned (text_preprocess) training data.
model3 = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc3)

In [None]:
# Wrap model3 in a ktrain learner with batch size 12.
# NOTE(review): the test split is passed as val_data; the dev split is not used.
learner3 = ktrain.get_learner(model3, train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=12)

In [None]:
# Train with the one-cycle policy: maximum learning rate 2e-5 for 3 epochs.
learner3.fit_onecycle(2e-5, 3)

In [None]:
# Per-class report on the test split. The label names are taken from the
# preprocessor so they follow the class order ktrain actually encoded; with
# string labels that order may differ from the hand-written class_names list.
learner3.validate(val_data=(x_test, y_test), class_names=preproc3.get_classes())

In [None]:
# Bundle the cleaned-text model with its preprocessor for raw-string prediction.
predictor3 = ktrain.get_predictor(learner3.model, preproc3)
# Display the predictor's class labels.
predictor3.get_classes()

In [None]:
import time

message = 'I just broke my knee'

# The model in this section was trained on text_preprocess output, so the
# message is cleaned the same way before it is classified.
message_processed = text_preprocess(pd.Series([message])).iloc[0]

# perf_counter() is monotonic, which makes it the right clock for an interval.
start_time = time.perf_counter()
# Query predictor3, the predictor built in this section (not the first model's).
prediction = predictor3.predict(message_processed)

# The number in parentheses is the prediction latency in seconds.
print('predicted: {} ({:.2f})'.format(prediction, (time.perf_counter() - start_time)))

```
    82/82 [==============================] - 25s 260ms/step
		              precision    recall  f1-score   support

		         joy       0.52      0.53      0.53       402
		     sadness       0.33      0.07      0.12       208
		        fear       0.06      0.20      0.09        50
		       anger       0.60      0.17      0.26       345
		     neutral       0.67      0.82      0.74      1256
		     disgust       0.48      0.15      0.22        68
		    surprise       0.43      0.48      0.45       281

		    accuracy                           0.56      2610
		   macro avg       0.44      0.35      0.34      2610
		weighted avg       0.57      0.56      0.54      2610

		array([[ 215,    4,    8,    8,  144,    1,   22],
		       [  15,   15,   24,    7,  116,    2,   29],
		       [   5,    1,   10,    2,   27,    0,    5],
		       [  58,    9,   30,   58,  132,    4,   54],
		       [  84,    8,   54,   14, 1031,    4,   61],
		       [   3,    4,    7,    3,   31,   10,   10],
		       [  34,    4,   40,    5,   63,    0,  135]])
```

# SAVING THE MODEL


In [None]:
# let's save the predictor for later use
# NOTE(review): this saves `predictor`, i.e. the model from the "REGULAR BERT"
# section, not predictor2 or predictor3. Confirm that is the model to keep.
predictor.save("models/bert_model")