In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import json
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

# **Introducing the GoEmotion dataset and pre-processing to make it readily available for model training**

In [2]:
df_train = pd.read_csv("/kaggle/input/goemotions/data/train.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])
df_test = pd.read_csv("/kaggle/input/goemotions/data/test.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])

In [3]:
df_train

Unnamed: 0,Text,Class,ID
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj
...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,edsb738
43406,Always thought that was funny but is it a refe...,6,ee7fdou
43407,What are you talking about? Anything bad that ...,3,efgbhks
43408,"More like a baptism, with sexy results!",13,ed1naf8


In [4]:
df_train['List of classes'] = df_train['Class'].apply(lambda x: x.split(','))
df_train['Len of classes'] = df_train['List of classes'].apply(lambda x: len(x))
df_test['List of classes'] = df_test['Class'].apply(lambda x: x.split(','))
df_test['Len of classes'] = df_test['List of classes'].apply(lambda x: len(x))

### **Mapping all annotated emotion set with core emotions using Ekman Mapping**

In [5]:
with open('../input/goemotions/data/ekman_mapping.json') as file:
    ekman_mapping = json.load(file)

In [6]:
ekman_mapping

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity']}

In [7]:
emotion_file = open("/kaggle/input/goemotions/data/emotions.txt", "r")
emotion_list = emotion_file.read()
emotion_list = emotion_list.split("\n")
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [8]:
def idx2class(idx_list):
    arr = []
    for i in idx_list:
        arr.append(emotion_list[int(i)])
    return arr

In [9]:
df_train['Emotions'] = df_train['List of classes'].apply(idx2class)
df_test['Emotions'] = df_test['List of classes'].apply(idx2class)

In [10]:
def EmotionMapping(emotion_list):
    map_list = []
    
    for i in emotion_list:
        if i in ekman_mapping['anger']:
            map_list.append('anger')
        if i in ekman_mapping['disgust']:
            map_list.append('disgust')
        if i in ekman_mapping['fear']:
            map_list.append('fear')
        if i in ekman_mapping['joy']:
            map_list.append('joy')
        if i in ekman_mapping['sadness']:
            map_list.append('sadness')
        if i in ekman_mapping['surprise']:
            map_list.append('surprise')
        if i == 'neutral':
            map_list.append('neutral')
            
    return map_list

In [11]:
df_train['Mapped Emotions'] = df_train['Emotions'].apply(EmotionMapping)
df_test['Mapped Emotions'] = df_test['Emotions'].apply(EmotionMapping)

In [12]:
df_train['anger'] = np.zeros((len(df_train),1))
df_train['disgust'] = np.zeros((len(df_train),1))
df_train['fear'] = np.zeros((len(df_train),1))
df_train['joy'] = np.zeros((len(df_train),1))
df_train['sadness'] = np.zeros((len(df_train),1))
df_train['surprise'] = np.zeros((len(df_train),1))
df_train['neutral'] = np.zeros((len(df_train),1))

df_test['anger'] = np.zeros((len(df_test),1))
df_test['disgust'] = np.zeros((len(df_test),1))
df_test['fear'] = np.zeros((len(df_test),1))
df_test['joy'] = np.zeros((len(df_test),1))
df_test['sadness'] = np.zeros((len(df_test),1))
df_test['surprise'] = np.zeros((len(df_test),1))
df_test['neutral'] = np.zeros((len(df_test),1))

In [13]:
for i in ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise','neutral']:
    df_train[i] = df_train['Mapped Emotions'].apply(lambda x: 1 if i in x else 0)
    df_test[i] = df_test['Mapped Emotions'].apply(lambda x: 1 if i in x else 0)

In [14]:
df_train.head()

Unnamed: 0,Text,Class,ID,List of classes,Len of classes,Emotions,Mapped Emotions,anger,disgust,fear,joy,sadness,surprise,neutral
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger],[anger],1,0,0,0,0,0,0
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear],[fear],0,0,1,0,0,0,0
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance],[anger],1,0,0,0,0,0,0


As observed in id[4], annoyance is mapped to a core emotion-anger

In [15]:
df_train.drop(['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions'], axis=1, inplace=True)
df_test.drop(['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions'], axis=1, inplace=True)

# **Pre-processing**

Till no tokenization

In [16]:
import nltk

nltk.download('punkt')  
nltk.download('stopwords')  
nltk.download('wordnet') 

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

#### Removing stopwords, punctuation and performing stemming

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess(sentence):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Remove non-alphabetic characters
    sentence = re.sub('[^A-Za-z]', ' ', sentence)
    # Convert to lowercase and split into words
    words = sentence.lower().split()
    # Remove stopwords and apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

def text_preprocessing_pipeline(text):
    '''Cleaning and parsing the text.'''
    text = preprocess(text)
    return text

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
df_train['Text'] = df_train['Text'].apply(lambda x: preprocess(x))

In [20]:
df_test["Text"] = df_test["Text"].apply(text_preprocessing_pipeline)

In [21]:
pd.Series(df_train["Text"]).str.split().str.len().describe(percentiles=[0.05, 0.97])

count    43410.000000
mean         6.446487
std          3.453379
min          0.000000
5%           2.000000
50%          6.000000
97%         13.000000
max         33.000000
Name: Text, dtype: float64

In [22]:
df_train.head()

Unnamed: 0,Text,ID,anger,disgust,fear,joy,sadness,surprise,neutral
0,favourite food anything cook,eebbqej,0,0,0,0,0,0,1
1,everyone think he laugh screwing people instea...,ed00q6i,0,0,0,0,0,0,1
2,fuck bayless isoing,eezlygj,1,0,0,0,0,0,0
3,make feel threatened,ed7ypvh,0,0,1,0,0,0,0
4,dirty southern wanker,ed0bdzj,1,0,0,0,0,0,0


# **Model Training**

#### Using a pre-trained DistilRoBERTa-base model and fine-tuning it for emotion recognition

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "michellejieli/emotion_text_classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [24]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [25]:
train_dataset

Dataset({
    features: ['Text', 'ID', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral'],
    num_rows: 43410
})

#### Labelling the emotion column to a numeric format

In [26]:
label_mapping = {
    "anger": 0,
    "disgust": 1,
    "fear": 2,
    "joy": 3,
    "neutral": 4,
    "sadness": 5,
    "surprise": 6
}

def get_labels(batch):
    labels = []
    for i in range(len(batch['Text'])):  
        for emotion in label_mapping.keys():
            if batch[emotion][i] == 1:  
                labels.append(label_mapping[emotion])
                break
    return {'labels': labels}

train_dataset = train_dataset.map(
    get_labels,  
    batched=True,  
    batch_size=32  
)

test_dataset = test_dataset.map(
    get_labels,  
    batched=True,  
    batch_size=32  
)

print(train_dataset.column_names)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

['Text', 'ID', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral', 'labels']


#### Tokenization

In [None]:
def tokenize_function(examples):
    return tokenizer(examples['Text'], padding="max_length", truncation=True, max_length=128)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

#### Training

In [28]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",  
    evaluation_strategy="epoch",  
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,  
    num_train_epochs=6,  
    weight_decay=0.01,  
    logging_dir='./logs',  
    save_total_limit=2,  
    save_steps=500  
)



In [29]:
pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
import evaluate

metric = evaluate.load("accuracy")

In [31]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [32]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics
)

In [33]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112942422222558, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0429,1.014379,0.636263
2,0.9241,1.03712,0.64087
3,0.8296,1.032981,0.640501
4,0.6876,1.147758,0.622996
5,0.543,1.320946,0.614336
6,0.4385,1.442044,0.60586


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

TrainOutput(global_step=16284, training_loss=0.7583301097220503, metrics={'train_runtime': 4001.3136, 'train_samples_per_second': 65.094, 'train_steps_per_second': 4.07, 'total_flos': 8626383791447040.0, 'train_loss': 0.7583301097220503, 'epoch': 6.0})

#### Saving and Reloading model

In [34]:
output_dir = "./emotion_model"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

model = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

print("Model and tokenizer reloaded successfully.")

Model and tokenizer saved to ./emotion_model
Model and tokenizer reloaded successfully.


# **Evaluating accuracy and Testing using the test data**

In [35]:
import torch
from evaluate import load
metric = load("accuracy")

labels = [
    "anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"
]

predictions = []
references = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

with torch.no_grad():
    for sample in tokenized_test_dataset:
        inputs = {
            "input_ids": torch.tensor(sample["input_ids"]).unsqueeze(0).to(device),
            "attention_mask": torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device),
        }
        
        outputs = model(**inputs)
        logits = outputs.logits
        
        pred_class = torch.argmax(logits, dim=-1).item()
        predictions.append(pred_class)
        
        references.append(sample["labels"]) 

accuracy = metric.compute(predictions=predictions, references=references)
print(f"Accuracy: {accuracy['accuracy'] * 100:.2f}%")

for i in range(5):
    print(f"Text: {tokenizer.decode(tokenized_test_dataset[i]['input_ids'], skip_special_tokens=True)}")
    print(f"True Label: {labels[references[i]]}")
    print(f"Predicted Label: {labels[predictions[i]]}")
    print("-" * 40)

Accuracy: 60.59%
Text: really sorry situation although love name sapphira cirilla scarlett
True Label: surprise
Predicted Label: surprise
----------------------------------------
Text: wonderful awful
True Label: joy
Predicted Label: joy
----------------------------------------
Text: king fan good luck guy interesting game watch
True Label: joy
Predicted Label: joy
----------------------------------------
Text: know thank teaching something today
True Label: joy
Predicted Label: joy
----------------------------------------
Text: got bored haunting earth thousand year ultimately moved afterlife
True Label: sadness
Predicted Label: surprise
----------------------------------------


# **Response Generation**

#### Using a pre-trained GPT-2 model for adaptive response generation after classification 

In [131]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

gpt2_tokenizer = AutoTokenizer.from_pretrained("SuramyaPokharel/gpt2-response_gen")
gpt2_model = AutoModelForSeq2SeqLM.from_pretrained("SuramyaPokharel/gpt2-response_gen")

In [132]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
emotion_model.to(device)
gpt2_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

#### Model and tokenizer of the LLM used for classification are used to predict emotion of a sentence used and then generate a response according to the input prompt

In [133]:
def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_idx = torch.argmax(logits, dim=1).item()
    emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

    if predicted_class_idx < len(emotion_labels):
        predicted_emotion = emotion_labels[predicted_class_idx]
    else:
        predicted_emotion = 'unknown'  
    
    return predicted_emotion

#### A pre-prompt to lead the actual generated response

In [151]:
def generate_response(text, emotion):
    if emotion == "sadness":
        prompt = f"I'm sorry you're feeling down. Here's a comforting thought:"
    elif emotion == "anger":
        prompt = f"I understand you're angry. Let me help with that:"
    elif emotion == "joy":
        prompt = f"It's great that you're feeling joyful! Here's something to keep the mood up:"
    elif emotion == "fear":
        prompt = f"Don't be afraid. Let's find strength together:"
    elif emotion == "surprise":
        prompt = f"Wow, that sounds surprising! Let's dive into it:"
    elif emotion == "neutral":
        prompt = f"Here's a neutral perspective:"
    else:
        prompt = f"Based on the emotion {emotion}, here’s a response:"

    print(f"Prompt from Chatbot: {prompt}")
    inputs = gpt2_tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = gpt2_model.generate(inputs["input_ids"], max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)
    
    response = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

#### Example usage

In [152]:
input_text = "I'm scared of my teacher"
predicted_emotion = predict_emotion(input_text)
print(f"Predicted Emotion: {predicted_emotion}")

response = generate_response(input_text, predicted_emotion)
print(f"Generated Response: {response}")

Predicted Emotion: fear
Prompt from Chatbot: Don't be afraid. Let's find strength together:
Generated Response: Be strong.


In [155]:
input_text = "I am so excited for the party"
predicted_emotion = predict_emotion(input_text)
print(f"Predicted Emotion: {predicted_emotion}")

response = generate_response(input_text, predicted_emotion)
print(f"Generated Response: {response}")

Predicted Emotion: joy
Prompt from Chatbot: It's great that you're feeling joyful! Here's something to keep the mood up:
Generated Response: You're feeling happy!
