## Imports

In [None]:
import gradio as gr
import pandas as pd
import tensorflow as tf
import spacy
import matplotlib.pyplot as plt

from wordcloud import WordCloud
from timeit import default_timer as timer
from helper_functions import plot_pie_chart, preprocess_text
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder

In [None]:
# Dataset directory (joined to the file names by plain string concatenation,
# hence the trailing slash) and the file name of each split.
DIR = 'kaggle/input/emotion-dataset-for-nlp/'
TRAIN_FILE, VAL_FILE, TEST_FILE = 'train.txt', 'val.txt', 'test.txt'

In [None]:
# Each split is a ';'-separated file without a header row; the two columns
# are named 'text' and 'label' on load.
train_df, val_df, test_df = (
    pd.read_csv(DIR + split_file, delimiter=';', header=None, names=['text', 'label'])
    for split_file in (TRAIN_FILE, VAL_FILE, TEST_FILE)
)

In [None]:
# Label distribution of the training split.
plot_pie_chart(data_frame=train_df, title='Train Dataset')

In [None]:
# Label distribution of the validation split.
plot_pie_chart(data_frame=val_df, title='Validation Dataset')

## Preprocessing

In [None]:
%%writefile helper_functions.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import spacy

# Class names are stored one per line; surrounding whitespace is stripped.
with open('SentimentAnalysis/class_names.txt', 'r') as f:
    labels = [line.strip() for line in f]

# Fitted once at import time; preprocess_text() uses it to encode labels.
encoder = LabelEncoder().fit(labels)

# spaCy pipeline used by preprocess_text() for lemmatisation and token flags.
nlp = spacy.load("en_core_web_sm")


def plot_pie_chart(data_frame: pd.DataFrame, title: str) -> None:
    """
    Draw the share of each label in a DataFrame as a pie chart.

    Args:
        data_frame (pd.DataFrame): Frame with a 'label' column whose values are counted.
        title (str): Prefix of the chart title; " Label Distribution" is appended.

    Returns:
        None
    """
    counts = data_frame['label'].value_counts()

    plt.figure(figsize=(8, 8))
    sns.set_style("whitegrid")
    # One colour from the "hls" palette per distinct label.
    slice_colors = sns.color_palette("hls", len(counts.index))
    plt.pie(
        counts,
        labels=counts.index,
        colors=slice_colors,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title(f"{title} Label Distribution")
    # Show the figure, then close it.
    plt.show()
    plt.close()

def preprocess_text(df: pd.DataFrame, emotions: list = None):
    """
    Filter out unwanted emotions, lemmatise the text and encode the labels.

    Uses the module-level spaCy pipeline ``nlp`` and the module-level fitted
    ``encoder``. The input frame is never modified; a new frame is returned.

    Args:
        df (pd.DataFrame): DataFrame containing 'text' and 'label' columns.
        emotions (list): Label values whose rows are dropped. ``None`` (the
            default) means ['love', 'surprise'].

    Returns:
        pd.DataFrame: Frame with 'processed_text' (space-joined lemmas of the
        tokens that are not stop words, punctuation or whitespace) and
        'label_num' (encoded label); 'text' and 'label' are removed.
    """
    # None instead of a list literal avoids sharing one mutable default
    # object between calls.
    if emotions is None:
        emotions = ['love', 'surprise']

    # .copy() gives an independent frame, so the column assignments below do
    # not trigger SettingWithCopyWarning and never touch the caller's data.
    df = df[~df['label'].isin(list(emotions))].copy()

    # nlp.pipe processes the texts as a stream instead of one call per row.
    df['processed_text'] = [
        ' '.join(
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
        )
        for doc in nlp.pipe(df['text'])
    ]

    df['label_num'] = encoder.transform(df['label'])
    return df.drop(columns=['text', 'label'])


In [None]:
# Run the shared preprocessing (default emotion filter) on the training and
# validation splits, in that order.
train_df, val_df = (preprocess_text(split) for split in (train_df, val_df))

## Visualisation

In [None]:
with open('SentimentAnalysis/class_names.txt', 'r') as f:
    labels = [emotion.strip() for emotion in f.readlines()]

# After preprocessing, train_df only carries the encoded 'label_num' column.
# An encoder fitted on the same class names turns each index back into its name.
label_decoder = LabelEncoder().fit(labels)

# Generate and plot one word cloud per label, built from ALL processed
# sentences of that label (not from a single sentence).
for label_num, label_rows in train_df.groupby('label_num'):
    class_label = label_decoder.inverse_transform([label_num])[0]
    text = ' '.join(label_rows['processed_text'])
    if not text.strip():
        # WordCloud.generate() raises when the input contains no words.
        continue
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for Label: {class_label}')
    plt.axis('off')
    plt.show()


## Model

In [None]:
# Upper bound on the vocabulary kept by the vectorization layer.
VOCAB_SIZE = 15000
text_vect_layer = layers.TextVectorization(max_tokens=VOCAB_SIZE)
# Build the vocabulary from the preprocessed training text only.
text_vect_layer.adapt(train_df['processed_text'])

In [None]:
EMBEDDING_UNITS = 64
UNITS = 64
DROPOUT = 0.2

# Raw strings -> token ids -> embeddings -> two stacked GRUs -> class scores.
model_v1 = tf.keras.Sequential([
    text_vect_layer,
    layers.Embedding(
        input_dim=text_vect_layer.vocabulary_size(),
        output_dim=EMBEDDING_UNITS,
        mask_zero=True,
    ),
    # The first GRU returns the full sequence so the second GRU can consume it.
    layers.GRU(UNITS, dropout=DROPOUT, return_sequences=True),
    layers.GRU(UNITS, dropout=DROPOUT),
    # Each sentence has exactly one label, so the output is a softmax
    # distribution over the classes. Independent sigmoid scores do not sum
    # to 1 and do not match SparseCategoricalCrossentropy(from_logits=False).
    layers.Dense(len(labels), activation='softmax'),
])

In [None]:
# Integer class ids as targets -> sparse categorical cross-entropy.
model_v1.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [None]:
BATCH_SIZE = 32
# Build (text, label id) datasets for both splits, batched identically.
train_set, val_set = (
    tf.data.Dataset.from_tensor_slices(
        (frame['processed_text'].values, frame['label_num'].values)
    ).batch(BATCH_SIZE)
    for frame in (train_df, val_df)
)

In [None]:
# Train for 5 epochs, validating on the validation split.
history = model_v1.fit(x=train_set, epochs=5, validation_data=val_set)

In [None]:
# Evaluate on the validation split.
# NOTE(review): the name `eval` shadows the Python builtin; kept unchanged so
# any code that reads this variable keeps working.
eval = model_v1.evaluate(x=val_set)

In [None]:
# Persist the trained model to the path stored in file_path.
file_path = 'SentimentAnalysis/models/gru_model.keras'
model_v1.save(filepath=file_path)

In [None]:
# Reload the saved model from file_path.
reconstructed_model = tf.keras.models.load_model(file_path)

## Predictions for a single sentence
To make predictions on a single sentence, we need to do the following steps:
1. Preprocess it in the same way as the training data, removing extra tokens such as stop words and punctuation.
2. Expand the dimension.
3. Pass through the model.

In [None]:
# Class names, one per line.
with open('SentimentAnalysis/class_names.txt', 'r') as f:
    labels = [line.strip() for line in f]

# Example sentences offered in the Gradio UI, one per line.
with open('SentimentAnalysis/examples.txt', 'r') as f:
    example_list = [line.strip() for line in f]

encoder = LabelEncoder().fit(labels)

# Saved model used by predict().
model = tf.keras.models.load_model('SentimentAnalysis/models/gru_model.keras')

# spaCy pipeline used by preprocess_single_sentence().
nlp = spacy.load("en_core_web_sm")
def preprocess_single_sentence(sentence):
    """
    Lemmatise one sentence and drop stop words, punctuation and whitespace tokens.

    Args:
        sentence (str): Input sentence.

    Returns:
        str: The lemmas of the remaining tokens, joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(sentence):
        # Skip tokens that carry no content for the classifier.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

def predict(text):
    """
    Classify one piece of text with the loaded model and time the call.

    Args:
        text (str): The text to make predictions on.

    Returns:
        tuple: ``(scores, seconds)`` where ``scores`` is a dict mapping each
        class name to its predicted score as a float, and ``seconds`` is the
        elapsed wall-clock time (preprocessing included) rounded to 5 decimals.
    """
    start_time = timer()
    text = preprocess_single_sentence(text)
    # Add a leading dimension so the single string is passed as a batch of one.
    text = tf.expand_dims(text, 0)

    probability = model.predict(text)
    # Training labels were produced by a LabelEncoder fitted on the class
    # names, so output index k belongs to encoder.classes_[k] (sorted order).
    # That is not necessarily the k-th line of class_names.txt.
    pred_label_with_prob = {
        str(class_name): float(score)
        for class_name, score in zip(encoder.classes_, probability[0])
    }
    pred_time = round(timer() - start_time, 5)
    return pred_label_with_prob, pred_time

### Gradio App
# Named text_input rather than `input` so the Python builtin is not shadowed.
text_input = gr.Textbox(
    lines=5,
    label="Enter text",
    placeholder="i like to have the same breathless feeling as a reader eager to see what will happen next",
)
# predict() returns (scores dict, elapsed seconds); one output component each.
outputs = [
    gr.Label(num_top_classes=len(labels), label="Predictions"),
    gr.Number(label="Prediction time (s)"),
]
title = ' Sentiment Analysis 🤣😱😡😢 '
# The bold tag is closed properly and "sadness" is spelled correctly.
description = 'The sentiment analysis model is a deep learning-based natural language processing (NLP) model designed to analyze and classify the sentiment expressed in text data. It is trained to understand the emotional tone of text and categorize it into predefined sentiment categories such as <b>anger, fear, sadness and joy.</b>'
demo = gr.Interface(
    fn=predict,
    inputs=text_input,
    outputs=outputs,
    title=title,
    description=description,
    examples=example_list,
)
demo.launch()