# Project Objective
The aim of this project is to determine a writer's emotion from the text they have written.
We will use Natural Language Processing (NLP) to differentiate between the different types of emotions.

Emotions are classified into six categories: 
1. Sadness (0)
2. Joy (1)
3. Love (2)
4. Anger (3) 
5. Fear (4) 
6. Surprise (5)

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [3]:
# Load the raw emotion dataset from a CSV file (path is relative to the working directory).
origin_dataset = pd.read_csv("raw_data/text_emotion_Dataset.csv")

## Data Exploration

### Raw data

In [None]:
# Report the dimensions of the raw table.
n_rows, n_cols = origin_dataset.shape
print(f"Raw data consist of: {n_rows} rows")
print(f"Raw data consist of: {n_cols} columns")

In [None]:
# Summarize the raw data: column names, non-null counts and dtypes.
origin_dataset.info()

In [None]:
# Preview the first rows of the raw data.
origin_dataset.head()

In [None]:
# Show the rows flagged by `duplicated()`; an empty result means the raw data
# contains no fully duplicated rows.
origin_dataset[origin_dataset.duplicated()]

In [None]:
# Number of rows per value of the `label` column.
origin_dataset['label'].value_counts()

In [None]:
# Count missing values in each column.
origin_dataset.isnull().sum()

## Data wrangling/Cleaning

In [None]:
# Work on an independent frame holding only the text and its label, with the
# label stored as a categorical column.
emotion_dataset = origin_dataset[['text', 'label']].astype({'label': 'category'})
emotion_dataset.info()

In [None]:
# Preview the first rows of the working copy.
emotion_dataset.head()

## Data Analysis

In [12]:
# Emotion names listed in the order of their integer class id (0..5).
label_mapping = dict(enumerate(
    ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']
))
# Add a human-readable emotion name next to each numeric label.
emotion_dataset['emotion_label'] = emotion_dataset['label'].map(label_mapping)

In [None]:
# Number of rows per emotion name.
emotion_dataset['emotion_label'].value_counts()

In [None]:
def _count_words(sentence):
    """Return the number of whitespace-separated tokens in `sentence`."""
    return len(sentence.split())


# Word count of every text, shown alongside the text and both label columns.
emotion_dataset['word_length'] = emotion_dataset['text'].apply(_count_words)
emotion_dataset[['text', 'word_length', 'emotion_label', 'label']]

In [None]:
# List the distinct emotion names present in the data.
print([*emotion_dataset['emotion_label'].unique()])

In [None]:
# Overall distribution of rows across the emotion names.
sns.histplot(data=emotion_dataset, x='emotion_label')

# One word-length histogram per emotion, laid out on a 3x2 grid.
fig, axs = plt.subplots(3, 2, figsize=(15, 18))
axs = axs.flatten()
emotions = list(emotion_dataset['emotion_label'].unique())
for i, emotion in enumerate(emotions):
    ax = axs[i]
    rows_for_emotion = emotion_dataset[emotion_dataset['emotion_label'] == emotion]
    sns.histplot(data=rows_for_emotion, x='word_length', bins=50, ax=ax)
    ax.set_title(f"Word length for {emotion}")
    ax.set_xlabel("Word length")
    ax.set_ylabel("Frequency")
    ax.legend([emotion])

## Data Preprocessing: Text preprocessing
- lowercase
- dealing with numbers, punctuation, and symbols
- splitting
- tokenizing
- removing "stopwords"
- lemmatizing

In [17]:
from nltk.corpus import stopwords

In [18]:
def preprocessing_text(text: pd.Series) -> pd.Series:
    """Normalize a Series of raw strings for downstream tokenization.

    Steps, in order: lowercase, drop URLs, drop punctuation/symbols, drop
    digit runs, collapse whitespace runs to a single space, and trim both ends.

    Args:
        text: Series of strings to clean.

    Returns:
        A new Series of cleaned strings with the same index as `text`.
    """
    text = text.str.lower()
    text = text.str.replace(r'http\S+', '', regex=True)
    text = text.str.replace(r'[^\w\s]', '', regex=True)
    text = text.str.replace(r'\d+', '', regex=True)
    # Collapse whitespace only after every removal step: deleting a token such
    # as a number leaves its surrounding spaces behind ("a 1 b" -> "a  b").
    text = text.str.replace(r'\s+', ' ', regex=True)
    text = text.str.strip()
    return text

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(sentence):
    """Return `sentence` with English stop words removed.

    The sentence is split on whitespace, every token found in the NLTK English
    stop-word list is dropped, and the remaining tokens are joined with single
    spaces.

    Args:
        sentence: A single string.

    Returns:
        The filtered sentence; an empty string if no token survives.
    """
    # The stop-word set is cached because this function is applied row by row;
    # rebuilding the set from the corpus on every call is wasted work.
    stop_words = _english_stop_words()
    return ' '.join(word for word in sentence.split() if word not in stop_words)

In [19]:
# Clean the raw text column, then strip English stop words from each row.
cleaned_text = preprocessing_text(emotion_dataset['text'])
emotion_dataset['text'] = cleaned_text.apply(remove_stopwords)

In [20]:
# The analysis-only helper columns are not needed for modelling.
emotion_dataset = emotion_dataset.drop(columns=['emotion_label', 'word_length'])


# Model Preparation

#### Train Test Split

In [56]:
# from nltk.stem.snowball import SnowballStemmer
# from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Vocabulary size cap; used as the Embedding layer's input_dim.
max_words = 50_000


In [51]:
# Features are the cleaned texts, targets are the numeric emotion labels.
X, y = emotion_dataset['text'], emotion_dataset['label']
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Tokenize the text data.
# The vocabulary is capped at `max_words`, the same value the Embedding layer
# uses as its input_dim (the original referenced an undefined `input_size`).
tokenizer = Tokenizer(num_words=max_words)
# Fit on the training split only, so the vocabulary is not built from test data.
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [None]:
# Length of the longest tokenized training example; used as the padded length.
maxlen = max(map(len, X_train_sequences))
print("Maximum sequence length (maxlen):", maxlen)

In [40]:
# Bring the train and test sequences to the common length `maxlen`,
# padding at the end ('post') with identical settings for both splits.
pad_options = {'maxlen': maxlen, 'padding': 'post'}
X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

In [65]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [60]:
# GRU text classifier: token ids -> 128-d embeddings -> 64-unit GRU -> dropout
# -> softmax over the emotion classes.
# NOTE(review): the inputs are post-padded and the Embedding does not set
# mask_zero, so the GRU also steps over the padding — confirm this is intended.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=maxlen),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(6, activation='softmax'),  # 6 classes for the emotions
])

# Targets are integer class ids (0..5), hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Stop training once validation accuracy has gone `patience` epochs without
# improving, and restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

# Train with 20% of the training data set aside for validation.
model.fit(
    X_train_padded,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)


##### Example of keras tokenizer

In [None]:
t = Tokenizer()
fit_text = "The earth is an awesome place live"
# fit_on_texts expects a list of texts. A bare string is iterated character by
# character, which would build a character-level vocabulary instead of words.
t.fit_on_texts([fit_text])
test_text = "The earth is an great place live"
# Same rule here: wrap the sentence in a list to get one word-id sequence.
# "great" was not seen during fitting, so it has no id in the result.
sequences = t.texts_to_sequences([test_text])

print("sequences : ",sequences,'\n')

print("word_index : ",t.word_index)