In [None]:
# Data Processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Modeling
import catboost as cb

# Text Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download the English stopword corpus into the Kaggle working directory.
nltk.download('stopwords', download_dir='/kaggle/working/')
# wordnet configuration
# The WordNet corpus is downloaded as a zip archive and then unpacked in place
# with the system `unzip` tool, so the extracted files sit under
# /kaggle/working/corpora.
import subprocess
nltk.download('wordnet', download_dir='/kaggle/working/')
command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
# NOTE(review): the return code of unzip is not checked here.
subprocess.run(command.split())
# Add the download directory to NLTK's data search path.
nltk.data.path.append('/kaggle/working/')

# BERT
import torch
from transformers import BertModel
from transformers import BertTokenizer

# Miscellaneous
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the tweet emotion dataset, using the tweet_id column as the index,
# and preview the first rows.
df = pd.read_csv("/kaggle/input/emotion-detection-from-text/tweet_emotions.csv", index_col='tweet_id')
df.head()

# Data Overview

In [None]:
# Report the dataset dimensions.
n_rows, n_cols = df.shape
print(f"{n_rows} rows")
print(f"{n_cols} columns")

In [None]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts()

In [None]:
# Count of missing values in each column.
df.isna().sum()

# Text Processing

In [None]:
# Shared resources used by remove_stopwords() and lemmatizing() below.
# The stopwords are held in a set, so the membership test per token is a hash lookup.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# lowercase the tweet content
def lowercasing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``content`` column is lower-cased.

    The frame passed in is not modified.
    """
    lowered = df.copy()
    lowered['content'] = lowered['content'].str.lower()
    return lowered

# tokenize the tweet content
def tokenizing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a new ``tokenized_content`` column.

    Each entry is the result of ``word_tokenize`` applied to the matching
    ``content`` value. The frame passed in is not modified.
    """
    with_tokens = df.copy()
    token_lists = with_tokens['content'].apply(word_tokenize)
    with_tokens['tokenized_content'] = token_lists
    return with_tokens

# remove the stopwords
def remove_stopwords(row):
    """Return the tokens of ``row`` that are not in the module-level ``stop_words``.

    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in row:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# lemmatization
def lemmatizing(row):
    """Return a list with every token of ``row`` passed through ``lemmatizer.lemmatize``."""
    return [lemmatizer.lemmatize(token) for token in row]

# label encoding
def transform_label(row):
    """Map a sentiment label string to its integer class id.

    The id of a label is its position in ``ordered_labels`` below.

    Raises:
        KeyError: if ``row`` is not one of the thirteen known labels.
    """
    ordered_labels = (
        "neutral",
        "worry",
        "happiness",
        "sadness",
        "love",
        "surprise",
        "fun",
        "relief",
        "hate",
        "empty",
        "enthusiasm",
        "boredom",
        "anger",
    )
    label_to_id = {label: class_id for class_id, label in enumerate(ordered_labels)}
    return label_to_id[row]

# transform the data
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full text-preprocessing pipeline and return a new frame.

    Steps, in order: lower-case ``content``; tokenize it into
    ``tokenized_content``; drop stopwords into ``wo_stopwords``; lemmatize into
    ``lemmatized_content``; replace the ``sentiment`` strings with integer ids.
    The frame passed in is not modified.
    """
    processed = tokenizing(lowercasing(df.copy()))
    without_stopwords = processed['tokenized_content'].apply(remove_stopwords)
    processed['wo_stopwords'] = without_stopwords
    processed['lemmatized_content'] = processed['wo_stopwords'].apply(lemmatizing)
    processed['sentiment'] = processed['sentiment'].apply(transform_label)
    return processed

In [None]:
# Apply the preprocessing pipeline and preview the first three rows.
# NOTE(review): this rebinds `df`, so running the cell a second time passes the
# already-encoded integer labels to transform_label, whose mapping is keyed by
# the label strings, and that lookup raises KeyError. Reload `df` before re-running.
df = transform(df)
df.head(3)

# Word Embeddings

## Tf Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit the vocabulary to at most 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# list to string
# The isinstance guard keeps this cell safe to re-run: once the column already
# holds joined strings, joining again would put a space between every character.
df['lemmatized_content'] = df['lemmatized_content'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

# vectorizing
# NOTE: .toarray() converts the vectorizer output into a dense array, which
# takes far more memory than the sparse form for a large corpus.
tfidf_matrix = vectorizer.fit_transform(df['lemmatized_content'].values).toarray()
tfidf_matrix

## BERT

In [None]:
def add_special_tokens(row):
    """Wrap the string ``row`` as ``"[CLS] <row> [SEP]"``."""
    return " ".join(("[CLS]", row, "[SEP]"))

# Build the BERT input text from the `content` column (not the lemmatized
# text) by wrapping each tweet in the [CLS] / [SEP] marker strings.
df['bert_content'] = df['content'].apply(add_special_tokens)
df.head(2)

In [None]:
# Load pretrained model/tokenizer
# Load pretrained model/tokenizer
# output_hidden_states=True is requested so that the per-layer hidden states
# are available to get_bert_word_embeddings().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [None]:
def get_bert_word_embeddings(text, tokenizer, model):
    """Return one embedding vector per token: the sum of the last four hidden layers.

    Args:
        text: Input string that already carries the ``[CLS]`` / ``[SEP]``
            markers (as the ``bert_content`` column does). The tokenizer is
            therefore told not to add its own, which would duplicate them.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: BERT-style model whose output exposes ``hidden_states``.

    Returns:
        A tensor detached from autograd. For a single input text it is expected
        to have shape (tokens, hidden_size), assuming each hidden state is
        (batch, tokens, hidden_size) as the transformers documentation
        describes.
    """
    # Tokenize the input text; the special tokens are already in the text.
    tokenized_text_detail = tokenizer(text, return_tensors='pt', add_special_tokens=False)
    token_ids = tokenized_text_detail["input_ids"]
    token_masks = tokenized_text_detail["attention_mask"]
    # Inference only: without no_grad the result would track gradients, wasting
    # memory and making a later .numpy() call fail.
    with torch.no_grad():
        outputs = model(
            input_ids=token_ids,
            attention_mask=token_masks,
            output_hidden_states=True,
        )
    # One tensor per layer (embedding output first), stacked on a new leading axis.
    stacked_hidden_states = torch.stack(outputs.hidden_states)
    # Sum the last 4 layers for each token. The slice must be taken on the layer
    # axis (axis 0 of the stack); slicing after a permute would hit the batch axis.
    token_vecs_sum = stacked_hidden_states[-4:].sum(dim=0)
    # Drop the batch axis for a single input text.
    return token_vecs_sum.squeeze(0)

# Example usage: embed the tweet stored under index label 1956967341
# and show the shape of the resulting tensor.
text = df.loc[1956967341, 'bert_content']
embeddings = get_bert_word_embeddings(text, tokenizer, model)
print(embeddings.shape)

In [None]:
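# NOTE: this cell is disabled. `embeddings_df`, which the BERT modeling cell
# further down passes to train_test_split, is only created by the code below,
# so that cell fails on a fresh kernel until this one is restored.
# embeddings_list = []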

# for text in df['bert_content']:
#     embeddings = get_bert_word_embeddings(text, tokenizer, model)
#     embeddings_list.append(embeddings)
    
# embeddings_array_list = [embedding.numpy() for embedding in embeddings_list]
# embeddings_df = pd.DataFrame(embeddings_array_list)
# embeddings_df.head()

# Modeling
I use CatBoost with several different word-embedding approaches.

In [None]:
def plot_training_loss(model):
    """Plot the training loss recorded in ``model.evals_result_``.

    Reads the ``'MultiClass'`` series of the ``'learn'`` set and draws it as a
    single line, one point per entry.
    """
    learn_curve = list(model.evals_result_['learn']['MultiClass'])
    # Only the training curve is drawn.
    plt.plot(learn_curve, label='Training Error')
    plt.ylabel('MultiClass')
    plt.xlabel('Iteration')
    plt.legend()
    plt.show()

## Tf Idf

In [None]:
# Hold out 20% of the TF-IDF rows for validation, stratified by sentiment.
X_tfidf_train, X_tfidf_val, y_tfidf_train, y_tfidf_val = train_test_split(
    tfidf_matrix, df['sentiment'],
    stratify=df['sentiment'],
    random_state=12,
    test_size=0.2,
)

In [None]:
# Multiclass CatBoost on the TF-IDF features; verbose=50 controls how often progress is logged.
cbc = cb.CatBoostClassifier(loss_function='MultiClass', verbose=50)
cbc.fit(X_tfidf_train, y_tfidf_train)

In [None]:
# Training loss of the TF-IDF model.
plot_training_loss(cbc)

## Catboost Text Features

In [None]:
# Same 80/20 stratified split, but on the `content` column kept as a
# one-column DataFrame so CatBoost can be pointed at it by name.
X_train, X_val, y_train, y_val = train_test_split(
    df[['content']], df['sentiment'],
    stratify=df['sentiment'],
    random_state=12,
    test_size=0.2,
)

In [None]:
# Multiclass CatBoost that treats the `content` column as a text feature.
cbc2 = cb.CatBoostClassifier(
    text_features=["content"],
    loss_function='MultiClass',
    verbose=50,
)
cbc2.fit(X_train, y_train)

In [None]:
# Training loss of the text-feature model.
plot_training_loss(cbc2)

## BERT

In [None]:
# NOTE: requires `embeddings_df` to exist, with one row of BERT features per
# tweet in the same row order as `df`.
# Stratified by sentiment like the TF-IDF and text-feature splits, so all
# three approaches are validated on a split with the same class proportions.
X_bert_train, X_bert_val, y_bert_train, y_bert_val = train_test_split(
    embeddings_df, 
    df['sentiment'], 
    test_size=0.2, 
    random_state=12,
    stratify=df['sentiment']
)

In [None]:
# No text_features here: this model is trained on the BERT feature frame, not
# on the raw `content` column (assumes embeddings_df holds only numeric
# embedding columns - confirm once it is built).
cbc3 = cb.CatBoostClassifier(verbose=50, loss_function='MultiClass')
cbc3.fit(X_bert_train, y_bert_train)

In [None]:
# Training loss of the BERT-feature model.
plot_training_loss(cbc3)

# Predictions

In [None]:
def evaluation(model, X_val, y_val):
    """Print a classification report and show a confusion matrix.

    Predictions come from ``model.predict(X_val)`` and are compared against
    ``y_val``. The confusion matrix rows/columns follow ``model.classes_``.
    """
    predictions = model.predict(X_val)
    report = classification_report(y_val, predictions)
    print(f"Classification Report:\n {report}\n\n")
    matrix = confusion_matrix(y_val, predictions, labels=model.classes_)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=model.classes_).plot()
    plt.title('Confusion Matrix')
    plt.show()

In [None]:
# TF-IDF model, evaluated on the held-out TF-IDF validation split
evaluation(cbc, X_tfidf_val, y_tfidf_val)

In [None]:
# CatBoost with text features, evaluated on the held-out `content` validation split
evaluation(cbc2, X_val, y_val)