
<h3>Emotion Classification of Natural Language</h3>



<h5>Imports </h5>

In [None]:
import os
import pandas as pd
import numpy as np
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

# from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, RandomizedSearchCV

from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification, AdamW, get_scheduler, TrainingArguments, Trainer

from torch.utils.data import DataLoader

import multiprocessing
import cloudpickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam

<h5> Load the Dataset </h5>
Due to privacy concerns regarding Cornell University, the training and testing sets are not included in this repository; only the code itself is provided.

In [None]:
# Labelled training data: the "text" column is the input, "label" the target.
train = pd.read_csv("train.csv")
train_text, train_label = train["text"], train["label"]

# Unlabelled test data, identified by the "id" column.
test = pd.read_csv("test.csv")
test_id, test_text = test["id"], test["text"]

# Dense TF-IDF features restricted to a 1000-term vocabulary, with an 80/20
# train/validation split. Later cells reassign `vectorizer` and the split
# variables with their own splits.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(train_text).toarray()
y = train_label
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

<h4>Classical Methods</h4>


GRU Model 

In [None]:
# GRU split: hold out 20% of the labelled texts for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train_text,
    train_label,
    test_size=0.2,
    random_state=123,
)

# GRU tokenizer: word index is learned from the training split only; words
# outside the kept vocabulary map to the "<OOV>" token. Adjust num_words as needed.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences.
X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val)
)

# Pad / truncate every sequence at the end to a fixed length of 100
# (adjust maxlen if needed; it must match the Embedding layer).
X_train_padded, X_val_padded = (
    pad_sequences(seqs, maxlen=100, padding="post", truncating="post")
    for seqs in (X_train_seq, X_val_seq)
)


In [None]:
# GRU model: embedding -> single GRU layer -> softmax over the 28 labels.
gru_layers = [
    # input_dim / input_length mirror the tokenizer's num_words and the padding maxlen.
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(28, activation="softmax"),
]
gru_model = Sequential(gru_layers)

# Sparse categorical cross-entropy is used because the labels are passed as
# integers rather than one-hot vectors.
gru_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=5e-4),
    metrics=["accuracy"],
)

In [None]:
# GRU training, followed by a plot of train vs. validation accuracy per epoch.
import matplotlib.pyplot as plt

# Keras is given plain NumPy label arrays instead of pandas Series.
y_train, y_val = np.array(y_train), np.array(y_val)

history = gru_model.fit(
    X_train_padded,
    y_train,
    batch_size=32,  # Adjust batch size as needed
    epochs=10,  # Adjust epochs as needed
    validation_data=(X_val_padded, y_val),
)

# One curve per recorded accuracy metric.
for _metric, _legend in (
    ("accuracy", "Train Accuracy"),
    ("val_accuracy", "Validation Accuracy"),
):
    plt.plot(history.history[_metric], label=_legend)
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


Primitive Logistic Regression 

In [None]:
# Split the raw texts 80/20, then turn them into numerical features with
# Term Frequency-Inverse Document Frequency (TF-IDF).
X_train, X_val, y_train, y_val = train_test_split(
    train_text,
    train_label,
    test_size=0.2,
    random_state=123,
)
y_train, y_val = np.array(y_train), np.array(y_val)

# The vocabulary and IDF weights are learned from the training split only;
# the validation split is transformed with the already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

In [None]:
# Train model 1: multinomial logistic regression on the TF-IDF features with
# default regularisation.
log_reg_primitive_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, n_jobs=1, warm_start=True)

try:
    log_reg_primitive_model.fit(X_train, y_train)
except Exception as e:
    print(f"Error during model fitting: {e}")
    # Re-raise: the evaluation cell calls predict() on this model, so continuing
    # with an unfitted estimator would only move the failure further away from
    # its cause.
    raise
else:
    print("Model trained successfully!")


Refined Logistic Regression

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hyperparameter grid for logistic regression. It is split into sub-grids so
# that only valid solver / multi_class pairings are tried: liblinear cannot fit
# a multinomial model, and a flat grid would spend CV fits on that pairing only
# for them to fail. Every valid combination of the original grid is kept.
_C_VALUES = [0.01, 0.1, 1, 10, 100]
_MAX_ITER_VALUES = [1000, 2000, 5000]
param_grid = [
    {
        'multi_class': ['ovr'],
        'solver': ['lbfgs', 'liblinear', 'saga'],
        'C': _C_VALUES,
        'max_iter': _MAX_ITER_VALUES,
    },
    {
        'multi_class': ['multinomial'],
        'solver': ['lbfgs', 'saga'],
        'C': _C_VALUES,
        'max_iter': _MAX_ITER_VALUES,
    },
]

log_reg_model = LogisticRegression()

# Set up GridSearchCV to optimize hyperparameters for Logistic Regression:
# 5-fold cross-validation scored by accuracy, parallelised over all cores.
grid_search = GridSearchCV(
    estimator=log_reg_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Print best parameters and cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

In [None]:
# Train the refined logistic regression: rebuild the estimator from the
# hyperparameters selected by the grid search and fit it on the training split.
best_params = grid_search.best_params_
log_reg_model = LogisticRegression(
    **best_params,
)
log_reg_model.fit(X_train, y_train)

Primitive XGBoost Model 

In [None]:
from xgboost import XGBClassifier

# Baseline XGBoost: library-default hyperparameters, fitted on the TF-IDF
# training features.
xgb_primitive_model = XGBClassifier(
)
xgb_primitive_model.fit(
    X_train,
    y_train,
)

Refined XGBoost

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


# Candidate values sampled by the randomized search.
param_distributions = {
    'max_depth': [4, 6, 8],  # Tree depth
    'learning_rate': [0.05, 0.1, 0.2],  # Step size
    'n_estimators': [50, 100, 150],  # Number of trees
    'subsample': [0.8, 1.0],  # Fraction of samples used for training
    'colsample_bytree': [0.8, 1.0],  # Fraction of features used per tree
}


# Base estimator configured for the 28-class problem.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=28,
    n_jobs=4,
    verbosity=0
)


randomized_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=50,  # Test only 50 random combinations
    cv=3,
    scoring='accuracy',
    n_jobs=-1,  # Use all available CPU cores
    verbose=2,  # Show progress
    # Fix the sampling seed so the same 50 candidates are drawn on every run;
    # without it the reported "best" parameters change between executions.
    random_state=123,
)


randomized_search.fit(X_train, y_train)

# Display the best parameters and score
print("Best Parameters:", randomized_search.best_params_)
print("Best Cross-Validation Accuracy:", randomized_search.best_score_)


In [None]:
from xgboost import XGBClassifier

# Train XGBoost using the tuned parameters.
# The values are hard-coded on purpose: they came from an earlier run of the
# randomized search above and gave a higher accuracy score than the parameters
# returned by later runs, so `randomized_search.best_params_` is not used here:
#   best_params = randomized_search.best_params_
#   xgb_model = XGBClassifier(**best_params)
xgb_model = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=8,
    n_estimators=150,
    subsample=1.0,
)
xgb_model.fit(X_train, y_train)

In [None]:

# Imported here because this cell is the first one that needs accuracy_score;
# the only other import of it appears in a later cell, so a fresh
# Restart & Run All would otherwise stop here with a NameError.
from sklearn.metrics import accuracy_score

# Splitting the dataset (same seed as the training cells, so the validation
# rows are exactly the ones the models have not seen).
X_train, X_val, y_train, y_val = train_test_split(train_text, train_label, test_size=0.2, random_state=123)
y_train = np.array(y_train)
y_val = np.array(y_val)

# Rebuild the TF-IDF features from the training split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

# Evaluate every classical model on the validation split; each entry pairs the
# message prefix to print with the fitted estimator.
for _report_prefix, _fitted_model in (
    ("Primitive Logistic Regression accuracy: ", log_reg_primitive_model),
    ("Optimized Logistic Regression accuracy: ", log_reg_model),
    ("Primitive XGBoost Accuracy: ", xgb_primitive_model),
    ("Optimized XGBoost Accuracy: ", xgb_model),
):
    y_pred = _fitted_model.predict(X_val)
    print(_report_prefix + str(accuracy_score(y_val, y_pred)))

<h4> Creative Methods </h4>

Idea 1: Use a pre-trained BERT in order to get embeddings and then train a logistic regression on these embeddings 

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

In [None]:
# Pretrained TinyBERT checkpoint used purely as a feature extractor; the
# tokenizer and the encoder are loaded from the same checkpoint.
TINYBERT_CHECKPOINT = 'huawei-noah/TinyBERT_General_4L_312D'

tokenizer = BertTokenizer.from_pretrained(TINYBERT_CHECKPOINT)
model = BertModel.from_pretrained(TINYBERT_CHECKPOINT)
# Switch to inference mode (disables dropout).
model.eval()

In [None]:
def generate_embeddings(texts, batch_size=32):  # adjust batch size as needed
    """
    Embed texts with the notebook-level pretrained BERT `model` / `tokenizer`.

    The texts are processed on the CPU in consecutive slices of `batch_size`.
    Each slice is tokenized (padded, truncated to 128 tokens) and passed through
    the model without gradient tracking; the last-layer hidden state at
    position 0 (the CLS token) is kept as the embedding of each text.

    Args:
        texts: sliceable sequence of strings (callers in this notebook pass lists).
        batch_size: number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per input text, in input order.
    """
    cpu = torch.device('cpu')  # Use CPU
    model.to(cpu)

    chunks = []
    for start in tqdm(range(0, len(texts), batch_size)):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128,
        )
        encoded = {name: tensor.to(cpu) for name, tensor in encoded.items()}

        with torch.no_grad():
            last_hidden = model(**encoded).last_hidden_state
            # Position 0 of every sequence holds the CLS token embedding.
            chunks.append(last_hidden[:, 0, :].cpu().numpy())

    return np.vstack(chunks)


In [None]:

# 60/20/20 split: first set 40% aside, then halve it into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    train_text, train_label, test_size=0.4, random_state=101
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=101
)

# The tokenizer expects plain Python lists of strings, not pandas Series.
X_train, X_val, X_test = list(X_train), list(X_val), list(X_test)


# Generate embeddings
print("Generating train embeddings...")
X_train_embeddings = generate_embeddings(X_train)

print("Generating validation embeddings...")
X_val_embeddings = generate_embeddings(X_val)

print("Generating test embeddings...")
X_test_embeddings = generate_embeddings(X_test)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Values shared by both sub-grids.
_common_grid = {'C': [0.01, 0.1, 1, 10], 'max_iter': [1000, 2000]}

# Separate sub-grids so only valid solver / multi_class configurations are tried.
param_grid = [
    {'multi_class': ['ovr'], 'solver': ['liblinear'], **_common_grid},
    {'multi_class': ['multinomial'], 'solver': ['lbfgs', 'saga'], **_common_grid},
]

logistic_model = LogisticRegression()

# 5-fold cross-validated search over the BERT embeddings, scored by accuracy.
grid_search = GridSearchCV(
    logistic_model,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train_embeddings, y_train)

# Get best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Score the refitted best estimator on the validation embeddings.
y_val_pred = grid_search.best_estimator_.predict(X_val_embeddings)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy (after tuning): {val_accuracy}")

In [None]:

# Logistic regression on the BERT embeddings with a fixed configuration
# (multinomial, saga solver, C=1).
clf = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    C=1,
    max_iter=1000,
)
clf.fit(X_train_embeddings, y_train)

# Accuracy on the validation embeddings.
y_val_pred = clf.predict(X_val_embeddings)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")


Idea 2: Take a small pretrained LLM and fine-tune it for emotion/sentiment classification 

In [None]:
#Define Class for TextDataset in order to use the Hugging Face Trainer API
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized texts and their labels.

    All texts are tokenized once in the constructor (truncated / padded, at most
    128 tokens). Indexing returns a dict of tensors containing every tokenizer
    output for that example plus a ``'labels'`` entry.

    Args:
        texts: the texts to encode, passed to the tokenizer unchanged.
        labels: one label per text, in the same order. Any iterable works; it is
            copied into a list so that lookups are always positional.
        text_tokenizer: optional tokenizer callable. Defaults to the
            notebook-level ``tokenizer``.
            NOTE(review): when the datasets are built, that global was loaded
            from the TinyBERT checkpoint while the fine-tuned model is
            bert-base-uncased -- confirm the two vocabularies are identical.
    """

    def __init__(self, texts, labels, text_tokenizer=None):
        active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.encodings = active_tokenizer(
            texts, truncation=True, padding=True, max_length=128
        )
        # The notebook passes the pandas Series returned by train_test_split,
        # whose index is the shuffled original row index. Indexing such a Series
        # with an integer is a label lookup, so `labels[idx]` would return the
        # label of another row or raise KeyError. A plain list is positional.
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create datasets: one TextDataset per split of the 60/20/20 partition.
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
#Define Which version of BERT to use
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# bert-base-uncased with a classification head sized for the 28 labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=28,
)
model.to(device)


In [None]:
#Define Training Args for Finetuned BERT
training_args = TrainingArguments(
    # Checkpoints are written to the current directory, logs to ./logs.
    output_dir='.',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and save a checkpoint at the end of every epoch.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Disable CUDA only when no GPU is available.
    no_cuda=not torch.cuda.is_available(),
)

In [None]:
#train model (warning: this script takes ~6hrs to run w/ 4 2.3ghz intel core i9 cores. As such, the model is already trained and saved in the repo)
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**_trainer_kwargs)

# Fine-tune; evaluation on val_dataset follows the strategy set in training_args.
trainer.train()

In [None]:
#Test BERT Model on partition of train.csv that was saved for evaluation
predictions = trainer.predict(test_dataset=test_dataset)
logits, labels = predictions.predictions, predictions.label_ids

# The predicted class is the index of the largest logit in each row.
predicted_classes = np.argmax(logits, axis=-1)
accuracy = accuracy_score(labels, predicted_classes)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Placeholder submission: 15000 rows whose label simply equals the row id.
# The real predictions are written by the inference cell that follows.
N_PLACEHOLDER_ROWS = 15000

# Named `placeholder_ids` rather than `id` so the builtin id() is not shadowed
# for the rest of the kernel session.
placeholder_ids = range(N_PLACEHOLDER_ROWS)
prediction = range(N_PLACEHOLDER_ROWS)
submission = pd.DataFrame({'id': placeholder_ids, 'label': prediction})
# NOTE(review): this absolute path looks Kaggle-specific -- confirm the
# directory exists before running elsewhere.
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Predict a label for every text in test.csv with the fine-tuned BERT
# checkpoint and write the (id, label) pairs to submission.csv.
import pandas as pd
from transformers import pipeline

test_data = pd.read_csv("test.csv")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_path = "checkpoint-4500" #adjust path as necessary 
model = BertForSequenceClassification.from_pretrained(model_path)

# Hand the already-loaded model to the pipeline so the checkpoint is not read
# from disk a second time.
classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

predictions = []
for text in test_data["text"]:
    # Truncate exactly as during fine-tuning (max_length=128); without
    # truncation a long text can exceed the model's maximum sequence length.
    prediction = classification_pipeline(text, truncation=True, max_length=128)
    predicted_label = prediction[0]["label"]
    predictions.append(predicted_label)


# The pipeline reports labels as "LABEL_<n>"; strip the prefix to recover the
# integer class id.
test_data["label"] = predictions
test_data["label"] = test_data["label"].str.replace("LABEL_", "").astype(int)



output_file = "submission.csv"
test_data[["id", "label"]].to_csv(output_file, index=False)