# October Code Jam

Prepared by Jeel Faldu, Jimmy Koester, and Raphael Lu

In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer

from google.colab import output
output.disable_custom_widget_manager()

import urllib.request

## Introduction

This project explores sentiment analysis using `distilbert-base-uncased-finetuned-sst-2-english` and `twitter-roberta-base-sentiment` from the Hugging Face Transformers library. The goal is to automatically classify text as positive, negative, or neutral using pretrained transformer models.

We analyzed sentiment in social media posts, compared multiple models, and visualized their performance and confidence levels. We also applied the models to a creative dataset, showcasing how NLP can reveal insights from real-world text such as tweets, song lyrics, or news articles.


## Model and Device Loading

In [30]:
# Hugging Face checkpoint identifiers for the two sentiment models compared in this notebook.
DISTILBERT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
ROBERTA_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment"

# Pick the GPU when torch can see one, otherwise fall back to the CPU.
# NOTE: this cell only records the choice; the models below are not moved to `device` here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Each tokenizer is loaded from the same checkpoint as the model it feeds.
distil_tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
model_1 = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT)

roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model_2 = AutoModelForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)

## Data Preprocessing

During data exploration, no duplicate entries were found. In the original dataset, tweets were labeled with one of the following values (multiple emotions are joined with `|`):
```
'nocode', 'happy', 'not-relevant', 'angry', 'disgust|angry',
'disgust', 'happy|surprise', 'sad', 'surprise', 'happy|sad',
'sad|disgust', 'sad|angry', 'sad|disgust|angry'
```

Summary of Data Preprocessing:
* Fields were renamed `id`, `text`, and `label`
* The `id` field was ultimately dropped
* Labels were recoded into multi-hot arrays using scikit-learn's `MultiLabelBinarizer`:
     * e.g. with sentiments   
     `['angry' 'disgust' 'happy' 'nocode' 'not-relevant' 'sad' 'surprise']`,   
     a tensor of `[1 0 0 0 0 1 0]`
     indicates an `angry|sad` tweet

In [31]:
# Read the raw CSV from GitHub. header=None makes pandas treat the first line
# as data, so the three columns are named explicitly here.
data_url = 'https://raw.githubusercontent.com/jeelfaldu7/transformer_sentiment_analysis/refs/heads/main/dataset.csv'
column_names = ['id', 'text', 'label']
df = pd.read_csv(data_url, names=column_names, header=None)

# Preview the first five rows.
display(df.head())

Unnamed: 0,id,text,label
0,611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [32]:
# Print a concise summary of the DataFrame: column dtypes, non-null counts, and memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3085 entries, 0 to 3084
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3085 non-null   int64 
 1   text    3085 non-null   object
 2   label   3085 non-null   object
dtypes: int64(1), object(2)
memory usage: 72.4+ KB


In [58]:
# Drop the tweet id column. errors='ignore' keeps this cell idempotent:
# re-running it after `id` has already been removed no longer raises KeyError.
df = df.drop(columns='id', errors='ignore')

In [59]:
# Report the DataFrame's dimensions (rows, then columns)
n_rows = df.shape[0]
n_cols = df.shape[1]
print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 3085 rows and 2 columns


In [51]:
# Count fully duplicated rows in df and list the distinct raw label strings
df.duplicated().sum(),df['label'].unique()

(np.int64(0),
 array(['nocode', 'happy', 'not-relevant', 'angry', 'disgust|angry',
        'disgust', 'happy|surprise', 'sad', 'surprise', 'happy|sad',
        'sad|disgust', 'sad|angry', 'sad|disgust|angry'], dtype=object))

### Train-Test Split

In [72]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Features are every column except the label.
X_train = train_df.drop('label', axis = 1)
# Fix: X_test must come from test_df. It was previously built from train_df,
# which made X_test a copy of the training features and misaligned it with y_test.
X_test = test_df.drop('label', axis = 1)

# Labels such as 'sad|angry' are split on '|' and encoded as multi-hot rows.
# The binarizer is fitted on the training labels only and then reused for the test labels.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['label'].str.split('|'))
y_test = mlb.transform(test_df['label'].str.split('|'))

# Guard against feature/label misalignment in either split.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_test) == len(y_test), "X_test and y_test have different lengths"

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2468, 1) (2468, 7)
(617, 1) (617, 7)


In [78]:
# Class names in the column order used by the encoded label arrays
print(mlb.classes_)
# Encoded label vector for one training example (positional index 92)
print(y_train[92])

['angry' 'disgust' 'happy' 'nocode' 'not-relevant' 'sad' 'surprise']
[0 0 1 0 0 0 0]


### Data Pre-processing for roBERTa model

Tweets were processed using the method recommended in the model [documentation](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment).

In [16]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with ``@`` and
    is longer than one character is replaced by ``@user``; every token that
    starts with ``http`` is replaced by ``http``. All other tokens are kept
    unchanged, and the tokens are re-joined with single spaces.

    Args:
        text: The raw tweet text.

    Returns:
        The tweet with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # A lone '@' is not a mention, hence the length check.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [44]:
# Load a separate copy of the dataset for the RoBERTa pipeline and discard the id column,
# so the mention/link masking applied later does not touch `df`.
df_roberta = (
    pd.read_csv(data_url, header=None, names=['id', 'text', 'label'], sep=',')
    .drop('id', axis=1)
)

In [45]:
# Apply the mention/link masking to every tweet in one pass.
# Series.map replaces the per-row .loc assignment loop: same result,
# without relying on the index being exactly 0..n-1.
df_roberta['text'] = df_roberta['text'].map(preprocess)

In [46]:
# Preview the first rows after mention/link masking
df_roberta.head()

Unnamed: 0,text,label
0,@user @user @user Merci pour le partage! @user,nocode
1,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,@user @user ... Replace with your wish which t...,happy
3,@user thank you for following me back. Great t...,happy
4,@user @user What a beautiful jewel / portrait....,happy


In [57]:
# Encode the '|'-separated labels as multi-hot rows.
# Fix: `mlb` is an already-fitted MultiLabelBinarizer instance, so `mlb()` raised
# TypeError. Reusing the fitted encoder via transform() keeps the column order
# identical to y_train / y_test (see mlb.classes_).
xtrain_labels = mlb.transform(df_roberta['label'].str.split('|'))

array([0, 0, 1, 0, 0, 0, 0])