# October Code Jam

Prepared by Jeel Faldu, Jimmy Koester, and Raphael Lu

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer

## Introduction

This project explores sentiment analysis using Hugging Face Transformers, a leading library for Natural Language Processing (NLP). The goal is to automatically classify text as positive, negative, or neutral using pretrained transformer models.

We analyzed sentiment in social media posts, compared multiple models, and visualized their performance and confidence levels. We also applied the models to a creative dataset, showcasing how NLP can reveal insights from real-world text such as tweets, song lyrics, or news articles.


## Model and Device Loading

In [13]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, otherwise the CPU.
# NOTE: `device` is only defined here; nothing in this cell moves `model_1` onto it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hugging Face Hub checkpoint ids.
# The DistilBERT id is kept in one constant so the model and its tokenizer
# are always loaded from the same checkpoint.
DISTILBERT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
# `model_2` holds only the checkpoint id (a plain string), not a loaded model.
model_2 = "cardiffnlp/twitter-roberta-base-sentiment"

# `model_1` is the loaded DistilBERT sequence-classification model.
model_1 = DistilBertForSequenceClassification.from_pretrained(DISTILBERT_CHECKPOINT)

# Tokenizers matching each checkpoint.
distil_tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
roberta_tokenizer = AutoTokenizer.from_pretrained(model_2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

## Data Preprocessing

In [None]:
# Location of the raw CSV in the project's GitHub repository
data_url = 'https://raw.githubusercontent.com/jeelfaldu7/transformer_sentiment_analysis/refs/heads/main/dataset.csv'

# Read the CSV with no header row; column names are supplied explicitly
data = pd.read_csv(
    data_url,
    sep=',',
    header=None,
    names=['id', 'text', 'label'],
)

# Preview the top of the DataFrame
display(data.head())

Unnamed: 0,id,text,label
0,611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [None]:
# Print a concise summary of the dataset: index range, columns, non-null counts and dtypes
data.info()

# Display summary statistics of the dataset (last expression, so the notebook renders it)
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3085 entries, 0 to 3084
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3085 non-null   int64 
 1   text    3085 non-null   object
 2   label   3085 non-null   object
dtypes: int64(1), object(2)
memory usage: 72.4+ KB


Unnamed: 0,id
count,3085.0
mean,6.128461e+17
std,4950761000000000.0
min,3.871909e+17
25%,6.116572e+17
50%,6.130057e+17
75%,6.143229e+17
max,6.155956e+17


In [None]:
# Report the dimensions of the DataFrame (rows first, then columns)
n_rows = data.shape[0]
n_cols = data.shape[1]
print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 3085 rows and 3 columns


In [None]:
# Count the missing (NaN) entries in each column and show the result
missing_per_column = data.isna().sum()
missing_per_column

Unnamed: 0,0
id,0
text,0
label,0


In [None]:
# Count fully duplicated rows in the `data` DataFrame and show the result
n_duplicates = data.duplicated().sum()
n_duplicates

np.int64(0)