<a href="https://colab.research.google.com/github/IdanKanat/COVID_NLP_Advanced_DL_Project/blob/main/AdvancedTopicsDL_Project_IdanKanat%26IdoShahar_COVID_NLP_21.8.2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**

In [1]:
!pip install transformers
!pip install optuna
!pip install wandb
!pip install evaluate
!pip install huggingface_hub

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
# Relevant imports:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')
import shutil
from google.colab import files

import os
import json
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.optim import AdamW
from torch import nn, optim
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback, get_scheduler
from torch.nn.utils import prune

import optuna
import wandb
from datasets import Dataset, DatasetDict, load_from_disk, Value, Sequence, concatenate_datasets

from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

# **Part A - Exploratory Data Analysis (EDA)**

#### Data Path (Relevant for running the files not from Drive) - **PLEASE FIRST DOWNLOAD THE [Project_COVID_NLP folder](https://drive.google.com/drive/folders/1egGGJ6F878xIk_bKUfjhyZStESiliwRC?usp=sharing) accessible from idankanat@gmail.com's Google drive!!**

In [None]:
# Basic Drive path we used for this project. Assuming Google Colab exists as well as mounting files to drive, user can change it accordingly as he downloads the Project_COVID_NLP folder as specified in the project_root below and documented above.
basic_drive_path = "/content/drive/MyDrive" # USER CAN CHANGE IT IF HE DOESN'T WORK IN DRIVE AND DOWNLOADS FROM DRIVE THE Project_COVID_NLP folder!!
project_root = f"{basic_drive_path}/Project_COVID_NLP" # Root project folder
data_path = f"{project_root}/data"

In [None]:
# Loading the Corona_NLP_train dataset:
df = pd.read_csv(f"{data_path}/Corona_NLP_train.csv", encoding='latin1')
df.head(10)

In [None]:
wandb.login()

### **Sentiments Distribution**

In [None]:
# Count sentiment frequencies
sentiment_counts = df['Sentiment'].value_counts()

# Define the custom order
custom_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

# Reindex according to desired order
sentiment_counts = sentiment_counts.reindex(custom_order)

# Format long labels to be multi-line
sentiment_counts.index = sentiment_counts.index.str.replace("Extremely Positive", "Extremely\nPositive")
sentiment_counts.index = sentiment_counts.index.str.replace("Extremely Negative", "Extremely\nNegative")

# Plotting the general sentiment distribution
ax = sentiment_counts.plot(kind='bar', color='blue', edgecolor='black')

# Add title and labels
plt.title("Sentiment Distribution", fontweight='bold')
plt.xlabel("Sentiment")
plt.ylabel("Number of Tweets")
plt.xticks(rotation=0)

# Add top margin so numbers don't touch the edge
plt.ylim(0, sentiment_counts.max() + 1500)

# Add bold value labels for each sentiment category, with comma formatting
for i, count in enumerate(sentiment_counts):
    if pd.notna(count):
        plt.text(i, count + 200, f"{int(count):,}", ha='center', va='bottom', fontsize=10, fontweight = "bold")

# Show the plot
plt.tight_layout()
plt.show()

From the sentiment distribution shown above, we can draw a few conclusions:
1. **There are more positive tweets than negative tweets.**
2. **There are more extremely positive tweets than extremely negative tweets.** This ensures that even when combining the extremes of each sentiment, positive tweets outnumber negatives. The gap between positive and negative tweets enlarges as we add the extremes of each group.

### **Daily Tweet Counts**

In [None]:
# Standardize the 'TweetAt' date column:
df['TweetAt'] = pd.to_datetime(df['TweetAt'], dayfirst=False, errors='coerce')
df = df.dropna(subset=['TweetAt'])

# Create a new column 'YearMonth' for grouping by month
df['YearDay'] = df['TweetAt'].dt.date

# Add a column for tweet length:
df['TweetLength'] = df['OriginalTweet'].astype(str).apply(len)

# Define sentiment colors:
sentiment_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
colors = {
    'Extremely Negative': '#e74c3c',
    'Negative': '#e67e22',
    'Neutral': '#f1c40f',
    'Positive': '#2ecc71',
    'Extremely Positive': '#3498db',
    'All Tweets': 'gray'
}

# First, plotting all tweets:
fig, ax = plt.subplots(figsize=(10, 5))
day_counts_all = df.groupby('YearDay').size()
x_all = day_counts_all.index.astype(str)
y_all = day_counts_all.values
ax.plot(x_all, y_all, color='gray', marker='o', linewidth=2)
ax.set_title("Daily Tweet Counts", fontweight='bold', fontsize=14, pad=20)
ax.set_xlabel("Day")
ax.set_ylabel("Tweet Count")
ax.tick_params(axis='x', rotation=45)
# for spine in ax.spines.values():
    # spine.set_edgecolor('red')
    # spine.set_linewidth(3)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# Second, plotting stratified sentiment trends in one plot:
fig, ax = plt.subplots(figsize=(12, 6))

for sentiment in sentiment_order:
    data = df[df['Sentiment'] == sentiment]
    day_counts = data.groupby('YearDay').size()
    x = day_counts.index.astype(str)
    y = day_counts.values
    ax.plot(x, y, label=sentiment, color=colors[sentiment], marker='o', linewidth=2)

ax.set_title("Daily Tweet Counts by Sentiment", fontweight='bold', fontsize=14, pad=20)
ax.set_xlabel("Day")
ax.set_ylabel("Tweet Count")
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.legend(title="Sentiment")
plt.tight_layout()
plt.show()

From the two plots above, we can conclude:
- As we could intuitively predict, there was a surge of tweets in March 2020 following the COVID-19 outburst.
- This massive surge in tweets wasn't attributed to any specific sentiment but rather all different sentiments indicated much more frequent tweets in March.

### **Tweet Length Distribution**

In [None]:
# Compute the number of characters in each tweet
df['TweetLength'] = df['OriginalTweet'].astype(str).str.len()

# Plotting a histogram of tweet lengths:
plt.figure(figsize=(8, 5))
plt.hist(df['TweetLength'], bins=50, color='purple', edgecolor='black')

plt.title("Distribution of Tweet Lengths (in characters)", fontweight='bold')
plt.xlabel("Number of Characters")
plt.ylabel("Number of Tweets")
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Displaying summary statistics using the .describe() command:
length_stats = df['TweetLength'].describe().astype(int)
length_stats

From the tweet length distributions, several conclusions can be drawn:
- **Strong right skew up to the character limit -** There’s a visible increase in tweet counts as length increases, peaking around 240–280 characters.

- **A sharp drop after ~280 characters -** Reflects the Twitter character limit (likely 280) — tweets can't go longer, so the distribution is naturally cut off there.

In [None]:
from matplotlib.patches import Rectangle

# Sentiment labels and colors
sentiment_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']
colors = {
    'Extremely Negative': '#e74c3c',
    'Negative': '#e67e22',
    'Neutral': '#f1c40f',
    'Positive': '#2ecc71',
    'Extremely Positive': '#3498db',
    'All Tweets': 'gray'
}

# Manual plot order with 'All Tweets' in the center
plot_order = [
    'Extremely Negative', 'All Tweets', 'Negative',
    'Extremely Positive',            'Neutral',   'Positive'
]

# Create 2x3 subplots
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 10))
axes = axes.flatten()

# Plot each distribution
for i, label in enumerate(plot_order):
    if label == 'All Tweets':
        data = df['TweetLength']
    else:
        data = df[df['Sentiment'] == label]['TweetLength']

    axes[i].hist(data, bins=40, color=colors[label], edgecolor='black', alpha=0.9)
    axes[i].set_title(label, fontweight='bold')
    axes[i].set_xlabel("Tweet Length (characters)")
    axes[i].set_ylabel("Count")
    axes[i].grid(axis='y', linestyle='--', alpha=0.5)

    # Add bold border to the axes itself (cleaner than external patch)
    if label == 'All Tweets':
        for spine in axes[i].spines.values():
            spine.set_edgecolor('red')
            spine.set_linewidth(3)


# Title and layout
plt.suptitle("Tweet Length Distributions by Sentiment", fontsize=16, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

Comapring the stratified distribution charts above to the general tweet-length distribution, a few insights emerge:
1. The distributions of EACH of the non-neutral sentiments (i.e. both positive, negative, and extreme sentiments) seems to ***largely*** align with the general tweet length distribution - right skewed - i.e. a tail to the left. Long tweets are frequent.
2. The only distinctfully different stratified histogram is w.r.t to the ***neutral*** sentiment, where shorter tweet lengths are also common, as well as the longer tweets (which are frequent in the other histograms too).

In [None]:
# Ensure tweet lengths are computed
df['TweetLength'] = df['OriginalTweet'].astype(str).str.len()

# Define sentiment order + 'All'
all_labels = sentiment_order + ['All Tweets']

# Initialize dictionary to collect describe stats
summary_dict = {}

# Add describe() for each sentiment
for sentiment in sentiment_order:
    stats = df[df['Sentiment'] == sentiment]['TweetLength'].describe().astype(int)
    summary_dict[sentiment] = stats

# Add general (all tweets) stats
summary_dict['All Tweets'] = df['TweetLength'].describe().astype(int)

# Combine into a DataFrame
summary_df = pd.DataFrame(summary_dict)

# Optional: Reorder rows (metrics)
summary_df = summary_df.reindex(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

# Highlight function
def highlight_extremes(row):
    is_max = row == row.max()
    is_min = row == row.min()
    return ['background-color: lightgreen' if v else
            'background-color: salmon' if m else '' for v, m in zip(is_max, is_min)]

# Styling the dataframe w.r.t to row's maximum (green) & minimum (red)
styled_df = summary_df.style.apply(highlight_extremes, axis=1)

styled_df

This table displays the key statistics of each of the stratified distributions (w.r.t to sentiment), as well as the general tweet length distribution. We can observe:

- The longest tweet belongs to the extremely negative sentiment group (355 tokens!), the longest extremely positive tweet consisted of 338 tokens, indicating that tweets invoking extreme emotions appear to be longer and they're potentially POSITIVELY associated with length.
- The neutral sentiment distribution has the largest S.D, aligning with bigger spread than the other distributions, as described above in the graphs.

### **Tweets by Region**

In [None]:
# Load the location data
df['Location'] = df['Location'].fillna("").str.lower()  # Standardize by lowercasing all location values

# Group key-words by region:
region_keywords = {
    "US": [
        "usa", "u.s.a", "u.s", "america", "united states of america", "united states", "texas", "tx", "austin",
        "houston", "abilene", "new york", "new york city", "nyc", "ny", "california", "ca", "florida", "fl",
        "washington", "dc", "washington dc", "washington d.c.", "alaska", "chicago", "illinois", "arizona", "az",
        "atlanta", "ga", "baltimore", "boston", "brooklyn", "manhattan", "queens", "bronx", "staten island",
        "il", "nc", "nj", "va", "tn", "oh", "ohio", "sc", "co", "colorado", "detroit", "mi", "hollywood",
        "los angeles", "san fransisco", "honolulu", "hi", "indiana", "in", "kansas", "philadelphia", "pa",
        "phoenix", "me", "or", "portland", "oregon", "las vegas", "nv", "maryland", "nevada", "massachusetts",
        "miami", "michigan", "minneapolis", "nashville", "new orleans", "new jersey", "salt lake city", "ut",
        "utah", "slc", "san diego", "seattle", "silicon valley"
    ],
    "UK & Commonwealth": [
        "england", "uk", "u.k", "united kingdom", "london", "essex", "leeds", "liverpool", "manchester",
        "canada", "toronto", "ontario", "alberta", "british columbia", "montreal", "quebec", "ottawa", "vancouver",
        "australia", "south australia", "canberra", "melbourne", "sydney", "adelaide", "victoria",
        "new zealand", "auckland", "scotland", "aberdeen", "edinburgh", "glasgow", "ireland", "dublin"
    ],
    "Europe": [
        "netherlands", "amsterdam", "nederland", "holland", "the netherlands",
        "germany", "berlin", "frankfurt", "munich", "hamburg", "dusseldorf", "deutschland",
        "france", "paris", "belgium", "brussels", "switzerland", "geneva", "zurich",
        "spain", "barcelona", "madrid", "italy", "milan", "milano", "rome", "roma",
        "portugal", "lisbon", "austria", "vienna", "russia", "moscow", "st. petersburg"
    ],
    "Africa": [
        "south africa", "cape town", "johannesburg", "ghana", "accra", "nigeria", "lagos",
        "kenya", "uganda", "kampala"
    ],
    "Asia": [
        "india", "mumbai", "new delhi", "delhi", "bangalore", "hong kong", "singapore",
        "japan", "tokyo", "pakistan", "malaysia", "china", "shanghai",
        "united arab emirates", "united arab emirate", "abu dhabi", "uae", "dubai"
    ]
}

# Reverse mapping: from keyword to region
keyword_to_region = {
    keyword: region for region, keywords in region_keywords.items() for keyword in keywords
}

# Assigning region per location using this function:
def assign_region(location):
    for keyword, region in keyword_to_region.items():
        if keyword in location:
            return region
    return None

# Map the locations to their corresponding defined regions:
df['Region'] = df['Location'].apply(assign_region)

# Count tweets per region:
region_counts = df['Region'].value_counts().reset_index()
region_counts.columns = ['Region', 'TweetCount']

# Plotting the pie chart:
plt.figure(figsize=(9, 9))
wedges, texts, autotexts = plt.pie(
    region_counts['TweetCount'],
    labels=[f"{region} ({count})" for region, count in zip(region_counts['Region'], region_counts['TweetCount'])],
    autopct="%1.1f%%",
    startangle=140,
    pctdistance=0.65,       # Move percentage labels further inward
    labeldistance=1.15,     # Move labels further out
    textprops={'fontsize': 12}
)

plt.title("Tweet Distribution by Region", fontsize=14, pad=35)
plt.axis('equal')
plt.tight_layout()
plt.show()

The dataset contains 41,158 tweets, but there are 8,594 tweets without location values (~20%).


So, out of the remaining 32,564 tweets with location values, we analyzed the location distribution of **86% of them (28,095 tweets)** as shown in the above pie chart.

The other 14% were non-indicative locations (gibberish, small & irrelevant cities without countries mentioned, etc.)

### **Data Cleaning**

To reduce noise in the tweet content, we prepared the Corona_NLP dataset (train & test) for sentiment analysis by standardizing the tweet text. This included:

- Expanded English contractions (e.g., don’t → do not) to standardize wording.

- Replaced URLs and user mentions with placeholders, while simplifying hashtags.

- Removed unnecessary punctuation and normalized whitespace.

- Lowercased text to ensure consistency across tokens.

The clean versions of the train & test datasets with an added CleanTweet column, were saved as new CSV files for further modeling.

In [None]:
train_df = pd.read_csv(f"{data_path}/Corona_NLP_train.csv", encoding='latin1')
test_df = pd.read_csv(f"{data_path}/Corona_NLP_test.csv", encoding='latin1')

import re
from datetime import datetime

# Minimal set to avoid external libs. Non-destructive if not present.
# We will apply this function inside the next function (clean_tweet_sentiment_friendly)
def basic_contractions_expand(text: str) -> str:
    mapping = {
        "can't": "can not", "won't": "will not", "don't": "do not", "doesn't": "does not",
        "didn't": "did not", "i'm": "i am", "it's": "it is", "that's": "that is",
        "there's": "there is", "they're": "they are", "we're": "we are", "you're": "you are",
        "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "shouldn't": "should not", "couldn't": "could not", "wouldn't": "would not",
        "i've": "i have", "we've": "we have", "they've": "they have", "who's": "who is",
        "what's": "what is", "let's": "let us", "i'll": "i will", "you'll": "you will",
        "he's": "he is", "she's": "she is"
    }
    # Replace using regex with word boundaries, case-insensitive
    def repl(m):
        s = m.group(0)
        return mapping.get(s.lower(), s)
    pattern = re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b", flags=re.IGNORECASE)
    return pattern.sub(repl, text)

#the main cleaning function of the dataset
def clean_tweet_sentiment_friendly(text: str) -> str:
    if pd.isna(text):
        return text
    t = text     # Preserve original sentiment cues as much as possible
    t = basic_contractions_expand(t)
    t = re.sub(r'http\S+|www\.\S+', ' URL ', t)             # URLs -> token
    t = re.sub(r'(?<=\s)RT\s+', ' ', t)                     # RT markers (Retweet sign) at word boundary
    t = re.sub(r'@\w+', ' @user ', t)                       # Mentions -> @user
    t = re.sub(r'#(\w+)', r'\1', t)                         # Remove hashtags but keep hashtag word
    t = re.sub(r"[\"$%^&*()\-_=+\[\]{};:|/\\<>]", " ", t)   # strip most punctuation and special characters, BESIDE ! and ?
    t = re.sub(r'\s+', ' ', t).strip()                      # Normalize whitespaces to only one whitespace
    t = t.lower()                                           # Lowercase
    return t


clean_train = train_df.copy()
clean_train["CleanTweet"] = clean_train["OriginalTweet"].apply(clean_tweet_sentiment_friendly)
clean_test = test_df.copy()
clean_test["CleanTweet"] = clean_test["OriginalTweet"].apply(clean_tweet_sentiment_friendly)
clean_train.to_csv(f"{data_path}/CLEAN_Corona_NLP_train.csv", index=False, encoding="utf-8")
clean_test.to_csv(f"{data_path}/CLEAN_Corona_NLP_test.csv",  index=False, encoding="utf-8")

# Basic inspections of the cleaned train & test datasets:
clean_train.head(200)
clean_test.head(200)

# **Part B - Training Pre-Trained HuggingFace models**

### **Data Splitting - Train and Validation**

We split the original training dataset into training and validation subsets, ensuring stratification which respects the original label / sentiment distributions, now in the new subsets. Besides, the new subsets contained only relevant info for classification, i.e. the cleaned tweet content and the labels / sentiments themselves.

In [None]:
# load the CLEAN datasets using ISO-8859-1 encoding due to UTF-8 decoding error
# We are loading the only two relevant columns for the classification (Sentiment label & Cleaned Tweet content)
train_df = pd.read_csv(f"{data_path}/CLEAN_Corona_NLP_train.csv", encoding='latin1',usecols=["Sentiment", "CleanTweet"] )
test_df = pd.read_csv(f"{data_path}/CLEAN_Corona_NLP_test.csv", encoding='latin1', usecols=["Sentiment", "CleanTweet"])

# Fixed label mapping for all of the data before splitting
label_order = ['Extremely Negative','Negative','Neutral','Positive','Extremely Positive']
label2id = {l:i for i,l in enumerate(label_order)}
id2label = {i:l for l,i in label2id.items()}

test_size = len(test_df)

# Split the training data to create a validation set of the same size as the test set, stratification was included to keep the same label distribution across the training & validation subsets
train_df_reduced, val_df = train_test_split(
    train_df,
    test_size=test_size, # Validation set's size equals the test set's size
    random_state=42,
    stratify=train_df['Sentiment'] # Stratified the subsets w.r.t labels - sentiments
)

train_df_reduced_size = len(train_df_reduced)
val_size = len(val_df)
total = train_df_reduced_size + val_size + test_size

# Create a summary table
summary = {
    "Dataset": ["Training after splitting", "Validation", "Test"],
    "Records": [train_df_reduced_size, val_size, test_size],
    "Percentage": [round(100 * train_df_reduced_size / total, 2),
                   round(100 * val_size / total, 2),
                   round(100 * test_size / total, 2)]
}


summary_df = pd.DataFrame(summary)
summary_df.head()