## Importing Libraries

In [17]:
# Pre-Processing
import pandas as pd
import re
import emoji
import contractions

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

import evaluate

import numpy as np

## Importing Data

In [18]:
data_A_fp = 'twitter_data/custom_data/COVIDSenti-main/COVIDSenti-A.csv'
data_B_fp = 'twitter_data/custom_data/COVIDSenti-main/COVIDSenti-B.csv'
data_C_fp = 'twitter_data/custom_data/COVIDSenti-main/COVIDSenti-C.csv'

In [19]:
data_A_df = pd.read_csv(data_A_fp)
data_B_df = pd.read_csv(data_B_fp)
data_C_df = pd.read_csv(data_C_fp)
data_full_df = pd.concat([data_A_df, data_B_df, data_C_df], ignore_index=True)

In [20]:
# Removal of links
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.replace_emoji(doc, replace='')

def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

def fix_contractions(doc):
    return contractions.fix(doc)

def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

def remove_amp(doc):
    return re.sub(r'\bamp\b', '', doc).strip() # strip removes the surrounding white space

def remove_special_character_combinations(doc):
    # Remove all combinations of \r and \n in any order
    cleaned_text = re.sub(r'[\r\n\xa0]+', '', doc)
    return cleaned_text

def remove_non_english_characters(doc):
    return re.sub(r'[^\x00-\x7F]+', '', doc)

In [21]:
# This function combines all the previous functions into one
def preprocess_tweet(doc):
    doc = remove_urls(doc)
    doc = convert_emojis(doc)
    doc = remove_hashtags(doc)
    doc = remove_numbers(doc)
    doc = remove_user_mentions(doc)
    doc = fix_contractions(doc)
    doc = remove_punctuation(doc)
    doc = remove_amp(doc)
    doc = remove_special_character_combinations(doc)
    doc = remove_non_english_characters(doc)
    return doc

In [22]:
# This applies the pre-processing
data_full_df['tweet'] = data_full_df['tweet'].apply(preprocess_tweet)

## Creating Train and Test Dataframes

In [23]:
df_pos = data_full_df[data_full_df['label'] == 'pos'].sample(n=5000, random_state=23)
df_neu = data_full_df[data_full_df['label'] == 'neu'].sample(n=5000, random_state=23)
df_neg = data_full_df[data_full_df['label'] == 'neg'].sample(n=5000, random_state=23)

filtered_df = pd.concat([df_pos, df_neu, df_neg], ignore_index=True)

label_mapping = {'neg': 0, 'neu': 1, 'pos': 2}
filtered_df['label'] = filtered_df['label'].map(label_mapping)

In [24]:
# Perform an 80/20 train-test split
train_df, test_val_df = train_test_split(filtered_df, test_size=0.4, random_state=23)
val_df, test_df = train_test_split(test_val_df, test_size=0.5, random_state=23)

## Retrain BERTweet on the tweet dataset

In [25]:
# Convert to HuggingFace dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [26]:
# Tokenize the datasets
tokenizer = AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')

def tokenize_function(doc):
    return tokenizer(doc['tweet'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [27]:
# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [28]:
# Pre-trained BERTweet model
model = AutoModelForSequenceClassification.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis', num_labels=3)

In [29]:
# Define compute_metrics function
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [30]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./sentiment_model_results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',  # Save checkpoints at the end of each epoch
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


In [31]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [32]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3398,0.440015,0.875
2,0.3059,0.360267,0.897333
3,0.2461,0.3704,0.912
4,0.2713,0.374629,0.912333
5,0.005,0.402432,0.924
6,0.1449,0.473031,0.914667
7,0.1421,0.500114,0.918667
8,0.0334,0.545015,0.919333
9,0.0003,0.557381,0.925333
10,0.0002,0.582275,0.921


TrainOutput(global_step=11250, training_loss=0.18508976663371754, metrics={'train_runtime': 4079.9096, 'train_samples_per_second': 22.059, 'train_steps_per_second': 2.757, 'total_flos': 5920051898880000.0, 'train_loss': 0.18508976663371754, 'epoch': 10.0})