# Sentiment Analysis

In [None]:
import pandas as pd

# Base location of the dataset's CSV files on the Hugging Face Hub.
_DATASET_ROOT = 'hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset'

splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}


def _load_split(filename):
    """Read one split's CSV, drop the `id` column and any rows containing NaN.

    The index is rebuilt afterwards so the returned frame is numbered 0..n-1
    even when rows were removed.
    """
    frame = pd.read_csv(f'{_DATASET_ROOT}/{filename}')
    return frame.drop(columns='id').dropna().reset_index(drop=True)


# NOTE(review): the *validation* file is the one held out as `test_df`, while the
# train and test files are merged into the training data. This keeps the original
# selection unchanged -- confirm that it is what was intended.
dataframes = [
    _load_split(filename)
    for split, filename in splits.items()
    if split != 'validation'
]
test_df = _load_split(splits['validation'])

# ignore_index avoids repeating each source frame's row labels in the merged frame.
train_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Preview the first rows of each frame, then report how many rows each one holds.
for frame in (train_df, test_df):
    print(frame.head())

for frame in (train_df, test_df):
    print(len(frame))

## Train the Model

In [None]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer

# Initialize the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating to at most 128 tokens.

    No padding is applied here: the DataCollatorWithPadding handed to the Trainer
    pads each batch to its own longest sequence, so padding every example to the
    full 128 tokens up front would only add wasted computation.
    """
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Convert the pandas frames to Hugging Face datasets.
# preserve_index=False: train_df comes out of concat + dropna with a non-default
# index, which from_pandas would otherwise store as an extra `__index_level_0__`
# column on the training set only.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenize both datasets batch-wise.
train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Sanity check: print the tokenized training dataset object.
print(train_tokenized)

In [None]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    # Set explicitly: transformers releases that predate the optional default
    # require this argument. "trainer_output" matches the default used by
    # releases where it is optional -- NOTE(review): confirm against the
    # installed version.
    output_dir="trainer_output",
    eval_strategy="epoch",           # Evaluate at the end of each epoch
    learning_rate=5e-5,              # Learning rate passed to the optimizer
    per_device_train_batch_size=32,  # Training batch size per device
    per_device_eval_batch_size=32,   # Evaluation batch size per device
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Regularization
    logging_steps=100,               # Log every 100 steps
    fp16=True,                       # Enable mixed precision for faster training
)

In [None]:
!pip install evaluate

In [None]:
from transformers import Trainer
from evaluate import load

# F1 is the evaluation metric used below.
metric = load("f1")


def compute_metrics(eval_pred):
    """Return the weighted F1 of the arg-max class predictions.

    `eval_pred` unpacks into (logits, labels); the predicted class for each
    row is the position of the largest logit along the last axis.
    """
    scores, gold = eval_pred
    return metric.compute(
        predictions=scores.argmax(axis=-1),
        references=gold,
        average='weighted',
    )

In [None]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head (num_labels=3).
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# Collator built from the tokenizer; it is handed to the Trainer for batching.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer actually accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,                        # Pre-trained BERT model
    args=training_args,                 # Training arguments
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,        # Evaluated at the end of each epoch
    data_collator=data_collator,        # Pads each batch
    compute_metrics=compute_metrics,    # Custom metric
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()