In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score

In [None]:
# Load the YouTube comments CSV; later cells read its "comment" and "sentiment" columns.
df = pd.read_csv('/content/youtube_comments.csv')
# Bare expression so the notebook renders the frame as this cell's output.
df

In [None]:
# Tokenizer for the same "bert-base-uncased" checkpoint that the model cell loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# 70/15/15 split: hold out 30% of the rows, then halve that hold-out into
# validation and test sets. The fixed random_state keeps the partition repeatable.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for sequence classification.

    Args:
        comments: pandas Series of raw comment strings (accessed positionally via ``.iloc``).
        sentiments: pandas Series of class labels convertible to ``torch.long``,
            positionally aligned with ``comments``.
        tokenizer: Hugging Face tokenizer; it is called directly on each comment.
        max_length: Token length every encoded comment is padded or truncated to.
    """

    def __init__(self, comments, sentiments, tokenizer, max_length):
        self.comments = comments
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Encode the comment at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of length
            ``max_length`` and a scalar ``label`` tensor of dtype ``torch.long``.
        """
        # Positional lookup: after train_test_split the Series index is no longer 0..n-1.
        comment = self.comments.iloc[idx]
        sentiment = self.sentiments.iloc[idx]

        # Call the tokenizer directly; `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            comment,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # The tokenizer returns a leading batch dimension of 1; flatten drops it
            # so the DataLoader can stack items into (batch, max_length).
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(sentiment, dtype=torch.long),
        }

In [None]:
# Token length each comment is padded/truncated to (passed to CustomDataset).
max_length = 128

In [None]:
# Wrap each split in a CustomDataset; all three share the tokenizer and max_length.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split["comment"], split["sentiment"], tokenizer, max_length)
    for split in (train_df, val_df, test_df)
)

In [None]:
# Number of examples per batch, shared by the train, validation and test DataLoaders.
batch_size = 32

In [None]:
# Only the training data is reshuffled each epoch.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
# Validation and test batches keep a fixed order.
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)

In [None]:
# Seed torch's global RNG before the model is built so that whatever torch
# randomness follows (newly initialised classifier weights, DataLoader shuffling
# with the default generator) is repeatable across Restart & Run All. 42 matches
# the random_state used for the data split. GPU kernels may still introduce
# nondeterminism.
torch.manual_seed(42)
# Pretrained BERT with a sequence-classification head sized for 3 labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

In [None]:
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model before building the optimizer: torch.optim's guidance is to
# construct the optimizer once the parameters live on their final device.
# Assigning the result (nn.Module.to returns the module itself) also keeps the
# notebook from echoing the full module repr as this cell's output.
model = model.to(device)

# Adam over all model parameters, learning rate 2e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# Cross-entropy on the raw logits; the training loop calls this explicitly.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
def evaluate(model, data_loader, device):
    """Return the model's classification accuracy over every batch in ``data_loader``.

    Args:
        model: Module whose output exposes ``.logits``; it is switched to eval
            mode and left that way when the function returns.
        data_loader: Iterable of batches with "input_ids", "attention_mask"
            and "label" tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The value of ``accuracy_score(true labels, predicted labels)``.
    """
    model.eval()
    predicted_labels, true_labels = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in data_loader:
            targets = batch["label"].to(device)
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits

            # Index of the largest logit along the class dimension is the predicted label.
            predicted_labels += logits.max(dim=1).indices.cpu().tolist()
            true_labels += targets.cpu().tolist()

    return accuracy_score(true_labels, predicted_labels)

In [None]:
# Per-epoch history; the training loop appends one value to each list per epoch.
train_losses, train_accuracies, val_accuracies = [], [], []
# Number of full passes over the training data.
num_epochs = 9

In [None]:
# Fine-tune for `num_epochs` passes over the training set, recording the mean
# batch loss and the train/validation accuracy after every epoch.
for epoch in range(num_epochs):
    # evaluate() switches the model to eval mode, so re-enable training mode each epoch.
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        # Labels are deliberately not passed to the model: the loss is computed by
        # `criterion` below, so having the model compute its own (discarded) loss
        # as well would be redundant work on every step.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Accuracy is measured in eval mode over the full training and validation sets.
    train_accuracy = evaluate(model, train_loader, device)
    train_accuracies.append(train_accuracy)

    val_accuracy = evaluate(model, val_loader, device)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    print("-" * 50)

In [None]:
# Final evaluation on the test split, which the training loop never reads.
test_accuracy = evaluate(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")