In [None]:
## BERT

In [None]:
!pip install transformers torch

In [None]:
import pandas as pd

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
import torch

In [None]:
from torch.utils.data import DataLoader, TensorDataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import classification_report, roc_curve, auc, roc_auc_score

In [None]:
from sklearn.preprocessing import LabelEncoder, label_binarize

In [None]:
import matplotlib.pyplot as plt

In [None]:
from itertools import cycle

In [None]:
import numpy as np

In [None]:
# Encode the string sentiment classes ('positive', 'negative', 'neutral') as integer ids.
# The fitted encoder stays bound to `label_encoder`; inspect `label_encoder.classes_`
# to see which id was assigned to which class
# (presumably negative=0, neutral=1, positive=2 -- confirm before relying on it).
label_encoder = LabelEncoder()
encoded_classes = label_encoder.fit_transform(df['class'])
df['label'] = encoded_classes

In [None]:
# Tokenization and data preparation (using cleaned text).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which takes the same keyword arguments for a list of strings.
encoded_data = tokenizer(
    df['cleaned_text'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',  # pad every sequence up to `max_length`
    max_length=128,  # Adjust if your sentences are longer
    truncation=True,  # cut sequences that exceed `max_length`
    return_tensors='pt',  # return PyTorch tensors
)

input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

# CrossEntropyLoss expects integer class indices, so pin the dtype explicitly
# instead of relying on inference from the Python list.
labels = torch.tensor(df['label'].tolist(), dtype=torch.long)

In [None]:
# Split into training and testing sets.
# All three arrays go through ONE call so every row's ids, attention mask and
# label are guaranteed to land on the same side of the split. (The previous
# two-call version only stayed aligned because both calls repeated the same
# seed.) The returned order is train/test for each input array in turn.
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=35476648, test_size=0.2,
)

In [None]:
# Create the datasets: each item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Create the DataLoaders: only the training loader shuffles its batches.
train_dataloader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=32)

In [None]:
# Enable GPU: use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pre-trained model with a 3-label classification output
# (positive, negative, neutral).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

# Move the model's parameters onto the selected device.
model.to(device)

In [None]:
# Fine-tune (optimizer, loss function, training loop).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(3):  # Adjust number of epochs
    model.train()  # switch to training mode at the start of every epoch

    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three to `device`.
        batch = tuple(tensor.to(device) for tensor in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; the loss is computed from the raw logits below.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            token_type_ids=None,
        )
        loss = loss_fn(outputs.logits, b_labels)

        # Clear gradients left from the previous step, then backpropagate and update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation: run the test set through the model and collect, per example,
# the raw logits and the true label id.
model.eval()
logits_list, true_labels = [], []

with torch.no_grad():  # gradients are not needed while evaluating
    for batch in test_dataloader:
        batch = tuple(tensor.to(device) for tensor in batch)
        b_input_ids, b_input_mask, b_labels = batch

        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            token_type_ids=None,
        )

        # Bring results back to host memory as NumPy arrays.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # extend() adds one entry per example (one logits row / one label id).
        logits_list.extend(logits)
        true_labels.extend(label_ids)

In [None]:
# Convert logits to probabilities for ROC AUC.
softmax = torch.nn.Softmax(dim=1)

# `logits_list` is a Python list with one NumPy row per example. Passing that
# list straight to torch.tensor() is very slow and triggers a PyTorch
# UserWarning, so stack it into a single 2-D array first and wrap that array.
logits_array = np.asarray(logits_list)
probs = softmax(torch.from_numpy(logits_array)).numpy()

In [None]:
# Binarize the labels for ROC AUC calculation: one indicator column per class id.
y_test_binarized = label_binarize(true_labels, classes=[0, 1, 2])
n_classes = y_test_binarized.shape[1]

# Per-class ROC curve (class i vs. the rest) and the area under it,
# keyed by class index.
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    class_truth, class_scores = y_test_binarized[:, i], probs[:, i]
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_scores)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Calculate macro-average ROC AUC, stored alongside the per-class values.
roc_auc["macro"] = roc_auc_score(
    y_test_binarized, probs, multi_class="ovr", average="macro"
)

In [None]:
# Plot ROC AUC curves: one coloured line per class on a single figure.
plt.figure()
colors = cycle(['blue', 'green', 'red'])
for i, color in zip(range(n_classes), colors):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis ranges (the y-axis gets a little headroom above 1.0), labels and title.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC')

plt.legend(loc="lower right")
plt.show()

In [None]:
# Print out all AUC scores

In [None]:
print("AUC for each class:")

In [None]:
for i in range(n_classes):