In [1]:
!pip install -q transformers
!pip install -q datasets
!pip install -q scikit-learn
!pip install -q pandas
!pip install -q numpy
!pip install -q huggingface_hub[hf_xet]
!pip install -q accelerate

In [3]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

^C


In [3]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split 
from torch.utils.data import Dataset, DataLoader

In [25]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    raise ValueError("Running on CPU is not recommended.")

ValueError: Running on CPU is not recommended.

# Constants Setup

In [5]:
TRAIN_PATH = 'Restaurants_Train_v2.xml'
MAX_LENGTH = 256
PRETRAINED_MODEL = 'bert-base-uncased'
MODEL_NAME = "Restaurant-v1"
BATCH_SIZE = 21
EPOCHS = 20

# Load data

In [7]:
tree = ET.parse(TRAIN_PATH)
root = tree.getroot()

data = []

for sentence in root.iter("sentence"):
    text = sentence.find("text").text
    aspect_categories = sentence.find("aspectCategories")
    
    if aspect_categories is not None:
        for category in aspect_categories.findall("aspectCategory"):
            cat = category.attrib["category"]
            polarity = category.attrib["polarity"]
            if polarity != "conflict":
                data.append({
                    "sentence": text,
                    "category": cat,
                    "label": polarity
                })

df = pd.DataFrame(data)
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['label'].map(label_map)
df.to_csv("absa_dataset.csv", index=False)
df.head()

Unnamed: 0,sentence,category,label
0,But the staff was so horrible to us.,service,0
1,"To be completely fair, the only redeeming fact...",food,2
2,"To be completely fair, the only redeeming fact...",anecdotes/miscellaneous,0
3,"The food is uniformly exceptional, with a very...",food,2
4,Where Gabriela personaly greets you and recomm...,service,2


# Tokenize data

In [9]:
sentences = df['sentence'].tolist()
categories = df['category'].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(
    sentences, categories, test_size=0.2, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"Training size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

Training size: 2814
Validation size: 352
Test size: 352


In [10]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')

In [11]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train + y_val + y_test)

y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

train_dataset = Dataset(train_encodings, y_train_encoded)
val_dataset = Dataset(val_encodings, y_val_encoded)
test_dataset = Dataset(test_encodings, y_test_encoded)

In [13]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
print(torch.cuda.is_available())

False


# Train the model

In [23]:
print(torch.cuda.is_available())

False


In [16]:
device = torch.device("cuda")

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=3,
)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AssertionError: Torch not compiled with CUDA enabled

In [None]:
loss_fn = torch.nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )

        loss = outputs.loss
        running_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} - Loss: {running_loss / len(train_loader)}")