In [2]:
import torch

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled reviews and derive the class ids used for training.
# Ratings are shifted down by one (rating 1 -> label 0, rating 2 -> label 1, ...).
data = pd.read_csv("../data/train_data.csv")
data["labels"] = data["rating"].sub(1)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_data, val_data = train_test_split(data, random_state=42, test_size=0.1)

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_data(data, max_len=512):
    """Tokenize the ``"review"`` column of ``data`` with the module-level ``tokenizer``.

    Args:
        data: Table-like object whose ``"review"`` column supports ``.tolist()``.
        max_len: Value forwarded as ``max_length``; truncation is enabled.

    Returns:
        Whatever ``tokenizer`` returns when asked for PyTorch tensors
        (``return_tensors="pt"``), with padding enabled.
    """
    reviews = data["review"].tolist()
    return tokenizer(
        reviews,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )


# Tokenize the training and validation splits with identical settings.
train_encodings, val_encodings = (tokenize_data(split) for split in (train_data, val_data))

In [None]:
from torch.utils.data import DataLoader, TensorDataset


def create_data_loader(encodings, labels, batch_size=16, shuffle=True):
    """Wrap tokenized inputs and their labels in a ``DataLoader``.

    Args:
        encodings: Mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        labels: Class ids, one per row of ``encodings``. May be a pandas
            Series, a NumPy array, a list, or anything else ``torch.tensor``
            accepts.
        batch_size: Number of samples per batch (default 16).
        shuffle: Whether to reshuffle the samples on every pass (default True).
            Pass ``False`` when the batch order must match the input order.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    # A pandas Series must be converted positionally. Handing it to
    # ``torch.tensor`` directly makes torch probe ``labels[0]``, which is a
    # label-based lookup and fails with ``KeyError`` when the index has been
    # shuffled (for example by ``train_test_split``).
    label_values = labels.to_numpy() if hasattr(labels, "to_numpy") else labels
    # Class-index targets for ``CrossEntropyLoss`` have to be 64-bit integers.
    label_tensor = torch.tensor(label_values, dtype=torch.long)

    dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], label_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Build one loader per split from the tokenized inputs and the matching label column.
train_loader = create_data_loader(encodings=train_encodings, labels=train_data["labels"])
val_loader = create_data_loader(encodings=val_encodings, labels=val_data["labels"])

In [None]:
from transformers import BertForSequenceClassification
import torch

# Size the classification head from the class ids rather than from the number
# of distinct ratings: labels are ``rating - 1``, so the head must cover
# 0..max(label). Counting distinct ratings is too small whenever some rating
# value is missing from the training file, which would put the largest label
# out of range for the loss.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=int(data["labels"].max()) + 1,
)

# Select the device (GPU when available, otherwise CPU) and move the model to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Training and evaluation helpers
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask=...)``.
            It may return the logits tensor directly or an object exposing
            them as ``.logits``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        optimizer: Optimizer that owns the model parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The sample-weighted mean loss over the pass as a ``float``, or ``0.0``
        when ``data_loader`` yields no batches.
    """
    model.train()  # enable training-time behaviour such as dropout
    loss_sum = 0.0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        # Accept both a plain logits tensor and an output object with ``.logits``.
        logits = getattr(outputs, "logits", outputs)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        sample_count += batch_size

    return loss_sum / sample_count if sample_count else 0.0


def eval_model(model, data_loader, device):
    """Measure classification accuracy of ``model`` on ``data_loader``.

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask=...)``.
            It may return the logits tensor directly or an object exposing
            them as ``.logits``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The fraction of samples whose highest-scoring class equals the label,
        as a ``float`` in ``[0, 1]``; ``0.0`` when ``data_loader`` is empty.
    """
    model.eval()  # switch off dropout and other training-only behaviour
    correct = 0
    seen = 0

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Accept both a plain logits tensor and an output object with ``.logits``.
            logits = getattr(outputs, "logits", outputs)

            correct += (logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    return correct / seen if seen else 0.0


# Training procedure: cross-entropy objective optimised with Adam.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5)

# Three passes over the training set, each followed by a validation pass.
for epoch in range(3):
    train_epoch(model=model, data_loader=train_loader, loss_fn=loss_fn, optimizer=optimizer, device=device)
    eval_model(model=model, data_loader=val_loader, device=device)

In [None]:
test_data = pd.read_csv("../data/test_data.csv")

test_encodings = tokenize_data(test_data)
# Build the inference loader directly from the tokenized inputs:
# - the "labels" column is only ever derived for the training frame, so the
#   freshly read test frame is not expected to have one;
# - the batches must stay in file order so that the i-th prediction belongs to
#   the i-th test row, hence ``shuffle=False``.
test_loader = DataLoader(
    TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"]),
    batch_size=16,
    shuffle=False,
)


In [None]:
# Run inference over the test batches and collect one predicted rating per row.
predictions = []
model.eval()  # disable dropout so the predictions are deterministic
with torch.no_grad():  # gradients are not needed for inference
    for batch in test_loader:
        outputs = model(batch[0].to(device), attention_mask=batch[1].to(device))
        # The model returns an output object rather than a tensor; the class
        # scores are in ``.logits`` (fall back to the value itself if it is
        # already a tensor).
        logits = getattr(outputs, "logits", outputs)
        # Training labels were built as ``rating - 1``, so add 1 to map the
        # predicted class id back onto the rating scale.
        predictions.extend((logits.argmax(dim=1) + 1).tolist())

pd.DataFrame({"Predicted Rating": predictions
