Zachęcam do zabawy z kodem: <br>
- dodajcie regularyzację,
- zapiszcie najlepszy model do pliku,
- eksperymentujcie z hiperparametrami (learning rate, hidden_dim, embed_dim itd.) <br>
Powodzenia!

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # run on the GPU when CUDA is available (supported e.g. on Google Colab), otherwise on the CPU

In [3]:
def create_vocab(sentences: list[str]) -> dict[str, int]:
    """Build a token-to-index vocabulary from lowercased, whitespace-split text.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``. Every other
    token gets the next free index, in order of first appearance.

    Args:
        sentences: Iterable of raw text strings.

    Returns:
        Mapping from token to its integer index.
    """
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for sentence in sentences:
        for word in sentence.lower().split():
            # Indices are dense, so the current size is always the next free
            # index; setdefault leaves already-known tokens untouched.
            vocab.setdefault(word, len(vocab))
    return vocab


In [4]:

def tokenize(sentences: list[str], vocab: dict[str, int], seq_len: int) -> list[list[int]]:
    """Encode sentences as fixed-length lists of vocabulary indices.

    Each sentence is lowercased and split on whitespace. Sentences longer than
    ``seq_len`` tokens are truncated; shorter ones are right-padded with 0.
    Tokens missing from ``vocab`` are encoded as 1.

    Args:
        sentences: Iterable of raw text strings.
        vocab: Token-to-index mapping.
        seq_len: Target number of indices per sentence.

    Returns:
        One list of indices per input sentence.
    """
    encoded = []
    for sentence in sentences:
        words = sentence.lower().split()[:seq_len]
        row = [vocab.get(word, 1) for word in words]
        # Multiplying a list by a non-positive count yields [], so rows that
        # are already full length are left unchanged.
        row += [0] * (seq_len - len(row))
        encoded.append(row)
    return encoded

In [5]:

def create_dataloader(all_indices: list[list[int]], labels, batch_size: int,
        shuffle: bool) -> DataLoader:
    """Wrap encoded sentences and their labels in a ``DataLoader``.

    Args:
        all_indices: Equal-length lists of token indices, one per sample.
        labels: Per-sample labels. Objects exposing ``.values`` (e.g. a pandas
            Series) are unwrapped first; anything else is passed to
            ``torch.tensor`` as is.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.

    Returns:
        A ``DataLoader`` yielding ``(int64 indices, float32 labels)`` batches.
    """
    x_tensor = torch.tensor(data=all_indices, dtype=torch.long)
    labels = labels.values if hasattr(labels, 'values') else labels
    y_tensor = torch.tensor(data=labels, dtype=torch.float32)
    new_dataset = TensorDataset(x_tensor, y_tensor)
    # Pinned host memory speeds up host-to-GPU copies, so it is only enabled
    # when the module-level `device` is a CUDA device.
    pin = (device.type == "cuda")
    # The loader is built exactly once: a second construction without
    # pin_memory used to overwrite this one and silently drop the setting.
    return DataLoader(new_dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=pin)

In [6]:

def download_data(num_samples: int) -> pd.DataFrame:
    """Load the IMDB train split and return a shuffled subset of it.

    The rows are shuffled with a fixed seed (42), re-indexed from 0, and then
    cut to the first ``num_samples`` rows.

    Args:
        num_samples: Maximum number of rows to keep.

    Returns:
        DataFrame built from the ``train`` split of ``stanfordnlp/imdb``.
    """
    print("Pobieranie danych...")
    dataset = load_dataset("stanfordnlp/imdb")
    shuffled = pd.DataFrame(dataset['train']).sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True).iloc[:num_samples]

In [7]:
class SentimentClassifierNN(torch.nn.Module):
    """Averaged-embedding classifier: embedding -> mean -> linear -> ReLU -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Width of the hidden linear layer.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int):
        super().__init__()
        nn = torch.nn
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()  # max(x, 0)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return a single score per row; no activation is applied to the output.

        Assumes ``x`` holds token indices shaped (batch, seq_len) -- TODO confirm
        against callers. The mean is taken over dim 1 across every position,
        padded ones included.
        """
        pooled = self.embedding(x).mean(dim=1)
        hidden = self.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [9]:
def evaluate(model, eval_loader, criterion):
    """Compute the average batch loss and the accuracy of ``model`` on ``eval_loader``.

    The model is switched to eval mode and gradients are disabled. Model
    outputs are passed through a sigmoid and thresholded at 0.5 to obtain the
    predicted class.

    Args:
        model: Module called on each input batch.
        eval_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels.unsqueeze(1))``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the sum of per-batch losses divided by
        the number of batches, and the fraction of correctly predicted samples.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for inputs, labels in eval_loader:
            inputs = inputs.to(device, non_blocking=True)
            # Labels get a trailing dimension before being compared with the
            # model output.
            targets = labels.to(device, non_blocking=True).unsqueeze(1)

            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()

            hard_preds = (torch.sigmoid(logits) > 0.5).float()
            n_correct += (hard_preds == targets).sum().item()
            n_seen += targets.size(0)

    return loss_sum / len(eval_loader), n_correct / n_seen

In [None]:

def train(train_loader, vocab_size, embed_dim, hidden_dim, num_of_epochs, eval_loader, lr=10e-3):
    """Train a ``SentimentClassifierNN`` and report validation metrics every epoch.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        vocab_size: Passed to the model as the embedding table size.
        embed_dim: Passed to the model as the embedding dimension.
        hidden_dim: Passed to the model as the hidden layer width.
        num_of_epochs: Number of full passes over ``train_loader``.
        eval_loader: Validation batches, evaluated after every epoch.
        lr: Learning rate for the Adam optimizer. The default, ``10e-3``
            (i.e. 0.01), is the value that was previously hard-coded.

    Returns:
        The model as it is after the final epoch. The best validation loss is
        only tracked for the closing report; no checkpoint is kept or restored.
    """
    model = SentimentClassifierNN(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim)
    model = model.to(device)  # move the model to the GPU/CPU
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

    best_loss = float('inf')

    print("Rozpoczynam trening...")
    for epoch in range(num_of_epochs):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled at the start of every epoch.
        model.train()
        train_loss = 0.0

        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device, non_blocking=True)
            batch_y = batch_y.to(device, non_blocking=True)

            optimizer.zero_grad()
            predictions = model(batch_x)
            loss = criterion(predictions, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        # Keyword arguments are used on purpose: the code in the original
        # course material had the arguments in the wrong order here.
        val_loss, val_acc = evaluate(model=model, eval_loader=eval_loader, criterion=criterion)

        print(f"Epoch {epoch}: Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc*100:.2f}%")

        if val_loss < best_loss:
            best_loss = val_loss

    print(f"DONE. Best Val loss was {best_loss:.4f}")

    return model

In [10]:
# End-to-end run: download data, build the vocabulary, train, and predict on
# a few hand-written sentences.
df = download_data(10000)
print(df.head())

print("Budowanie słownika...")
# NOTE(review): the vocabulary is built from all rows, before the
# train/validation split below, so validation tokens are part of it.
vocab = create_vocab(sentences=df['text'])
list_of_indices = tokenize(sentences=df['text'], vocab=vocab, seq_len=120)

labels = df['label']
# 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(list_of_indices, labels, test_size=0.3, random_state=42)

# Only the training loader is shuffled.
train_loader = create_dataloader(X_train, y_train, batch_size=32, shuffle=True)
eval_loader = create_dataloader(X_val, y_val,  batch_size=32, shuffle=False)

model = train(train_loader, vocab_size=len(vocab), embed_dim=16, hidden_dim=32, num_of_epochs=5, eval_loader=eval_loader)

# Show predictions for a few example sentences
test_sentence = ["This movie was fantastic! I really loved it.", "Even though the plot was boring, the acting was superb.",
    "At first I thought I liked but, but overall a pretty boring movie."]
# Encode with the same vocabulary and sequence length as the training data.
test_indices = tokenize(test_sentence, vocab, seq_len=120)
test_tensor = torch.tensor(data=test_indices, dtype=torch.long)

test_tensor = test_tensor.to(device)
model.eval()
with torch.no_grad():
    model_output = model(test_tensor)
    # Sigmoid of the model output, thresholded at 0.5, gives the 0/1 prediction.
    probs = torch.sigmoid(model_output)
    predictions = (probs > 0.5).float()
    print("Predictions:", predictions.squeeze().tolist())

Pobieranie danych...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

                                                text  label
0  Dumb is as dumb does, in this thoroughly unint...      0
1  I dug out from my garage some old musicals and...      1
2  After watching this movie I was honestly disap...      0
3  This movie was nominated for best picture but ...      1
4  Just like Al Gore shook us up with his painful...      1
Budowanie słownika...
Rozpoczynam trening...
Epoch 0: Train Loss: 0.5794 | Val Loss: 0.4592 | Val Acc: 78.53%
Epoch 1: Train Loss: 0.2163 | Val Loss: 0.5183 | Val Acc: 78.50%
Epoch 2: Train Loss: 0.0355 | Val Loss: 0.6679 | Val Acc: 79.10%
Epoch 3: Train Loss: 0.0041 | Val Loss: 0.7971 | Val Acc: 79.07%
Epoch 4: Train Loss: 0.0008 | Val Loss: 0.8575 | Val Acc: 78.80%
DONE. Best Val loss was 0.4592
Predictions: [1.0, 0.0, 1.0]
