In [10]:
import torch 
from torch import nn
import numpy as np
# "TWOJ_KOD" is Polish for "YOUR_CODE". Presumably a placeholder for parts of
# the exercise that still have to be filled in; it is not referenced by any
# of the visible cells (TODO confirm it can be dropped).
TWOJ_KOD = None 
# TODO: Design a neural network that consumes vectors of length 1024 (embeddings) and returns a binary answer (sigmoid).
# The hidden layers should have 150 and 15 neurons respectively, with nn.ReLU as the activation function.
class ReviewClassifier(nn.Module):
    """Multilayer perceptron that classifies reviews from their embeddings.

    Layout: 1024 -> 150 -> 15 -> 1. Each hidden linear layer is followed by
    a ReLU and the single output unit is passed through a sigmoid, so the
    result can be read as a value between 0 and 1.
    """

    def __init__(self):
        super().__init__()
        # Input width followed by the widths of the hidden layers.
        widths = (1024, 150, 15)
        layers = []
        # Modules are created in forward order, so their positions inside
        # the Sequential (and thus the state-dict keys `mlp.0`, `mlp.2`,
        # `mlp.4`) match the explicit layer-by-layer definition.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Sigmoid())
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Feed `x` through the whole layer stack and return its output."""
        scores = self.mlp(x)
        return scores

In [1]:
import pandas as pd 
# Load the previously saved data together with its embeddings.
EMBEDDINGS_PATH = "amazon_with_embeddings.bin"
df = pd.read_feather(EMBEDDINGS_PATH)

In [2]:
# List the columns available in the loaded frame.
df.columns

Index(['label', 'title', 'content', 'embeddings'], dtype='object')

In [3]:
# Look at the embedding stored under index label 0.
df["embeddings"][0]

array([ 0.04330444, -0.00665665,  0.01806641, ..., -0.04470825,
       -0.02055359,  0.01741028])

In [4]:
# Confirm the length of a single embedding vector.
df["embeddings"][0].shape

(1024,)

In [8]:
# Split sizes requested by the exercise: 25k train, 5k validation, the
# remaining rows form the test set.
TRAIN_SIZE = 25_000
VAL_SIZE = 5_000
SPLIT_SEED = 42

# Shuffle all rows once. The fixed seed makes the split identical on every
# fresh "Restart & Run All"; re-running only this cell shuffles the already
# shuffled frame again and therefore yields a different split.
df = df.sample(frac=1.0, random_state=SPLIT_SEED)

# .iloc makes it explicit that the split is positional (the index labels are
# permuted after the shuffle).
train_df = df.iloc[:TRAIN_SIZE]
val_df = df.iloc[TRAIN_SIZE:TRAIN_SIZE + VAL_SIZE]
test_df = df.iloc[TRAIN_SIZE + VAL_SIZE:]

# The three parts must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

In [17]:
from torchmetrics import Accuracy
from tqdm.notebook import tqdm 
import numpy as np 
from torch.utils.tensorboard import SummaryWriter

# Train ReviewClassifier on train_df in mini-batches and, after every epoch,
# log the accuracy on val_df to TensorBoard under 'Accuracy/eval'.
writer = SummaryWriter()
model = ReviewClassifier()
# Binary cross-entropy computed on probabilities, i.e. on the sigmoid output
# of the model.
bce_loss = torch.nn.BCELoss()

batch_size = 128
# The exercise asks for lr=0.0005; without the argument Adam silently falls
# back to its default learning rate.
adam_optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
nr_of_epochs = 1
losses = []  # training loss of every mini-batch, in processing order
for epoch in tqdm(range(nr_of_epochs)):
    # The evaluation at the end of an epoch puts the model into eval mode,
    # so training mode has to be restored before each new epoch.
    model.train()
    # Reshuffle the training rows so every epoch sees different batches.
    train_df = train_df.sample(frac=1.0)
    for batch_start in range(0, len(train_df), batch_size):
        # Positional slice; the last batch may be shorter than batch_size.
        batch_rows = train_df.iloc[batch_start:batch_start + batch_size]
        tensor_batch_x = torch.Tensor(np.array(batch_rows.embeddings.tolist()))
        # BCELoss needs targets shaped like the predictions: (batch, 1).
        tensor_batch_y = torch.Tensor(
            np.array(batch_rows.label.tolist())
        ).reshape(-1, 1)

        pred = model(tensor_batch_x)
        loss = bce_loss(pred, tensor_batch_y)

        # Backpropagation
        adam_optimizer.zero_grad()  # drop gradients left from the previous step
        loss.backward()  # compute gradients of the model weights
        adam_optimizer.step()  # Adam computes the new network parameters

        losses.append(loss.item())

    model.eval()
    metric = Accuracy(task='BINARY')
    # No gradients are needed for validation; evaluating whole batches avoids
    # one forward pass per row.
    with torch.no_grad():
        for batch_start in tqdm(range(0, len(val_df), batch_size), desc='evaluating'):
            val_rows = val_df.iloc[batch_start:batch_start + batch_size]
            val_x = torch.Tensor(np.array(val_rows.embeddings.tolist()))
            val_y = torch.Tensor(np.array(val_rows.label.tolist())).reshape(-1, 1)
            metric.update(model(val_x), val_y)
    metric_value = metric.compute()
    writer.add_scalar('Accuracy/eval', metric_value, epoch)

# Make sure the logged scalars are written to disk.
writer.flush()

        

  0%|          | 0/1 [00:00<?, ?it/s]

evaluating: 0it [00:00, ?it/s]

In [19]:
# Persist only the learned parameters (the state dict) of the trained model.
trained_weights = model.state_dict()
torch.save(trained_weights, "review_classifier.bin")

In [21]:
# reset_index() moves the current (shuffled) index into a regular column and
# restores a default index before the held-out rows are written to disk.
test_export = test_df.reset_index()
test_export.to_feather("test_df.bin")