In [94]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split
from torch import nn
from transformers import BertTokenizer

In [95]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [96]:
class ReviewsDataset(Dataset):
    def __init__(self, csv, tokenizer, max_length=256):
        data = pd.read_csv(csv)
        data["sentiment"] = data["sentiment"].replace("positive", 1).replace("negative", 0)
        self.x = data["review"].values
        self.y = data["sentiment"].values
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        sentence = self.x[index]
        tokens = self.tokenizer.encode_plus(sentence, add_special_tokens=True,
                                            max_length=self.max_length,
                                            padding='max_length', truncation=True,
                                            return_tensors="pt")
        input_ids = tokens['input_ids'].squeeze()
        y_logs = torch.tensor(self.y[index])
        return input_ids, y_logs

In [97]:
class CustomLSTMModel(nn.Module):
    def __init__(self, vocab_size, num_classes, embed_size=100, hidden_size=128):
        super(CustomLSTMModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        # self.h0 = torch.zeros(1,1,hidden_size)
        # self.c0 = torch.zeros(1,1,hidden_size)
        # self.last_layer = nn.Sequential(
            # nn.ReLU(),
            # nn.Linear(embed_size, 2)
        # )
    
    def forward(self, x):
        embedded = self.embed(x)
        lstm_out, _ = self.lstm(embedded)
        # out_encod, (hn, cn) = self.encoder(emb, (h0, c0))
        logits = self.fc(lstm_out[:, -1])
        return logits

Carregar tokenizer i dataset

In [98]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = ReviewsDataset(csv='data/IMDB_Dataset.csv', tokenizer=tokenizer)

  data["sentiment"] = data["sentiment"].replace("positive", 1).replace("negative", 0)


Dividir en train i test

In [99]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

Crear DataLoaders

In [100]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Inicialitzar model, funció de pèrdua i optimizer

In [101]:
vocab_size = tokenizer.vocab_size
model = CustomLSTMModel(vocab_size, 2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Bucle d'entrenament

In [None]:
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    
    for batch_index, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(output, 1)
        correct += (predicted == target).sum().item()
        total += target.size(0)
        
        accuracy = 100 * correct / total
    print(f'Època {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Accuraccy: {accuracy:.2f}%')

Època 1/5, Loss: 0.6930, Accuraccy: 51.06%
Època 2/5, Loss: 0.6809, Accuraccy: 54.40%
Època 3/5, Loss: 0.3963, Accuraccy: 82.74%
Època 4/5, Loss: 0.2556, Accuraccy: 89.98%
Època 5/5, Loss: 0.1912, Accuraccy: 93.08%


In [103]:

torch.save(model.state_dict(), "model.pth")