In [33]:
import sys
sys.path.append("..")
from ArticleDataset import ArticleDataset
from MLPClassifier import MLPClassifier
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn


In [None]:
# Location of the article CSV, given relative to the notebook's working directory.
csv_file = "../data/articles.csv"

# Build the dataset from the CSV. min_freq=5 is forwarded to ArticleDataset;
# presumably it is the minimum word count for vocabulary inclusion — confirm in ArticleDataset.
dataset = ArticleDataset(csv_file, min_freq=5)

In [35]:
# Display the total number of samples exposed by the dataset.
len(dataset)

1239914

In [36]:
# Sanity-check one sample end to end: raw title -> token ids -> words, and
# category -> class index.
n = 5
vocab = dataset.vocab
row = dataset.data.iloc[n]   # raw record for article n, looked up once
sample = dataset[n]          # item returned by the dataset for article n, fetched once

print(len(vocab))
print(f'Title of article {n}: {row.title.strip()}')

# Only show the beginning of the vocabulary's text form: printing all of it
# would flood the cell output with tens of thousands of entries.
max_vocab_chars = 200
vocab_text = str(vocab)
if len(vocab_text) > max_vocab_chars:
    vocab_text = vocab_text[:max_vocab_chars] + ' ...'
print(f'vocab : {vocab_text}')

print(f'data for title {n}: {sample}')
print(f'category of article {n}: {row.category}')
print(f'index of this category: {dataset.ctoi[row.category]}')
print("indexes to title:")
# Map every token id of the sample back to its word to verify the encoding.
for idx in sample[0]:
    print(dataset.itow[idx.item()])

# we can see that everything works fine

38962
Title of article 5: Hierarchical quantum embedding by machine learning for large molecular
  assemblies
data for title 5: (tensor([46, 41, 47, 48, 37, 38, 49, 50, 51, 52]), tensor(1))
category of article 5: physics.chem-ph
index of this category: 1
indexes to title:
hierarchical
quantum
embedding
by
machine
learning
for
large
molecular
assemblies


In [37]:
# Hold out 20% of the samples for evaluation.
# The split is drawn from a dedicated, seeded generator so that a fresh
# "Restart & Run All" produces the same train/test partition every time.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
len(train_dataset), len(test_dataset)

(991931, 247983)

In [38]:
# First attempt: both splits go through DataLoader with no custom collate
# function, in shuffled batches of 64.
loader_options = dict(batch_size=64, shuffle=True)
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

In [39]:
# Demonstrate the variable-size problem: titles have different lengths, so
# batching them without padding fails. The error is the point of this cell,
# so it is caught and printed instead of aborting a full notebook run.
try:
    for batch, (X, y) in enumerate(train_dataloader):
        if batch > 5:
            break
        print(X, y)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: stack expects each tensor to be equal size, but got [9] at entry 0 and [10] at entry 1

In [46]:
# Titles have different lengths, so each batch is padded up to the length of
# its longest title before being turned into a single tensor.
def custom_collate(batch, pad_value=0):
    """Collate (sequence, label) samples into one padded batch.

    Args:
        batch: iterable of ``(sequence, label)`` pairs, where each sequence
            is a 1-D tensor and each label is a scalar.
        pad_value: value written into the positions added by padding.

    Returns:
        A ``(padded_sequences, labels)`` tuple. ``padded_sequences`` is laid
        out batch-first and is as wide as the longest sequence in the batch;
        ``labels`` holds one entry per sample, in batch order.

    NOTE(review): the default pad value 0 also shows up in non-trailing
    positions of the printed batches, so it may double as a real token id —
    confirm against ArticleDataset.
    """
    # Transpose [(sequence, label), ...] into (all sequences, all labels).
    token_rows, targets = zip(*batch)
    return (
        pad_sequence(token_rows, batch_first=True, padding_value=pad_value),
        torch.tensor(targets),
    )

# Rebuild both loaders so that batches go through the padding collate function.
# NOTE(review): batch_size=5 keeps the printed preview small, but these same
# loaders are the ones consumed by the training loop — consider a larger
# batch size for the actual training run.
train_dataloader = DataLoader(train_dataset, collate_fn=custom_collate, batch_size=5, shuffle=True)
# Accuracy does not depend on sample order, so the held-out split is read in
# a fixed order instead of being reshuffled on every pass.
test_dataloader = DataLoader(test_dataset, collate_fn=custom_collate, batch_size=5, shuffle=False)


In [47]:
# Print the first two padded batches to check that collation now succeeds.
preview_batches = 2
for batch, (X, y) in enumerate(train_dataloader):
    if batch >= preview_batches:
        break
    print(X, y)
    

tensor([[    7,  3873,   771,     2,  5103,     6, 17438,  1852,  1514,     0,
             0,     0],
        [  244,  1780,  2003,    32,  6834,   450,     6,    19,   409,   659,
          1201,   501],
        [ 5577,  8152,  1863,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [   18, 11008,  2556,   602,     0,  6972,     2, 14685,  2022,     0,
             0,     0],
        [  645,  2129,  2853,  2061,     6,  1827,   707,     0,     0,     0,
             0,     0]]) tensor([ 46,  20, 121, 108,  84])
tensor([[14250,   585,     2,    19,  1216,  3760,  6709,  7004,     0,    19,
          5547,     0],
        [    0,  2198,  6803,   383,  4102,  3417,  7097,  2269,   383,  3220,
         14200,  7097],
        [ 4776,  2040,   693,  9531,  2513,    32,  9529,     0,     0,     0,
             0,     0],
        [ 1841,  1881,   534,  2045, 16756,     6,   960,  6886,     7,  3009,
          1080,     0],
        [    0,  2618,  3275,     6,  

In [48]:
# We can finally create the training loop

# Hyperparameters
vocab_size = len(dataset.wtoi)         # Vocabulary size (number of entries in the word -> index map)
embedding_dim = 10                     # Embedding dimension
hidden_dim = 256                       # Hidden-layer dimension
num_classes = len(dataset.ctoi)        # Number of output classes (entries in the category -> index map)
num_hidden_layers = 1                  # Number of hidden layers
num_epochs = 3                         # Number of training epochs
learning_rate = 0.001                  # Learning rate


In [49]:
# Model setup: use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLPClassifier(
    vocab_size,
    embedding_dim,
    hidden_dim,
    num_classes,
    num_hidden_layers,
).to(device)

# Leave the model as the last expression so the cell displays its layer summary.
model

MLPClassifier(
  (embedding): Embedding(38963, 10)
  (pool): AdaptiveAvgPool1d(output_size=1)
  (input_layer): Linear(in_features=10, out_features=256, bias=True)
  (hidden_layers): ModuleList()
  (output_layer): Linear(in_features=256, out_features=172, bias=True)
)

In [50]:
# Multi-class classification loss, applied to the model outputs and the integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter, using the learning rate defined with the hyperparameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [51]:
# Training loop: one optimisation pass over the training split per epoch,
# followed by an accuracy measurement on the held-out split.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    seen_samples = 0
    for padded_sequences, labels in train_dataloader:
        padded_sequences = padded_sequences.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(padded_sequences)  # [batch_size, num_classes]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean loss of the batch. Weight it by the
        # number of samples in the batch so that a shorter final batch does
        # not count as much as a full one in the epoch average.
        samples_in_batch = labels.size(0)
        total_loss += loss.item() * samples_in_batch
        seen_samples += samples_in_batch

    # Per-sample average loss over the whole epoch.
    avg_loss = total_loss / seen_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss:.4f}")


    # Evaluation
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():  # no gradients are needed to measure accuracy
        for padded_sequences, labels in test_dataloader:
            padded_sequences = padded_sequences.to(device)
            labels = labels.to(device)
            outputs = model(padded_sequences)
            # Predictions: index of the class with the highest score
            predictions = outputs.argmax(dim=1)
            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

Epoch 1/3, Training Loss: 2.8570
Test Accuracy: 40.03%
Epoch 2/3, Training Loss: 2.2537
Test Accuracy: 43.69%
Epoch 3/3, Training Loss: 2.1363
Test Accuracy: 44.73%
