In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [11]:
# Raw action log. "Data" is parsed as datetimes so the later per-session sort on
# it is chronological rather than lexicographic.
accions_df = pd.read_csv("data/accions.csv", parse_dates=["Data"])
# Presumably the tramit catalogue; it is loaded here but not referenced by any of
# the cells visible in this notebook section.
tramits_df = pd.read_csv("data/tramits.csv")

In [12]:
# Order the log so that, inside every session, actions appear chronologically.
sorted_df = accions_df.sort_values(by=["Sessio", "Data"])

# Collapse each session into a single row holding its ordered list of tramits.
sequences = (
    sorted_df
    .groupby("Sessio")["Tramit"]
    .agg(list)
    .reset_index()
)

In [13]:
class TramitContextDataset(Dataset):
    """Sliding-window "next tramit" dataset built from per-session tramit lists.

    Every session contributes one sample per transition: the input is the
    ``context_size`` most recent encoded tramits (left-padded with
    ``pad_value`` while the session is still shorter than the window) and the
    target is the encoded tramit that follows.

    Args:
        sequences_df: Table whose ``"Tramit"`` column holds one list of tramit
            identifiers per session.
        context_size: Number of past tramits in each input window. Must be at
            least 1.
        pad_value: Integer written into unused leading context slots.
            Defaults to 0, which preserves the historical behaviour.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.

    Attributes:
        label_encoder: ``LabelEncoder`` fitted on every tramit in the input.
        x: List of int64 arrays of length ``context_size`` (the contexts).
        y: List of encoded target ids, aligned with ``x``.

    Note:
        ``LabelEncoder`` numbers classes from 0, so with the default
        ``pad_value`` a padded slot is indistinguishable from the tramit that
        was assigned id 0. To separate them, pass an id outside the label
        range (e.g. the number of classes) and size the model's embedding
        table accordingly.
    """

    def __init__(self, sequences_df, context_size=3, pad_value=0):
        # Fail early: a window of size 0 would otherwise surface as an obscure
        # NumPy broadcasting error deep inside the loop below.
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")

        self.sequences = sequences_df["Tramit"].values
        self.context_size = context_size
        self.pad_value = pad_value

        # Fit the vocabulary on every tramit that appears in any session.
        self.label_encoder = LabelEncoder()
        all_tramits = [tramit for seq in self.sequences for tramit in seq]
        self.label_encoder.fit(all_tramits)

        # Build (context, next tramit) pairs.
        self.x = []
        self.y = []
        for seq in self.sequences:
            # A session with fewer than two tramits has no transition to learn
            # from, so skip the encoder call entirely.
            if len(seq) < 2:
                continue

            encoded_seq = self.label_encoder.transform(seq)
            for i in range(len(seq) - 1):
                # Most recent tramits up to and including position i.
                window = encoded_seq[max(0, i - context_size + 1) : i + 1]

                # Right-align the window; leading slots keep the pad id.
                context = np.full(context_size, pad_value, dtype=np.int64)
                context[context_size - len(window) :] = window

                self.x.append(context)
                self.y.append(encoded_seq[i + 1])

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(context, target)`` pair of long tensors."""
        return (
            torch.tensor(self.x[idx], dtype=torch.long),
            torch.tensor(self.y[idx], dtype=torch.long),
        )

# Build a shuffled loader of 3-tramit context windows for quick inspection.
batch_size = 32
context_dataset = TramitContextDataset(sequences_df=sequences, context_size=3)
context_loader = DataLoader(
    dataset=context_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
class TramitPredictor(nn.Module):
    """Next-tramit classifier: embedding -> 2-layer LSTM -> small MLP head.

    Args:
        num_tramits: Vocabulary size; used both for the embedding table and
            for the number of output logits.
        embedding_dim: Width of each tramit embedding.
        hidden_dim: LSTM hidden size and width of the head's hidden layer.
        context_size: Accepted for API symmetry with the dataset; none of the
            layers depend on it.
    """

    def __init__(self, num_tramits, embedding_dim=32, hidden_dim=64, context_size=3):
        super().__init__()

        # One learned vector per encoded tramit id.
        self.embedding = nn.Embedding(
            num_embeddings=num_tramits, embedding_dim=embedding_dim
        )

        # Two stacked recurrent layers read the context left to right;
        # dropout is applied between the layers.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.2,
        )

        # Classification head applied to the final time step.
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_tramits)

    def forward(self, x):
        """Map a batch of id contexts ``(batch, context)`` to logits ``(batch, num_tramits)``."""
        token_vectors = self.embedding(x)  # (batch, context, embedding_dim)
        sequence_states, _ = self.lstm(token_vectors)  # (batch, context, hidden_dim)

        # Only the state after the most recent tramit feeds the head.
        final_state = sequence_states[:, -1]  # (batch, hidden_dim)

        hidden = self.dropout(self.relu(self.fc1(final_state)))
        return self.fc2(hidden)

# Build the windowed dataset.
# NOTE(review): this repeats the construction of `context_dataset` from the
# earlier cell, and `train_tramit_predictor` builds its own copy as well; the
# visible cells never read `dataset` -- confirm whether it is still needed.
dataset = TramitContextDataset(sequences_df=sequences, context_size=3)

In [None]:
def train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    num_epochs,
    device,
    checkpoint_path="best_model.pth",
):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    After every epoch the validation loss is compared with the best seen so
    far; on improvement the model's ``state_dict`` is written to
    ``checkpoint_path``. The model itself is left holding the weights of the
    last epoch, not the best one.

    Args:
        model: Network to optimise; called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss callable, ``criterion(outputs, targets)``.
        optimizer: Optimiser bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device every batch is moved to before the forward pass.
        checkpoint_path: Where the best ``state_dict`` is saved. Defaults to
            ``"best_model.pth"`` in the working directory.

    Returns:
        ``(train_losses, val_losses)``: per-epoch averages of the per-batch
        loss values. An epoch whose loader yields no batches is recorded as
        NaN instead of raising ``ZeroDivisionError``; a NaN validation loss
        never triggers a checkpoint.
    """
    nan = float("nan")
    best_val_loss = float("inf")
    train_losses = []
    val_losses = []

    # One progress-bar tick per epoch; metrics are shown in the postfix.
    epoch_pbar = tqdm(range(num_epochs), desc="Training Progress")

    for _ in epoch_pbar:
        # ---- Training phase ----
        model.train()
        total_train_loss = 0.0
        train_batches = 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            train_batches += 1

        avg_train_loss = total_train_loss / train_batches if train_batches else nan
        train_losses.append(avg_train_loss)

        # ---- Validation phase ----
        model.eval()
        total_val_loss = 0.0
        val_batches = 0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                total_val_loss += loss.item()
                val_batches += 1

                # Predicted class = index of the largest logit.
                predicted = outputs.argmax(dim=1)
                correct_predictions += (predicted == targets).sum().item()
                total_predictions += targets.size(0)

        avg_val_loss = total_val_loss / val_batches if val_batches else nan
        val_losses.append(avg_val_loss)
        accuracy = (
            correct_predictions / total_predictions * 100
            if total_predictions
            else nan
        )

        epoch_pbar.set_postfix(
            {
                "train_loss": f"{avg_train_loss:.4f}",
                "val_loss": f"{avg_val_loss:.4f}",
                "val_acc": f"{accuracy:.2f}%",
            }
        )

        # Checkpoint on improvement. A NaN loss compares False and is skipped.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)

    return train_losses, val_losses


def train_tramit_predictor(
    sequences_df,
    context_size=3,
    batch_size=32,
    num_epochs=50,
    learning_rate=0.001,
    seed=None,
    restore_best=True,
):
    """Build the dataset, split it 80/20, and train a ``TramitPredictor``.

    Args:
        sequences_df: Per-session tramit lists, passed to
            ``TramitContextDataset``.
        context_size: Window length used for both dataset and model.
        batch_size: Batch size of the training and validation loaders.
        num_epochs: Number of training epochs.
        learning_rate: Adam learning rate.
        seed: If given, seeds PyTorch's global RNG before the split, the
            model initialisation and the shuffling, so repeated calls start
            from the same state. ``None`` leaves the RNG untouched.
        restore_best: If true, reload the best-validation checkpoint written
            by ``train_model`` before returning, so the returned model is the
            best one seen instead of whatever the last epoch produced.

    Returns:
        ``(model, label_encoder, train_losses, val_losses)``.

    NOTE(review): the split is over individual windows, so windows from one
    session can land on both sides of the train/validation boundary; confirm
    whether a per-session split is wanted.
    """
    # Must match the path train_model saves its best checkpoint to.
    best_checkpoint = "best_model.pth"

    if seed is not None:
        torch.manual_seed(seed)

    # Create dataset
    dataset = TramitContextDataset(sequences_df, context_size=context_size)

    # Split dataset: 80% train, remainder validation.
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_tramits = len(dataset.label_encoder.classes_)
    model = TramitPredictor(num_tramits=num_tramits, context_size=context_size)
    model = model.to(device)

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train model
    train_losses, val_losses = train_model(
        model, train_loader, val_loader, criterion, optimizer, num_epochs, device
    )

    # A checkpoint is written in this run exactly when some epoch produced a
    # finite validation loss (the first such loss always beats the initial
    # +inf). Checking for that avoids loading a stale file from a previous run.
    if restore_best and np.isfinite(val_losses).any():
        model.load_state_dict(torch.load(best_checkpoint, map_location=device))

    return model, dataset.label_encoder, train_losses, val_losses


# Kick off a full training run with the notebook's default hyper-parameters.
model, label_encoder, train_losses, val_losses = train_tramit_predictor(
    sequences_df=sequences,
    context_size=3,
    batch_size=32,
    num_epochs=50,
)

Training Progress:  16%|█▌        | 8/50 [00:59<05:10,  7.40s/it, train_loss=1.8150, val_loss=1.8223, val_acc=50.01%]


KeyboardInterrupt: 

In [None]:
def predict_next_tramit(model, label_encoder, context, device, top_k=5):
    """Return the most likely next tramits for one encoded context.

    Args:
        model: Trained network mapping a ``(1, context)`` long tensor to
            logits of shape ``(1, num_classes)``. It is expected to already
            live on ``device``; only the input is moved here.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps class
            ids back to tramit identifiers.
        context: Sequence of encoded tramit ids (list, NumPy array or
            tensor). It is coerced to ``torch.long``, so float-typed id
            arrays are accepted.
        device: Device on which the forward pass runs.
        top_k: Maximum number of candidates to return. When the model has
            fewer classes than this, all classes are returned.

    Returns:
        List of ``(tramit, probability)`` tuples ordered from most to least
        likely; probabilities are Python floats from a softmax over all
        classes.
    """
    model.eval()
    with torch.no_grad():
        # Coerce to long ids and add the batch dimension.
        context_tensor = (
            torch.as_tensor(context, dtype=torch.long).unsqueeze(0).to(device)
        )

        logits = model(context_tensor)
        probabilities = torch.softmax(logits, dim=1)

        # torch.topk raises if k exceeds the number of classes, so clamp it.
        k = min(top_k, probabilities.size(1))
        top_p, top_class_indices = torch.topk(probabilities, k)

    # Decode all selected class ids in a single encoder call.
    tramits = label_encoder.inverse_transform(top_class_indices[0].cpu().tolist())
    scores = top_p[0].cpu().tolist()

    return [(tramit, score) for tramit, score in zip(tramits, scores)]

# Query the trained model once with a hand-written context.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
context = [1, 2, 3]  # Example context (should be encoded tramit IDs)
predictions = predict_next_tramit(
    model=model,
    label_encoder=label_encoder,
    context=context,
    device=device,
)

In [None]:
# Print predictions: one line per (tramit, probability) pair, in the order
# returned by predict_next_tramit, with probabilities shown to 4 decimals.
print("\nTop 5 predicted next tramits:")
for tramit, probability in predictions:
    print(f"Tramit: {tramit}, Probability: {probability:.4f}")


Top 5 predicted next tramits:
Tramit: /bibhZIje8ttoNCixHlQeemRHnz4GjfsbvxDN24M/O4=, Probability: 0.4922
Tramit: OwMGwNHm7i5ca1/GxM+w6PTa56KPvOeExwc5FmAeEj8=, Probability: 0.1448
Tramit: WPXFffQDK8TyVmQXm48qmQN0YAJSVq3p5k+lCqV9f+E=, Probability: 0.0613
Tramit: +1JQhTbg4lgyTIo1IfnXsGHt0uVmFQui/PprXOhQCOQ=, Probability: 0.0460
Tramit: NVMqpInhbPaMaLDy0OuA03rgcJVUWAk6b+RCINxh7yg=, Probability: 0.0386
