In [23]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json

In [24]:
accions_df = pd.read_csv("data/accions.csv", parse_dates=["Data"])
tramits_df = pd.read_csv("data/tramits.csv")

In [None]:
# Sort IDs to ensure consistent mapping across runs
sorted_ids = sorted(tramits_df['Id'].unique())

# Create the mapping dictionary
mapping = {tramit_id: idx for idx, tramit_id in enumerate(sorted_ids)}

# Save the mapping to a JSON file
with open("tramit_mapping.json", 'w', encoding='utf-8') as f:
    json.dump(mapping, f, ensure_ascii=False, indent=2)

# Print some validation information
print(f"Created mapping for {len(mapping)} unique tramits")

# Validate the mapping
reverse_mapping = {v: k for k, v in mapping.items()}
assert len(reverse_mapping) == len(mapping), "Mapping is not one-to-one"
assert all(isinstance(v, int) for v in mapping.values()), "Not all values are integers"
assert all(isinstance(k, str) for k in mapping.keys()), "Not all keys are strings"

Created mapping for 502 unique tramits
{0: '+1JQhTbg4lgyTIo1IfnXsGHt0uVmFQui/PprXOhQCOQ=', 1: '+9hGyUgYGGqr/1tg7SbHuXM2SdoloC0VruPrL3jtm2A=', 2: '+NoTO0ItlnsY3BVIot97xdqd429P3E5EDa09XUEQHJk=', 3: '+P1T5jsqGWcF0eJ0g9ZX7HHKQdaxE8OTAm9uMUcdRJ8=', 4: '+RMaZY0iNBpUgCTfndSjpDAECoYX+P/GE72G6YD9CjA=', 5: '+Ti5iuCSbvCPrzcrOWtfasZM9ALnysMq1mdbcPGMklg=', 6: '+fMaLoK/BqzaE+L2DM8WjufzaXXcBZG5VcUUspZNkDI=', 7: '+gzoIX67Vc3BDtzzSrI8PH2ofTaYmCgjivWJT/o04e4=', 8: '+kKhgUgWOsYKFrnH+SXCmnqFK3A+DNBjs7OD4T3QuBw=', 9: '+tdO4AtyvNhqJfSd87mpHETALCDJdlLWyCrQyGQ5MtY=', 10: '/3JQ89bxH5lmeJX6KK7AucRpkUy+FoNooW081i7o8TM=', 11: '/G7JgOWHzNkVUDM/pT0ZHAC6igSeC+EtcTenAR10EMQ=', 12: '/H580/Kb0mC56bGtqZlmQyePH1qgeLRqPBwuLZYZBIE=', 13: '/MTfZFU+61dycpUuENyS9WNWQ2lJzFJpeJimWfP6smY=', 14: '/T8aI0gX1njc+Wn59vo0wY8GbicYbWyV+6TBmDwdAeE=', 15: '/ZHrTbJK0Er0nTsBomIpAvaB8Sq1Xt+Lb1Bi8lBtaII=', 16: '/bibhZIje8ttoNCixHlQeemRHnz4GjfsbvxDN24M/O4=', 17: '/dIZEULTf8w+C6Hxd609twuaS2HOnKX5iJy3O/KxuEE=', 18: '/o5VxteIj4P6dGS1H8PcQRV1u4Nwt

In [26]:
# Sort by session and timestamp
sorted_df = accions_df.sort_values(['Sessio', 'Data'])

# Group by session to get sequences
sequences = sorted_df.groupby('Sessio')['Tramit'].agg(list).reset_index()

In [None]:
# Remove sequences with only one tramit
sequences = sequences[sequences["Tramit"].apply(len) > 1][:100000]

In [None]:
class TramitContextDataset(Dataset):
    def __init__(self, sequences_df, mapping, context_size=3):
        self.sequences = sequences_df["Tramit"].values
        self.context_size = context_size
        self.mapping = mapping
        self.reverse_mapping = {v: k for k, v in mapping.items()}

        # Validate the mapping
        assert len(self.reverse_mapping) == len(
            self.mapping
        ), "Mapping is not one-to-one"
        assert all(
            isinstance(v, int) for v in self.mapping.values()
        ), "Not all values are integers"
        assert all(
            isinstance(k, str) for k in self.mapping.keys()
        ), "Not all keys are strings"

        # Encode sequences using the mapping
        self.encoded_sequences = [
            [self.mapping[tramit] for tramit in seq] for seq in self.sequences
        ]

        # Create sequences with context
        self.x = []
        self.y = []
        for seq in self.encoded_sequences:
            for i in range(len(seq) - 1):
                # Get context (pad with zeros if needed)
                context = np.zeros(self.context_size, dtype=int)
                start_idx = max(0, i - self.context_size + 1)
                context[-len(seq[start_idx : i + 1]) :] = seq[start_idx : i + 1]

                self.x.append(context)
                self.y.append(seq[i + 1])

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.x[idx], dtype=torch.long),
            torch.tensor(self.y[idx], dtype=torch.long),
        )

batch_size = 32

context_dataset = TramitContextDataset(sequences, mapping, context_size=3)
context_loader = DataLoader(context_dataset, batch_size=batch_size, shuffle=True)

In [31]:
class TramitPredictor(nn.Module):
    def __init__(self, num_tramits, embedding_dim=32, hidden_dim=64, context_size=3):
        super().__init__()
        self.embedding = nn.Embedding(num_tramits, embedding_dim)

        # LSTM layer to process the context
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.2,
        )

        # Prediction layers
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_dim, num_tramits)

    def forward(self, x):
        # x shape: (batch_size, context_size)
        embedded = self.embedding(x)  # (batch_size, context_size, embedding_dim)

        lstm_out, _ = self.lstm(embedded)  # (batch_size, context_size, hidden_dim)

        # Take the last output
        last_output = lstm_out[:, -1, :]  # (batch_size, hidden_dim)

        # Final prediction
        x = self.fc1(last_output)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [39]:
def train_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs, device
):
    best_val_loss = float("inf")
    train_losses = []
    val_losses = []

    # Create main progress bar for epochs
    epoch_pbar = tqdm(range(num_epochs), desc="Training Progress")

    for epoch in epoch_pbar:
        # Training phase
        model.train()
        total_train_loss = 0
        train_batches = 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            train_batches += 1

        avg_train_loss = total_train_loss / train_batches
        train_losses.append(avg_train_loss)

        # Validation phase
        model.eval()
        total_val_loss = 0
        val_batches = 0
        correct_predictions = 0
        total_predictions = 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                total_val_loss += loss.item()
                val_batches += 1

                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == targets).sum().item()
                total_predictions += targets.size(0)

        avg_val_loss = total_val_loss / val_batches
        val_losses.append(avg_val_loss)
        accuracy = correct_predictions / total_predictions * 100

        # Update progress bar description with current metrics
        epoch_pbar.set_postfix(
            {
                "train_loss": f"{avg_train_loss:.4f}",
                "val_loss": f"{avg_val_loss:.4f}",
                "val_acc": f"{accuracy:.2f}%",
            }
        )

        # Save best model silently
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_model.pth")

    return train_losses, val_losses


def train_tramit_predictor(
    sequences_df, context_size=3, batch_size=32, num_epochs=50, learning_rate=0.001
):
    # Create dataset
    dataset = TramitContextDataset(sequences_df, mapping, context_size=context_size)

    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_tramits = len(mapping)
    model = TramitPredictor(num_tramits=num_tramits, context_size=context_size)
    model = model.to(device)

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train model
    train_losses, val_losses = train_model(
        model, train_loader, val_loader, criterion, optimizer, num_epochs, device
    )

    return model, train_losses, val_losses


# Train model
model, train_losses, val_losses = train_tramit_predictor(
    sequences, context_size=3, batch_size=32, num_epochs=50
)

Training Progress: 100%|██████████| 50/50 [00:05<00:00,  9.44it/s, train_loss=1.2042, val_loss=2.9638, val_acc=56.18%]


In [None]:
def predict_next_tramit(model, context, device):
    model.eval()

    with torch.no_grad():
        # Convert context to tensor and add batch dimension
        context_tensor = torch.tensor(context).unsqueeze(0).to(device)

        # Get model prediction
        output = model(context_tensor)

        # Get probabilities
        probabilities = torch.softmax(output, dim=1)

        # Get top 5 predictions
        top_p, top_class_indices = torch.topk(probabilities, 5)

        predictions = []
        for prob, idx in zip(top_p[0], top_class_indices[0]):
            tramit_id = reverse_mapping[idx.item()]  # Use reverse mapping to decode
            predictions.append((tramit_id, prob.item()))

    return predictions

# Make a prediction
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
context = [1, 2, 3]  # Example context (should be encoded tramit IDs)
predictions = predict_next_tramit(model, context, device)

In [53]:
# Print predictions
print("\nTop 5 predicted next tramits:")
for tramit, probability in predictions:
    print(f"Tramit: {tramit}, Probability: {probability:.4f}")


Top 5 predicted next tramits:
Tramit: 5vNCVIwbIfJT4IH0z/ADNFqUWFl0+nY1HQOnmYBdjQQ=, Probability: 0.3773
Tramit: IhSP8N4DzNtki05jR/GgyC9kubmeKYURK4/qACvQGjc=, Probability: 0.1550
Tramit: WSz+cqp76jqjmiUUJX4ERXWcjzjHriSdI3FYo1yutkM=, Probability: 0.1481
Tramit: 4KE5hVcLhuR5hNmelkoQQkHKbY+pYKWx0izJQGdnvxE=, Probability: 0.0394
Tramit: FKCtMWPFSZssfgm52D0i+zLmIC8QGW5BiauNKSvNjzs=, Probability: 0.0365
