<a href="https://colab.research.google.com/github/hop-ltienn/hop-ltienn/blob/main/train%2Bpred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import các thư viện quan trọng
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Load the preprocessed bundle from disk.
data = torch.load("/kaggle/input/dataflow/processed_data.pth")

# Padded input sequences, padded target sequences, and the mask that marks
# positions holding real (non-padding) data.
padded_inputs, padded_targets, attention_mask = (
    data[key] for key in ("train", "test", "mask")
)

# Continuous feature columns.
continuous_cols = data["continuous"]

# Columns that need an embedding, and the values associated with them.
_embedd = data["embedd"]
embedd_col, embedd_dict = _embedd["col"], _embedd["values"]

# Model input width: one slot per continuous column plus 8 slots per embedded
# column (8 is the embedding width the model is built with, embed_dim=8).
model_input_dim = len(continuous_cols) + len(embedd_col) * 8


  data = torch.load("/kaggle/input/dataflow/processed_data.pth")


In [None]:
# -------------------------------------------
#Tạo Dataset & DataLoader
# ------------------------------------------
#BankSequenceDataset
class BankSequenceDataset(Dataset):
    """Dataset that serves one padded sequence at a time, split by feature kind.

    Every item of `sequences` is indexed along two axes; the second axis is
    cut at `cont_input_dim`: the columns before the cut are returned as the
    continuous features, the columns after it are the categorical features,
    one column per entry of `cat_cols` (in the same order).
    """

    def __init__(self, sequences, targets, attn_mask, cont_input_dim, cat_cols):
        self.sequences, self.targets, self.attn_mask = sequences, targets, attn_mask
        self.cont_input_dim = cont_input_dim
        self.cat_cols = cat_cols

    def __len__(self):
        # Number of sequences = size of the first axis.
        return self.sequences.shape[0]

    def __getitem__(self, idx):
        """Return `(cont_features, cat_features, attn_mask, target)` for item `idx`.

        `cat_features` is a dict mapping each name in `cat_cols` to its column
        converted with `.long()`.
        """
        row = self.sequences[idx]
        mask = self.attn_mask[idx]
        label = self.targets[idx]

        # Split the feature axis into the continuous head and categorical tail.
        split_at = self.cont_input_dim
        head, tail = row[:, :split_at], row[:, split_at:]

        categorical = {
            name: tail[:, position].long()
            for position, name in enumerate(self.cat_cols)
        }
        return head, categorical, mask, label
# Build the dataset, split it 90/10 into train/validation and wrap both parts
# in DataLoaders.
from torch.utils.data import random_split, DataLoader

BATCH_SIZE = 64
# Fixed seed for the split: an unseeded random_split produces a different
# validation partition on every run, so validation numbers from different
# runs would not be comparable.
SPLIT_SEED = 42

dataset = BankSequenceDataset(padded_inputs, padded_targets, attention_mask, len(continuous_cols), embedd_col)

# Train (90%) / validation (10%) sizes; the validation part takes the remainder
# so the two sizes always add up to the dataset length.
total_size = len(dataset)
train_size = int(0.9 * total_size)
val_size = total_size - train_size

train_subset, val_subset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# DataLoaders for train & validation. Validation is not shuffled, so the order
# of its batches follows the order of `val_subset`.
train_dataloader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
#Mô hình Transformer cho Dự đoán Chuỗi Giao Dịch

## 1. Positional Encoding
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position signal to a batch-first sequence tensor.

    The table is precomputed once for `max_len` positions and stored as the
    buffer `pe` of shape `(1, max_len, d_model)`: even feature indices hold
    sines, odd feature indices hold cosines.

    Args:
        d_model: Size of the feature axis of the tensors passed to `forward`.
            Both even and odd values are supported.
        max_len: Number of positions precomputed; `forward` slices the table
            to the sequence length of its input.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # the cosine block must be trimmed; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: follows the module across devices and is part
        # of the state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return `x` plus the encodings of its first `x.size(1)` positions."""
        return x + self.pe[:, :x.size(1)]

class TransformerSeqModel(nn.Module):
    """Transformer encoder over sequences of mixed continuous/categorical features.

    Categorical columns are embedded, concatenated with the continuous
    features along the last axis, projected to `d_model`, position-encoded,
    passed through a batch-first Transformer encoder and mapped to
    `output_dim` sigmoid outputs per position.

    Args:
        model_input_dim: Width of the concatenated feature vector fed to the
            input projection (continuous features plus all embeddings).
        embedd_dict: Mapping from categorical column name to its number of
            categories; one `nn.Embedding` is created per entry.
        d_model: Width of the Transformer.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        output_dim: Number of outputs per position.
        embed_dim: Embedding width used for every categorical column.
    """

    def __init__(self, model_input_dim, embedd_dict, d_model, num_heads, num_layers, output_dim, embed_dim=8):
        super().__init__()
        # One embedding table per categorical column.
        self.embeddings = nn.ModuleDict({
            col: nn.Embedding(num_categories, embed_dim)
            for col, num_categories in embedd_dict.items()
        })
        # Project the concatenated features to d_model.
        self.input_proj = nn.Linear(model_input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, output_dim)

    def forward(self, cont_features, cat_features, attn_mask):
        """Return sigmoid outputs for every position of the input sequences.

        Args:
            cont_features: Continuous features; the last axis is concatenated
                with the embeddings.
            cat_features: Mapping from column name to index tensor; must hold
                every key of the embedding dict.
            attn_mask: Mask whose zero entries mark positions the encoder
                should ignore as padding.
        """
        embedded = [layer(cat_features[col]) for col, layer in self.embeddings.items()]
        if embedded:
            x = torch.cat([cont_features, *embedded], dim=-1)
        else:
            x = cont_features
        x = self.input_proj(x)
        x = self.pos_encoder(x)
        # The encoder expects True where a key position must be ignored.
        key_padding_mask = (attn_mask == 0)
        x = self.transformer_encoder(x, src_key_padding_mask=key_padding_mask)
        return torch.sigmoid(self.fc(x))

In [None]:
#Định nghĩa hyperparameters
d_model = 64
num_heads = 8
num_layers = 4
output_dim = 24
# Build the model, move it to the available device and restore saved weights.
model = TransformerSeqModel(model_input_dim, embedd_dict, d_model, num_heads, num_layers, output_dim, embed_dim=8)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# map_location remaps the stored tensors to `device`, so the load also works
# on a CPU-only machine when the checkpoint was written from CUDA tensors.
model.load_state_dict(
    torch.load(
        "/kaggle/input/dataflow_baseline_transformer/pytorch/default/8/focal.pth",
        map_location=device,
    )
)

  model.load_state_dict(torch.load("/kaggle/input/dataflow_baseline_transformer/pytorch/default/8/focal.pth"))


<All keys matched successfully>

In [None]:
# Định Nghĩa Hàm Mất Mát & Tối Ưu Hóa

```python
import torch
import torch.nn as nn
import torch.optim as optim

# Hàm Focal Loss với Label Smoothing và Masking
def focal(outputs, targets, attn_mask, alpha=0.25, gamma=2.0, smoothing=0.1, reduction='mean'):
    """Masked focal loss on probabilities, with label smoothing.

    Args:
        outputs: Predicted probabilities (already passed through a sigmoid).
        targets: Targets with the same shape as `outputs`.
        attn_mask: Mask with one entry per position (one axis fewer than
            `outputs`); positions with 0 contribute nothing to the loss.
        alpha: Constant factor applied to the focal weight.
        gamma: Focusing exponent applied to `(1 - pt)`.
        smoothing: Label-smoothing strength; targets are pulled towards 0.5.
        reduction: `'mean'` divides the masked sum by the number of unmasked
            positions; any other value returns the masked sum.

    Returns:
        A scalar tensor. With `'mean'` and a mask that is entirely zero the
        result is 0 instead of NaN.
    """
    targets_smoothed = targets * (1 - smoothing) + 0.5 * smoothing
    # Keep probabilities away from 0 and 1 so both logarithms stay finite.
    eps = 1e-6
    outputs = torch.clamp(outputs, eps, 1.0 - eps)
    bce_loss = -(targets_smoothed * torch.log(outputs) + (1 - targets_smoothed) * torch.log(1 - outputs))
    # pt: probability mass assigned to the (smoothed) target.
    pt = outputs * targets_smoothed + (1 - outputs) * (1 - targets_smoothed)
    focal_weight = alpha * (1 - pt) ** gamma
    # Broadcast the per-position mask over the last (class) axis.
    loss_masked = focal_weight * bce_loss * attn_mask.unsqueeze(-1)

    total = loss_masked.sum()
    if reduction != 'mean':
        return total
    # Guard against 0/0 when every position is masked out.
    return total / attn_mask.sum().clamp(min=1)

# Hàm BCE Loss với Label Smoothing
def bce(outputs, targets, attn_mask, smoothing=0.1):
    """Masked binary cross-entropy on probabilities, with label smoothing.

    Args:
        outputs: Predicted probabilities (already passed through a sigmoid).
        targets: Targets with the same shape as `outputs`.
        attn_mask: Mask with one entry per position (one axis fewer than
            `outputs`); positions with 0 contribute nothing to the loss.
        smoothing: Label-smoothing strength; targets are pulled towards 0.5.

    Returns:
        Scalar tensor: the masked loss sum divided by the number of unmasked
        positions, or 0 (instead of NaN) when every position is masked.
    """
    targets_smoothed = targets * (1 - smoothing) + 0.5 * smoothing
    loss_raw = nn.BCELoss(reduction='none')(outputs, targets_smoothed)
    # Broadcast the per-position mask over the last (class) axis.
    loss_masked = loss_raw * attn_mask.unsqueeze(-1)

    # Guard against 0/0 when every position is masked out.
    return loss_masked.sum() / attn_mask.sum().clamp(min=1)

# Khởi tạo Optimizer và Scheduler
# Adam over every model parameter.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Halve the learning rate when the monitored quantity (minimised) stops
# improving, with patience=2.
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch
# releases - confirm against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True,
)


In [None]:
# Định Nghĩa Hàm Đánh Giá MAP@5 & Early Stopping
import numpy as np
import torch

# Hàm đánh giá Mean Average Precision @5
def Map(y_true, y_pred, k=5):
    """Mean Average Precision at `k` over a batch of samples.

    Args:
        y_true: Array-like of shape `(n_samples, n_classes)`; entries equal to
            1 mark the relevant classes. A 1-D vector is treated as a single
            sample.
        y_pred: Array-like of scores with the same layout; higher scores rank
            first. Equal scores are ranked by ascending class index.
        k: Number of top-ranked classes taken into account per sample.

    Returns:
        The mean of the per-sample average precisions as a float. A sample
        without any relevant class scores 0; an empty batch returns 0.0.
    """
    # atleast_2d lets callers pass one sample as a plain vector; without it a
    # 1-D input would be iterated element by element instead of row by row.
    y_true = np.atleast_2d(np.asarray(y_true))
    y_pred = np.atleast_2d(np.asarray(y_pred))
    if y_true.shape[0] == 0:
        return 0.0

    average_precisions = []
    for i, truth_row in enumerate(y_true):
        relevant = set(np.flatnonzero(truth_row == 1).tolist())
        if not relevant:
            average_precisions.append(0.0)
            continue

        # A stable sort makes the ranking of tied scores deterministic.
        scores = np.asarray(y_pred[i], dtype=float)
        top_k = np.argsort(-scores, kind="stable")[:k]

        hits = 0
        score = 0.0
        for rank, pred in enumerate(top_k.tolist(), start=1):
            if pred in relevant:
                hits += 1
                score += hits / rank
        average_precisions.append(score / min(len(relevant), k))

    return float(np.mean(average_precisions))

# Early Stopping để dừng training khi không có cải thiện
class EarlyStopping:
    """Track validation loss, checkpoint on improvement, and flag when to stop.

    Call the instance once per epoch with the validation loss and the model.
    When the loss does not get worse than the best seen so far (within
    `delta`), the model's state dict is written to `save_path`; otherwise a
    counter is incremented and `early_stop` becomes True once it reaches
    `patience`.
    """

    def __init__(self, patience=3, delta=0.0, verbose=False, save_path="best_model.pth"):
        # Configuration.
        self.patience, self.delta = patience, delta
        self.verbose = verbose
        self.save_path = save_path
        # Running state.
        self.best_score = None
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        # Negated so that "higher is better": the goal is a lower val_loss.
        score = -val_loss
        first_call = self.best_score is None

        if not first_call and score < self.best_score + self.delta:
            # No improvement this epoch.
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # First observation or an improvement: record it and checkpoint.
        self.best_score = score
        self.save_checkpoint(val_loss, model)
        if not first_call:
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save the model's state dict and remember `val_loss` as the best loss."""
        if self.verbose:
            print(f"Validation loss improved to {val_loss:.4f}. Saving model to {self.save_path}")
        torch.save(model.state_dict(), self.save_path)
        self.best_loss = val_loss

In [None]:
# Huấn Luyện & Đánh Giá Mô Hình

```python
import torch
import numpy as np
from tqdm import tqdm

def train_and_eval(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    thres=0.5,
    loss='focal',
    epochs=10,
    device="cpu",
    alpha=0.25,
    gamma=2.0,
    smoothing=0.1,
    patience=3
):
    """Train `model` with early stopping and return it with the best weights restored.

    Every epoch runs one pass over `train_dataloader`, then evaluates on
    `val_dataloader`. The mean validation loss is passed to `scheduler.step`
    and to the early-stopping check, which also checkpoints the model to
    "best_model.pth" on improvement.

    Args:
        model: Model called as `model(cont, cat, attn_mask)`.
        train_dataloader: Yields `(cont, cat_dict, attn_mask, target)` batches.
        val_dataloader: Same batch layout, used for validation.
        optimizer: Optimizer stepping the model parameters.
        scheduler: Scheduler whose `step` receives the mean validation loss.
        thres: Threshold used to binarise the targets for the MAP metric.
        loss: `'focal'` selects the focal loss; any other value selects BCE.
            The same loss is used for training and validation.
        epochs: Maximum number of epochs.
        device: Device the batches are moved to.
        alpha: Focal-loss alpha.
        gamma: Focal-loss gamma.
        smoothing: Label-smoothing strength for either loss.
        patience: Epochs without improvement tolerated before stopping.

    Returns:
        The same `model` object, reloaded from the best checkpoint when at
        least one checkpoint was written during this call.
    """
    early_stopping = EarlyStopping(patience=patience, verbose=True, save_path="best_model.pth")
    thres_tensor = torch.tensor(thres).to(device)

    def _criterion(outputs, targets, mask):
        # `loss` must stay the caller's string for the whole run. Per-batch
        # loss values are kept in separate variables so the selection cannot
        # flip to BCE after the first batch.
        if loss == 'focal':
            return focal(outputs, targets, mask, alpha, gamma, smoothing)
        return bce(outputs, targets, mask, smoothing)

    def _move(cont_batch, cat_batch, attn_mask_batch, y_batch):
        # Move every tensor of a batch to the target device.
        return (
            cont_batch.to(device),
            {name: values.to(device) for name, values in cat_batch.items()},
            attn_mask_batch.to(device),
            y_batch.to(device),
        )

    for epoch in range(epochs):
        # ---------------------------
        # 1) Training Phase
        # ---------------------------
        model.train()
        total_train_loss = 0.0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Train]"):
            cont_batch, cat_batch, attn_mask_batch, y_batch = _move(*batch)

            optimizer.zero_grad()
            outputs = model(cont_batch, cat_batch, attn_mask_batch)
            batch_loss = _criterion(outputs, y_batch, attn_mask_batch)
            batch_loss.backward()
            optimizer.step()
            total_train_loss += batch_loss.item()

        avg_train_loss = total_train_loss / len(train_dataloader)

        # ---------------------------
        # 2) Validation Phase
        # ---------------------------
        model.eval()
        total_val_loss, all_scores, all_targets = 0.0, [], []
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Val]"):
                cont_batch, cat_batch, attn_mask_batch, y_batch = _move(*batch)

                outputs = model(cont_batch, cat_batch, attn_mask_batch)
                total_val_loss += _criterion(outputs, y_batch, attn_mask_batch).item()

                # MAP is a ranking metric, so it gets the raw probabilities of
                # the last position; thresholding them first would collapse the
                # ranking into ties. Only the targets are binarised.
                # NOTE(review): index -1 assumes the last position of every
                # sequence holds real data rather than padding - confirm.
                all_scores.append(outputs[:, -1, :].cpu().numpy())
                all_targets.append((y_batch[:, -1, :] > thres_tensor).int().cpu().numpy())

        avg_val_loss = total_val_loss / len(val_dataloader)
        all_scores, all_targets = np.vstack(all_scores), np.vstack(all_targets)
        # Map already averages over samples; no further normalisation is needed.
        map7 = Map(all_targets, all_scores, k=7)

        scheduler.step(avg_val_loss)
        print(f"Epoch [{epoch+1}/{epochs}] -> Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, MAP@7: {map7:.4f}")

        # ---------------------------
        # 3) Early Stopping Check
        # ---------------------------
        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered.")
            break

    # Restore the best weights, but only if this call wrote a checkpoint
    # (with epochs=0 there is none, and a stale file must not be loaded).
    if early_stopping.best_score is not None:
        model.load_state_dict(torch.load(early_stopping.save_path, map_location=device))
    return model

In [None]:

# 8. Train and Evaluate the Model

# Fine-tune with the focal loss for up to 30 epochs (early-stopping patience 5),
# then persist the resulting weights. train_and_eval returns the model object
# it was given, so rebinding `model` keeps the same instance.
model = train_and_eval(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    loss='focal',
    epochs=30,
    device=device,
    smoothing=0.1,
    patience=5,
)
torch.save(model.state_dict(), "focal.pth")


In [None]:
# Dự Đoán với Mô Hình Transformer

## Hàm `predict`
```python
import torch

def predict(model, dataloader, device="cpu", thres=0.5):
    """Run the trained model over `dataloader` and collect its predictions.

    For 3-D model outputs only the last position is kept and turned into 0/1
    integers by comparing with `thres`; outputs of any other rank are
    collected unchanged. The per-batch results are concatenated along the
    first axis.
    """
    model.eval()
    collected = []
    cutoff = torch.tensor(thres).to(device)

    with torch.no_grad():
        for cont_batch, cat_batch, attn_mask, _ in dataloader:
            # Move the batch to the target device (the target slot is unused).
            cont_on_device = cont_batch.to(device)
            mask_on_device = attn_mask.to(device)
            cat_on_device = {name: values.to(device) for name, values in cat_batch.items()}

            batch_out = model(cont_on_device, cat_on_device, mask_on_device)
            if batch_out.dim() != 3:
                collected.append(batch_out)
                continue
            collected.append((batch_out[:, -1, :] > cutoff).int())

    return torch.cat(collected, dim=0)

In [None]:
# Thresholded last-position predictions for the whole validation loader.
preds = predict(model, val_dataloader, device=device)

  output = torch._nested_tensor_from_mask(


In [None]:
# Per-sample MAP of the validation predictions against the last-position target.
# Map indexes its arguments row by row, so each sample is passed as a
# (1, n_classes) array; handing it a plain vector makes it walk over single
# labels instead of samples.
# NOTE(review): `preds` holds thresholded 0/1 values, so the ranking inside
# Map is decided among ties - consider scoring raw probabilities instead.
perf = []
for i, pred in enumerate(preds.cpu()):
    last_step_target = val_subset[i][-1][-1]
    perf.append(
        Map(
            last_step_target.cpu().unsqueeze(0).numpy(),
            pred.unsqueeze(0).numpy(),
        )
    )

  true_indices = np.where(y_true[i] == 1)[0]


In [None]:
# Mean of the per-sample scores collected in `perf` (shown as the cell output).
sum(perf)/len(perf)

0.04858423015298433