In [1]:
import copy
import sys
import random
import pickle
import torch
import warnings
import logging
import gc
import math
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pytorch_optimizer as optim_mod

from torch.optim import Optimizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from collections import Counter
from tqdm import tqdm
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.exceptions import UndefinedMetricWarning
from torch.nn.modules.transformer import TransformerEncoderLayer, TransformerEncoder

from torch.cuda.amp import autocast, GradScaler
from IPython.display import display, HTML

from collections import defaultdict
from sklearn.model_selection import StratifiedKFold

from scipy.stats import mode

from IPython.display import Javascript, display

In [2]:
#Choose Model
useTRANSFORMERS = True
useLSTM = False
useLSTMAttention = False
useLSTMWeightAtention = False
useLSTMEncoder = False

variables = {
    'useTRANSFORMERS': useTRANSFORMERS,
    'useLSTM': useLSTM,
    'useLSTMAttention': useLSTMAttention,
    'useLSTMWeightAtention': useLSTMWeightAtention,
    'useLSTMEncoder': useLSTMEncoder
}
true_model = [var_name for var_name, var_value in variables.items() if var_value]

In [3]:
class RotaryPositionalEmbeddingROPE(nn.Module):
    def __init__(self, dim, base=10000):
        super(RotaryPositionalEmbeddingROPE, self).__init__()
        if dim % 2 != 0:
            raise ValueError("The dimension for RoPE must be even.")
        self.dim = dim
        self.base = base

    def forward(self, x):
        batch_size, num_heads, seq_len, head_dim = x.size()
        device = x.device
        position_ids = torch.arange(seq_len, dtype=torch.float, device=device).unsqueeze(0).unsqueeze(2)
        dim_t = torch.arange(0, head_dim, 2, dtype=torch.float, device=device)
        dim_t = self.base ** (-dim_t / head_dim)
        angles = position_ids * dim_t
        sin_angles = angles.sin().unsqueeze(0).unsqueeze(2)
        cos_angles = angles.cos().unsqueeze(0).unsqueeze(2)
        x1, x2 = x[..., 0::2], x[..., 1::2]
        x_rotated = torch.cat([x1 * cos_angles - x2 * sin_angles,
                               x1 * sin_angles + x2 * cos_angles], dim=-1)
        return x_rotated

class MultiheadAttentionWithRoPE(nn.MultiheadAttention):
    def __init__(self, embed_dim, num_heads, dropout=0.1, bias=True, add_bias_kv=False,
                 add_zero_attn=False, kdim=None, vdim=None, base=10000, batch_first=True):
        super(MultiheadAttentionWithRoPE, self).__init__(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout,
            bias=bias,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            kdim=kdim,
            vdim=vdim,
            batch_first=batch_first
        )
        self.rotary_emb = RotaryPositionalEmbeddingROPE(self.head_dim, base=base)

    def forward(self, query, key, value, key_padding_mask=None, attn_mask=None, need_weights=True,
                average_attn_weights=True, is_causal=False, **kwargs):
        if self.batch_first:
            Q = query.view(query.size(0), query.size(1), self.num_heads, self.head_dim).transpose(1,2)
            K = key.view(key.size(0), key.size(1), self.num_heads, self.head_dim).transpose(1,2)
            Q = self.rotary_emb(Q)
            K = self.rotary_emb(K)
            Q = Q.transpose(1,2).contiguous().view(query.size(0), query.size(1), self.embed_dim)
            K = K.transpose(1,2).contiguous().view(key.size(0), key.size(1), self.embed_dim)
        else:
            raise NotImplementedError("batch_first=False is not supported in this custom attention.")
        attn_output, attn_weights = super(MultiheadAttentionWithRoPE, self).forward(
            Q, K, value, key_padding_mask=key_padding_mask, attn_mask=attn_mask,
            need_weights=need_weights, average_attn_weights=average_attn_weights
        )
        return attn_output, attn_weights

class TransformerEncoderLayerWithRoPE(TransformerEncoderLayer):
    def __init__(self, embed_dim, num_heads, dim_feedforward=2048, dropout=0.1, base=10000, activation='relu', batch_first=True):
        super(TransformerEncoderLayerWithRoPE, self).__init__(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            batch_first=batch_first
        )
        self.self_attn = MultiheadAttentionWithRoPE(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout,
            base=base,
            batch_first=batch_first
        )

class ActorCriticOrdinalTransformerModel(nn.Module):
    def __init__(self, input_dim, d_model=512, num_heads=8, num_layers=6, 
                 dim_feedforward=2048, dropout=0.2, num_actor_heads=10, 
                 num_classes=5, base=10000):
        super(ActorCriticOrdinalTransformerModel, self).__init__()
        
        self.num_classes = num_classes
        self.num_thresholds = num_classes - 1
        
        # Input projection
        self.linear_proj = nn.Linear(input_dim, d_model)
        
        # Actor Transformer Encoder with RoPE
        actor_encoder_layer = TransformerEncoderLayerWithRoPE(
            embed_dim=d_model,
            num_heads=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            base=base,
            activation='relu',
            batch_first=True
        )
        self.actor_transformer_encoder = TransformerEncoder(actor_encoder_layer, num_layers=num_layers)
        
        # Actor Heads
        self.actor_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(d_model, dim_feedforward),
                nn.LeakyReLU(negative_slope=0.01),
                nn.LayerNorm(dim_feedforward),
                nn.Dropout(dropout),
                nn.Linear(dim_feedforward, d_model),
                nn.LeakyReLU(negative_slope=0.01),
                nn.LayerNorm(d_model),
                nn.Linear(d_model, self.num_thresholds)  # Output logits for thresholds
            )
            for _ in range(num_actor_heads)
        ])
        
        # Learnable Raw Thresholds (before ordering)
        # Initialize raw thresholds; values will be transformed to ensure ordering
        self.raw_thresholds = nn.Parameter(torch.linspace(-1, 1, steps=self.num_thresholds))
    
    def forward(self, src, src_key_padding_mask=None):
        """
        Args:
            src: [batch_size, seq_len, input_dim]
            src_key_padding_mask: [batch_size, seq_len]
        
        Returns:
            action_probs: [batch_size, seq_len, K-1] (Cumulative Probabilities)
        """
        # Project input to model dimension
        src = self.linear_proj(src)  # [batch_size, seq_len, d_model]
        
        # Actor Transformer Encoding
        actor_transformer_output = self.actor_transformer_encoder(
            src, src_key_padding_mask=src_key_padding_mask
        )  # [batch_size, seq_len, d_model]
        
        # Actor Heads Processing
        actor_outputs = [actor_head(actor_transformer_output) for actor_head in self.actor_heads]
        actor_outputs = torch.stack(actor_outputs, dim=0)  # [num_actor_heads, batch_size, seq_len, K-1]
        action_preds = actor_outputs.mean(dim=0)  # [batch_size, seq_len, K-1]
        
        # Ordering and Applying Thresholds
        # Apply softplus to raw thresholds to ensure positivity, then cumulative sum to ensure ordering
        thresholds = torch.cumsum(F.softplus(self.raw_thresholds), dim=0)  # [K-1]
        thresholds = thresholds.unsqueeze(0).unsqueeze(0)  # [1, 1, K-1]
        
        # Compute Cumulative Probabilities using CLM
        # P(Y <= k | X) = sigmoid(logit(Y <= k | X) - theta_k)
        cumulative_probs = torch.sigmoid(action_preds - thresholds)  # [batch_size, seq_len, K-1]
        
        # Assertions to ensure correct output shapes
        assert cumulative_probs.shape[-1] == self.num_thresholds, \
            f"Expected cumulative_probs to have last dimension {self.num_thresholds}, but got {cumulative_probs.shape[-1]}"
        
        return cumulative_probs



In [4]:
def create_padding_mask(input_seq, pad_token=0):
    # Returns True for padding positions, False for valid positions
    mask = (input_seq == pad_token).all(dim=-1)
    return mask

In [5]:
def get_class_labels(action_probs, threshold=0.5):
    """
    Convert threshold probabilities to class labels.

    Args:
        action_probs (torch.Tensor): Probabilities for each ordinal threshold.
                                     Shape: [batch_size, seq_len, K-1]
        threshold (float): Probability threshold to determine class boundaries.

    Returns:
        class_labels (torch.Tensor): Predicted class indices.
                                     Shape: [batch_size, seq_len]
    """
    # Compare probabilities against the threshold
    exceeded = (action_probs > threshold).int()  # [batch_size, seq_len, K-1]

    # Sum the number of exceeded thresholds to get class labels
    class_labels = exceeded.sum(dim=-1)  # [batch_size, seq_len]

    # Clamp class_labels to be within [0, K-1]
    class_labels = torch.clamp(class_labels, 0, action_probs.size(-1))

    return class_labels

def get_class_labels_best(action_probs, thresholds):
    """
    Convert threshold probabilities to class labels using thresholds per class boundary.

    Args:
        action_probs (torch.Tensor): Probabilities for each ordinal threshold.
                                     Shape: [batch_size, seq_len, K-1]
        thresholds (float or list or torch.Tensor): Thresholds for each class boundary.
                                                    Shape: [K-1]

    Returns:
        class_labels (torch.Tensor): Predicted class indices.
                                     Shape: [batch_size, seq_len]
    """
    # Ensure thresholds is a torch tensor
    if not isinstance(thresholds, torch.Tensor):
        thresholds = torch.tensor(thresholds, device=action_probs.device, dtype=action_probs.dtype)
    else:
        thresholds = thresholds.to(action_probs.device).type_as(action_probs)

    # Reshape thresholds for broadcasting
    thresholds = thresholds.view(1, 1, -1)  # Shape: [1, 1, K-1]

    # Compare probabilities against thresholds
    exceeded = (action_probs > thresholds).int()  # Shape: [batch_size, seq_len, K-1]

    # Sum the number of thresholds exceeded to get class labels
    class_labels = exceeded.sum(dim=-1)  # Shape: [batch_size, seq_len]

    # Clamp class_labels to be within [0, K]
    class_labels = torch.clamp(class_labels, 0, action_probs.size(-1))

    return class_labels

In [6]:
# Load the sequences and labels arrays
loaded_data_sequences = np.load('sequence_array_filtered_INFERENCE_final.npz', allow_pickle=True)
loaded_data_labels = np.load('label_array_filtered_INFERENCE.npz', allow_pickle=True)

# Access the saved arrays
sequence_array = loaded_data_sequences['sequences']
label_array = loaded_data_labels['labels']
label_array = label_array[:, :, :-2]  # Remove the last two timesteps from the label array

print("Array loaded successfully!")

Array loaded successfully!


In [7]:
print(label_array.shape[:])
print(sequence_array.shape[:])

(165093, 1, 31)
(165093, 33, 307)


In [8]:
numero_de_cliente = sequence_array[:, 0, 0]

In [9]:
numero_de_cliente

array([2.91837330e+07, 2.91844680e+07, 2.91852450e+07, ...,
       1.88122277e+08, 1.88128903e+08, 1.88136205e+08])

In [10]:
sequence_array = sequence_array[:, :, 1:]  # Remove client ID (first feature)
print(sequence_array.shape[:])

(165093, 33, 306)


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [12]:
# Initialize a LabelEncoder
label_encoder = LabelEncoder()

# Flatten the label array and encode the labels as integers
label_array_flat = label_array.flatten()  # Flattening the array for label encoding
label_array_encoded = label_encoder.fit_transform(label_array_flat)

# Reshape it back to the original shape after encoding
label_array_encoded = label_array_encoded.reshape(label_array.shape)

# Print encoded classes for verification
class_names = label_encoder.classes_
print(f"Encoded Classes: {class_names}")

Encoded Classes: ['BAJA+1' 'BAJA+2' 'CONTINUA' 'OUT']


In [13]:
print(label_array_encoded.shape[:])

(165093, 1, 31)


In [14]:
Params={'lr': 1e-07, 'dropout': 0.1, 'batch_size': 500, 'num_layers': 8, 'd_model': 420, 
         'weight_baja1': 60, 'weight_baja2': 70, 'weight_continua': 1, 
         'weight_decay': 5e-5, 'dim_feedforward': 1000, 'beta1': 0.93, 'beta2': 0.97, 
         'eps': 2.1e-06, 'grad_clip': 0.69, 'gamma': 4.0, 'reward_baja_2': 117, 
         'penalty_baja_2': -3, 'miss_baja_2_penalty': -100, 'scheduler_patience': 2, 'scheduler_factor': 0.69, 
         'num_heads': 10, 'factor': 50}

In [15]:
input_dim = 306
num_classes = 5
num_heads = 4 
d_model = Params['d_model']    
dim_feedforward=Params['dim_feedforward'] 
dropout = Params['dropout']
num_layers = Params['num_layers']

num_heads = Params['num_heads']
factor = Params['factor']
d_model = num_heads * factor

In [16]:
# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using: " + device)

Using: cuda


In [17]:
class_weights_vector = [Params['weight_baja1'] , Params['weight_baja2'] , Params['weight_continua']]
class_weights = torch.tensor(class_weights_vector, dtype=torch.float32).to(device)

model = ActorCriticOrdinalTransformerModel(
    input_dim=input_dim,  # Replace with your input dimension
    d_model=512,
    num_heads=8,
    num_layers=6,
    dim_feedforward=2048,
    dropout=0.2,
    num_actor_heads=10,
    num_classes=5,
    base=10000
)


model.load_state_dict(torch.load(f'best_model_threshold_ganancia_[].pth')) 
model = model.to(device)

# Set the model to evaluation mode
model.eval()

  model.load_state_dict(torch.load(f'best_model_threshold_ganancia_[].pth'))


ActorCriticOrdinalTransformerModel(
  (linear_proj): Linear(in_features=306, out_features=512, bias=True)
  (actor_transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayerWithRoPE(
        (self_attn): MultiheadAttentionWithRoPE(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          (rotary_emb): RotaryPositionalEmbeddingROPE()
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (actor_heads): ModuleList(
    (0-9): 10 x Sequential(
      (0): Linear(in_features=512, out_f

In [18]:
sequence_tensor = torch.tensor(sequence_array, dtype=torch.float32).squeeze(1).to(device) 

In [19]:
sequence_tensor.shape

torch.Size([165093, 33, 306])

In [30]:
# Define batch size
batch_size = 512

# Prepare DataLoader for the sequence_tensor
test_dataset = TensorDataset(sequence_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize lists to collect the predictions
all_probs_last = []

# Disable gradient calculation for inference
with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Inference", leave=True)
    
    for batch in progress_bar:
        # Extract the sequences from the batch and move to device
        X_test_batch = batch[0].to(device)

        # Create the key padding mask for the test data
        src_key_padding_mask = create_padding_mask(X_test_batch, pad_token=0).to(device)

        # Forward pass: Get model predictions
        #outputs = model(X_test_batch, src_key_padding_mask)

        #outputs, critic_outputs = model(X_test_batch, src_key_padding_mask) 
        #actions_taken, action_probs, state_values = model(X_test_batch, src_key_padding_mask)
        action_preds = model(X_test_batch, src_key_padding_mask=src_key_padding_mask)

        with torch.no_grad():
            #predicted_labels = get_class_labels(action_preds, threshold=0.5)
            predicted_labels = get_class_labels_best(action_preds, thresholds=[(0.08499999999999999, 0.16000000000000011, 0.16000000000000011, 0.55)])

        last_timestep_probs = predicted_labels[:, -1].cpu().numpy()

        # Get probabilities for the last timestep
        #last_timestep_probs = torch.softmax(outputs[:, -1, :], dim=-1).cpu().numpy()  # Shape: [batch_size, num_classes]
        #last_timestep_probs = torch.softmax(action_preds[:, -1, :], dim=-1).cpu().numpy()  # Shape: [batch_size, num_classes]

        # Collect predicted probabilities for the last timestep
        all_probs_last.append(last_timestep_probs)

all_probs_last = np.concatenate(all_probs_last, axis=0)

Inference: 100%|██████████████████████████████| 323/323 [00:32<00:00,  9.85it/s]


In [31]:
class_labels_list = [0, 1, 2, 3, 4, 5]
target_names = ['BAJA+1', 'BAJA+2', 'BAJA+3', 'BAJA+4', 'CONTINUA', 'OUT']

class_counts = Counter(all_probs_last)
print("\nFinal Class Counts (Predicted for Last Timestep):")
for class_value, class_name in zip(class_labels_list, target_names):
    count = class_counts.get(class_value, 0)
    print(f"{class_name} ({class_value}): {count}")



Final Class Counts (Predicted for Last Timestep):
BAJA+1 (0): 0
BAJA+2 (1): 9526
BAJA+3 (2): 273
BAJA+4 (3): 2694
CONTINUA (4): 152600
OUT (5): 0


In [36]:
y_pred = all_probs_last

# Convert 2 (CONTINUA) and 3 (OUT) to 0 (BAJA+1)
#y_pred_converted = np.where((y_pred == 0) , 1, y_pred)
#y_pred_converted = np.where((y_pred_converted == 2) , 0, y_pred_converted)

y_pred_converted = np.where((y_pred == 2) , 0, y_pred)
y_pred_converted = np.where((y_pred_converted == 3) , 1, y_pred_converted)
y_pred_converted = np.where((y_pred_converted == 4) , 0, y_pred_converted)


#y_pred_converted = np.where((y_pred == 2) | (y_pred == 3), 0, y_pred)
#y_pred_converted = np.where((y_pred == 1) | (y_pred == 2) | (y_pred == 3), 5, y_pred)
#y_pred_converted = np.where((y_pred_converted == 0) , 1, y_pred_converted)
#y_pred_converted = np.where((y_pred_converted == 5) , 0, y_pred_converted)

# Check the new class distribution after conversion
class_counts_converted = Counter(y_pred_converted)

class_labels_list = [0, 1, 2, 3, 4, 5]
target_names = ['BAJA+1', 'BAJA+2', 'BAJA+3', 'BAJA+4', 'CONTINUA', 'OUT']

class_counts = Counter(y_pred_converted)
print("\nFinal Class Counts (Predicted for Last Timestep):")
for class_value, class_name in zip(class_labels_list, target_names):
    count = class_counts.get(class_value, 0)
    print(f"{class_name} ({class_value}): {count}")


Final Class Counts (Predicted for Last Timestep):
BAJA+1 (0): 152873
BAJA+2 (1): 12220
BAJA+3 (2): 0
BAJA+4 (3): 0
CONTINUA (4): 0
OUT (5): 0


In [37]:
y_pred_converted = y_pred_converted.astype(int)
numero_de_cliente_flat = numero_de_cliente.flatten()
numero_de_cliente_flat = numero_de_cliente_flat.astype(int)
submission_df = pd.DataFrame({
    'numero_de_cliente': numero_de_cliente_flat,
    'Predicted': y_pred_converted
})


In [38]:
# Ensure the submission file has exactly 165,093 rows (as specified in your requirements)
assert submission_df.shape[0] == 165093, "The submission file must have exactly 165,093 rows."

# Save the first submission file
print("Saving submission file...")
submission_df.to_csv('submissionNoForce.csv', index=False)
print("Submission file saved as 'submissionNoForce.csv'")


Saving submission file...
Submission file saved as 'submissionNoForce.csv'


In [39]:
import gzip
import shutil

# Compress the submission.csv file
with open('submissionNoForce.csv', 'rb') as f_in:
    with gzip.open('submissionNoForce.csv.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("gzip done")

gzip done
