In [None]:
import pickle as pkl
import numpy as np
import os
import torch

from tqdm import tqdm


os.chdir('/home/jcsanguino10/local_citation_model/recommender-fusion-recsys/loaders')
from create_dataloader_sequential import (load_course_encoder)

os.chdir('/home/jcsanguino10/local_citation_model/recommender-fusion-recsys/architectures/Sequence')
from sec_transformer_pytorch import (create_model, load_pytorch_weights)

os.chdir('/home/jcsanguino10/local_citation_model/recommender-fusion-recsys/architectures/Multimodal')
from multimodal import Autoencoder, MultimodalModel






In [None]:
# Pin the notebook to a single physical GPU through the environment variable.
# CUDA_VISIBLE_DEVICES is read when the CUDA runtime initialises, and
# torch.cuda.is_available() may trigger that initialisation, so the variable
# has to be set *before* the first torch.cuda call or the pin can be ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

if torch.cuda.is_available():
    device = torch.device("cuda")
    # Alternative: select the GPU via torch instead of the environment variable
    #torch.cuda.set_device("cuda:3")
    print(torch.cuda.current_device())
else:
    # Keep `device` defined on hosts without a usable GPU (including hosts
    # that do not have a physical GPU with index 3).
    device = torch.device("cpu")

In [None]:
# Root of the project data/model tree. The default is the original location;
# export LOCAL_CITATION_MODEL_ROOT before starting the kernel to run the
# notebook from another checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    'LOCAL_CITATION_MODEL_ROOT', '/home/jcsanguino10/local_citation_model'
).rstrip('/')

# Folder with the label-encoder dictionaries (trailing slash is required:
# file names are appended with plain string formatting further down)
PATH_TO_LABEL_ENCODER = f'{PROJECT_ROOT}/data/processed/'

# Folder with the train/test datasets
PATH_TO_DATASETS = f'{PROJECT_ROOT}/data/'

# Folder where model checkpoints are written
PATH_TO_CHECKPOINTS = f'{PROJECT_ROOT}/models/'

In [None]:
# Load the course label encoder together with the accompanying `dicts` object.
# NOTE(review): the meaning of the first argument ('/home') is defined inside
# load_course_encoder, which is not visible here — confirm it is the intended
# base path and not a leftover.
label_encoder, dicts = load_course_encoder('/home', PATH_TO_LABEL_ENCODER)

In [None]:
len(label_encoder.classes_)  # Number of classes known to the encoder (presumably one per course ID — confirm)

In [None]:
import pandas as pd
# Training data in two layouts: a "binary" frame and a "bpr" frame (the latter
# carries pos_/neg_ item columns, see the concatenation cells below).
# NOTE: read_pickle can execute arbitrary code while loading — only open
# pickle files that come from a trusted source.
df_binary = pd.read_pickle(f'{PATH_TO_DATASETS}train_binary_all_vectors_128_01_transe_seqvec.pkl')
df_bpr_df = pd.read_pickle(f'{PATH_TO_DATASETS}train_bpr_all_vectors_128_01_transe_seqvec.pkl')

In [None]:
# Preview the first rows of the binary training frame.
df_binary.head()

In [None]:
def concat_columns_to_tensor(df, columns, new_column_name):
    """
    Build one flat float tensor per row from several vector-valued columns.

    For every row, the sequences stored in ``columns`` are concatenated in the
    order given and stored as a ``torch.float`` tensor in ``new_column_name``.

    Note:
        ``df`` is modified in place: the new column is added to the frame that
        was passed in, and that same object is returned so the call can be
        used in an assignment.

    Args:
        df (pd.DataFrame): The input DataFrame.
        columns (list): Names of the columns to concatenate, in output order.
            Each cell must hold a numeric sequence (list, array, ...).
        new_column_name (str): Name of the column that receives the tensors.

    Returns:
        pd.DataFrame: ``df`` itself, now containing ``new_column_name``.

    Raises:
        KeyError: If any name in ``columns`` is not a column of ``df``.
    """
    def _row_to_tensor(row):
        # Concatenate whole arrays instead of walking every scalar in a Python
        # loop; the element order (column by column) is unchanged.
        parts = [np.asarray(row[col]) for col in columns]
        # np.concatenate rejects an empty list; keep returning an empty tensor
        # when no columns were requested.
        flat = np.concatenate(parts) if parts else np.empty(0)
        return torch.tensor(flat, dtype=torch.float)

    df[new_column_name] = df[columns].apply(_row_to_tensor, axis=1)
    return df

In [None]:
# Binary frame: one concatenated vector for the course side, one for the user side.
df_binary = concat_columns_to_tensor(
    df_binary,
    ['item_text_embedding', 'item_bpr_embedding', 'item_graph_embedding'],
    'course_full_embeddings',
)
df_binary = concat_columns_to_tensor(
    df_binary,
    ['user_text_embedding', 'user_bpr_embedding', 'user_graph_embedding', 'user_sequence_embedding'],
    'user_full_embeddings',
)

In [None]:
# BPR frame: concatenated vectors for the positive course, the negative course
# and the user of every triplet.
df_bpr_df = concat_columns_to_tensor(
    df_bpr_df,
    ['pos_item_text_embedding', 'pos_item_bpr_embedding', 'pos_item_graph_embedding'],
    'pos_course_full_embeddings',
)
df_bpr_df = concat_columns_to_tensor(
    df_bpr_df,
    ['neg_item_text_embedding', 'neg_item_bpr_embedding', 'neg_item_graph_embedding'],
    'neg_course_full_embeddings',
)
df_bpr_df = concat_columns_to_tensor(
    df_bpr_df,
    ['user_text_embedding', 'user_bpr_embedding', 'user_graph_embedding', 'user_sequence_embedding'],
    'user_full_embeddings',
)

# Autoencoder training 

In [None]:
def train_autoencoder_and_extract_encoder(data, input_dim, encoding_dims, epochs=50, lr=1e-3,
                                         save_path=None, device='cuda', verbose=True):
    """
    Build an ``Autoencoder`` and train it on ``data``.

    Despite the function name, no encoder is split off here: the value
    returned is the complete autoencoder handed back by
    ``Autoencoder.train_autoencoder``.

    Args:
        data (torch.Tensor): Training data tensor of shape (num_samples, input_dim)
        input_dim (int): Dimension of input features
        encoding_dims (list): List of hidden layer dimensions for encoder
                             Example: [512, 256, 128] for 3-layer encoder
        epochs (int): Number of training epochs
        lr (float): Learning rate
        save_path (str): Forwarded to ``Autoencoder.train_autoencoder``; intended
                         as the path where the best model is saved (optional)
        device (str): Device to train on ('cpu' or 'cuda')
        verbose (bool): Whether to print progress messages from this wrapper

    Returns:
        The trained autoencoder (encoder and decoder), exactly as returned by
        ``Autoencoder.train_autoencoder``.
    """

    if verbose:
        print("🚀 Starting autoencoder training...")
        print(f"Input dimension: {input_dim}")
        print(f"Encoding dimensions: {encoding_dims}")
        print(f"Final encoding dimension: {encoding_dims[-1]}")
        print(f"Training data shape: {data.shape}")

    # Create autoencoder
    autoencoder = Autoencoder(input_dim=input_dim, encoding_dims=encoding_dims)

    if verbose:
        print("📊 Autoencoder architecture created")
        print(f"Encoder layers: {len(autoencoder.encoder)}")
        print(f"Decoder layers: {len(autoencoder.decoder)}")

    # Training is delegated to the project's Autoencoder.train_autoencoder,
    # called on the class with the instance passed explicitly.
    trained_autoencoder = Autoencoder.train_autoencoder(
        autoencoder=autoencoder,
        data=data,
        epochs=epochs,
        lr=lr,
        save_path=save_path,
        device=device
    )

    if verbose:
        print("✅ Training completed!")
        # The previous message claimed an encoder had been extracted, which
        # this function never does.
        print("🔧 Returning the full trained autoencoder")
        print(f"📐 Encoder output dimension: {encoding_dims[-1]}")

    return trained_autoencoder

In [None]:
# The *_full_embeddings cells already hold tensors. Calling torch.tensor() on
# a tensor makes a redundant copy and emits a "copy construct" UserWarning for
# every row; torch.as_tensor reuses the stored tensor (and still converts
# list/array cells). torch.stack allocates the final matrix in either case, so
# the stacked results do not share memory with the DataFrame.
course_tensor = [torch.as_tensor(x, dtype=torch.float32) for x in df_binary['course_full_embeddings'].values]
embeddings__course_tensor = torch.stack(course_tensor)

user_tensor = [torch.as_tensor(x, dtype=torch.float32) for x in df_binary['user_full_embeddings'].values]
embeddings_user_tensor = torch.stack(user_tensor)

In [None]:
# Autoencoder over the concatenated course vectors (hidden sizes 680 -> 560 -> 360).
course_encoder = train_autoencoder_and_extract_encoder(
    embeddings__course_tensor,
    embeddings__course_tensor.shape[1],
    [680, 560, 360],
    epochs=100,
    lr=1e-3,
    save_path=f'{PATH_TO_CHECKPOINTS}encoder_course.pth',
    verbose=False,
)

In [None]:
# Autoencoder over the concatenated user vectors (hidden sizes 680 -> 560 -> 360).
user_encoder = train_autoencoder_and_extract_encoder(
    embeddings_user_tensor,
    embeddings_user_tensor.shape[1],
    [680, 560, 360],
    epochs=100,
    lr=1e-3,
    save_path=f'{PATH_TO_CHECKPOINTS}encoder_user.pth',
    verbose=False,
)

# Multimodal training

In [None]:
# Input width of each modality, read from the stacked embedding matrices.
modality_dims = dict(
    course=embeddings__course_tensor.shape[1],
    user=embeddings_user_tensor.shape[1],
)

In [None]:
# BPR-objective model using 'by_autoencoder' fusion with the two autoencoders trained above.
model_encoder = MultimodalModel(
    modality_dims,
    use_bpr=True,
    fusion_method='by_autoencoder',
    shared_dim=32,
    layers_per_modality=[256, 128, 64],
    autoencoders={'course': course_encoder, 'user': user_encoder},
    autoencoder_output_dim=360,
)

In [None]:
# Dataset for BPR training built on df_bpr_df, using the columns:
# pos_course_full_embeddings, neg_course_full_embeddings, user_full_embeddings
class BPRDataset(torch.utils.data.Dataset):
    """Serve (user, positive course, negative course) feature triplets.

    Each item is a dict with the keys ``'user'``, ``'course_positive'`` and
    ``'course_negative'``; every value is a float32 tensor that is an
    independent copy of the data stored in the DataFrame.
    """

    def __init__(self, df):
        # The frame is kept by reference; rows are addressed by position.
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_float_tensor(value):
        """Return ``value`` as a new float32 tensor without a copy-construct warning."""
        if torch.is_tensor(value):
            # torch.tensor(existing_tensor) works but warns on every call;
            # detach().clone() is the recommended way to get the same copy.
            return value.detach().clone().to(torch.float)
        return torch.tensor(value, dtype=torch.float)

    def __getitem__(self, idx):
        # Column-first lookup fetches just the needed cell instead of building
        # a full row Series once per field.
        return {
            'user': self._as_float_tensor(self.df['user_full_embeddings'].iloc[idx]),
            'course_positive': self._as_float_tensor(self.df['pos_course_full_embeddings'].iloc[idx]),
            'course_negative': self._as_float_tensor(self.df['neg_course_full_embeddings'].iloc[idx])
        }
# Wrap the BPR frame and serve it in shuffled mini-batches of 64 triplets.
bpr_dataset = BPRDataset(df_bpr_df)
bpr_dataloader = torch.utils.data.DataLoader(
    bpr_dataset,
    shuffle=True,
    batch_size=64,
)

In [None]:
# Train the autoencoder-fusion BPR model on the BPR triplets for 5 epochs.
# No validation loader is passed here. The device is hard-coded to 'cuda', so
# this cell needs a GPU. save_path presumably receives the model checkpoint —
# confirm against MultimodalModel.train_model.
model_encoder.train_model(
    train_loader=bpr_dataloader,
    epochs=5,
    lr=1e-3,
    device='cuda',
    save_path=f'{PATH_TO_CHECKPOINTS}multimodal_encoder_bpr_model.pth'
)

In [None]:
# BPR-objective baseline: 'concat' fusion, no autoencoders, deeper per-modality stack.
model_no_encoder = MultimodalModel(
    modality_dims,
    use_bpr=True,
    fusion_method='concat',
    shared_dim=32,
    layers_per_modality=[680, 560, 360, 256, 128, 64],
    autoencoders=None,
    autoencoder_output_dim=None,
)

In [None]:
# Train the concat-fusion BPR baseline with the same loader and settings as
# the autoencoder variant (5 epochs, lr 1e-3, no validation loader).
# The device is hard-coded to 'cuda', so this cell needs a GPU.
model_no_encoder.train_model(
    train_loader=bpr_dataloader,
    epochs=5,
    lr=1e-3,
    device='cuda',
    save_path=f'{PATH_TO_CHECKPOINTS}multimodal_no_encoder_bpr_model.pth'
)

In [None]:
## Dataset for binary training built on df_binary, using the columns:
## course_full_embeddings, user_full_embeddings, label
class BinaryDataset(torch.utils.data.Dataset):
    """Serve (user, course, label) samples for the binary objective.

    Each item is a dict with the keys ``'user'``, ``'course_positive'`` (the
    course features, whatever the label) and ``'targets'``; every value is a
    float32 tensor that is an independent copy of the DataFrame data.
    """

    def __init__(self, df):
        # The frame is kept by reference; rows are addressed by position.
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_float_tensor(value):
        """Return ``value`` as a new float32 tensor without a copy-construct warning."""
        if torch.is_tensor(value):
            # torch.tensor(existing_tensor) works but warns on every call;
            # detach().clone() is the recommended way to get the same copy.
            return value.detach().clone().to(torch.float)
        return torch.tensor(value, dtype=torch.float)

    def __getitem__(self, idx):
        # Column-first lookup fetches just the needed cell instead of building
        # a full row Series once per field.
        return {
            'user': self._as_float_tensor(self.df['user_full_embeddings'].iloc[idx]),
            'course_positive': self._as_float_tensor(self.df['course_full_embeddings'].iloc[idx]),
            'targets': self._as_float_tensor(self.df['label'].iloc[idx])
        }

In [None]:
## Hold out 20% of df_binary for validation (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df_binary,
    test_size=0.2,
    random_state=42,
)

# Training batches are shuffled
binary_dataset = BinaryDataset(train_df)
binary_dataloader = torch.utils.data.DataLoader(
    binary_dataset,
    batch_size=64,
    shuffle=True,
)

# Validation batches keep their order
val_dataset = BinaryDataset(val_df)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
)

In [None]:
# Binary-objective model (use_bpr=False) using 'by_autoencoder' fusion with the trained autoencoders.
model_binary = MultimodalModel(
    modality_dims,
    use_bpr=False,
    fusion_method='by_autoencoder',
    shared_dim=32,
    layers_per_modality=[256, 128, 64],
    autoencoders={'course': course_encoder, 'user': user_encoder},
    autoencoder_output_dim=360,
)

In [None]:
# Binary-objective baseline (use_bpr=False): 'concat' fusion, no autoencoders.
model_binary_no_encoder = MultimodalModel(
    modality_dims,
    use_bpr=False,
    fusion_method='concat',
    shared_dim=32,
    layers_per_modality=[680, 560, 360, 256, 128, 64],
    autoencoders=None,
    autoencoder_output_dim=None,
)

In [None]:
# Train the autoencoder-fusion binary model for 20 epochs, passing the 20%
# hold-out as the validation loader.
# The device is hard-coded to 'cuda', so this cell needs a GPU.
model_binary.train_model(
    train_loader=binary_dataloader,
    val_loader=val_dataloader,
    epochs=20,
    lr=1e-3,
    device='cuda',
    save_path=f'{PATH_TO_CHECKPOINTS}multimodal_encoder_binary_model.pth'
)

In [None]:
# Train the concat-fusion binary baseline with the same loaders and settings
# as the autoencoder variant (20 epochs, lr 1e-3).
# The device is hard-coded to 'cuda', so this cell needs a GPU.
model_binary_no_encoder.train_model(
    train_loader=binary_dataloader,
    val_loader=val_dataloader,
    epochs=20,
    lr=1e-3,
    device='cuda',
    save_path=f'{PATH_TO_CHECKPOINTS}multimodal_no_encoder_binary_model.pth'
)

In [None]:
# pandas was already imported in an earlier cell; the repeated import is harmless.
import pandas as pd
# Test-split counterparts of the two training frames.
# NOTE: read_pickle can execute arbitrary code while loading — only open
# pickle files that come from a trusted source.
df_test_binary = pd.read_pickle(f'{PATH_TO_DATASETS}test_binary_all_vectors_128_01_transe_seqvec.pkl')
df_test_bpr = pd.read_pickle(f'{PATH_TO_DATASETS}test_bpr_all_vectors_128_01_transe_seqvec.pkl')

In [None]:
# Preview the BPR *training* frame (not the test frames loaded just above).
df_bpr_df.head()

In [None]:
# Map each user_id to its concatenated user_full_embeddings tensor, taken from
# the first training row of that user. (Despite the variable name, this is the
# full user vector, not only the sequence embedding.)
user_sequence_mapping = df_binary.drop_duplicates(subset="user_id").set_index('user_id')['user_full_embeddings'].to_dict()

# Give every test row the training-time user vector of its user.
# NOTE(review): Series.map yields NaN for user_ids that are absent from the
# training frame; such rows cannot be stacked into a tensor later — confirm
# that every test user also occurs in df_binary, or filter those rows.
df_test_binary['user_full_embeddings'] = df_test_binary['user_id'].map(user_sequence_mapping)
df_test_bpr['user_full_embeddings'] = df_test_bpr['user_id'].map(user_sequence_mapping)

In [None]:
# Map each course id to its concatenated course vector (first occurrence wins).
# Only courses that appear as a positive item in the BPR training frame are
# included — NOTE(review): confirm this covers the whole candidate catalogue.
course_sequence_mapping = df_bpr_df.drop_duplicates(subset="pos_item_id").set_index('pos_item_id')['pos_course_full_embeddings'].to_dict()

In [None]:
def generate_recommendations_per_user(df, model, courses_dict, k=5, batch_size=64):
    """
    Attach top-k recommendations to every row of ``df``.

    The per-row tensors in ``df["user_full_embeddings"]`` are stacked into one
    matrix and handed to ``model.generate_k_recommendations`` in slices of
    ``batch_size`` rows; the results are collected in row order and stored in
    a ``"recommendations"`` column.

    Note:
        ``df`` is modified in place and the same object is returned. Pass an
        explicit ``.copy()`` if the frame is a slice of another DataFrame and
        pandas' SettingWithCopyWarning should be avoided.

    Args:
        df (pd.DataFrame): Frame with one row per user and a
            ``user_full_embeddings`` column of equally shaped tensors.
        model: Object exposing ``generate_k_recommendations(courses_dict,
            batch, k=k)``; it is expected to return one entry per row of
            ``batch`` (assumed from how the results are collected — confirm).
        courses_dict (dict): Candidate courses, passed through to the model.
        k (int): Number of recommendations requested per user.
        batch_size (int): Number of users scored per model call.

    Returns:
        pd.DataFrame: ``df`` itself, with the ``"recommendations"`` column set.

    Raises:
        ValueError: If any row has no tensor in ``user_full_embeddings`` (for
            example NaN left behind by a failed user_id lookup).
    """
    user_embeddings = df["user_full_embeddings"].tolist()

    # Fail early with an actionable message instead of an opaque stack error.
    n_invalid = sum(1 for emb in user_embeddings if not torch.is_tensor(emb))
    if n_invalid:
        raise ValueError(
            f"{n_invalid} of {len(user_embeddings)} rows have no tensor in "
            "'user_full_embeddings' (e.g. users missing from the embedding mapping)"
        )

    recommendations = []
    if user_embeddings:  # an empty frame simply gets an empty column
        # torch.stack needs a list/tuple of tensors; the NumPy object array
        # returned by Series.values is not accepted.
        all_user_embs = torch.stack(user_embeddings)  # shape [num_users, dim]

        for i in tqdm(range(0, len(all_user_embs), batch_size), desc="Generating recommendations"):
            batch = all_user_embs[i:i+batch_size]
            batch_recs = model.generate_k_recommendations(courses_dict, batch, k=k)
            recommendations.extend(batch_recs)

    df["recommendations"] = recommendations
    return df


In [None]:
# Score each test user once (first row per user_id) with the concat-fusion
# binary model, using the courses in course_sequence_mapping as candidates.
# k and batch_size are left at the function defaults.
test_df_1 = generate_recommendations_per_user(df_test_binary.drop_duplicates(subset="user_id"),model_binary_no_encoder, course_sequence_mapping)

In [None]:
# Inspect the generated recommendations, one entry per test user.
test_df_1["recommendations"]

In [None]:
os.chdir('/home/jcsanguino10/local_citation_model/Secuencial SR')
from evaluation_metrics import calculate_average_mrr, calculate_average_precision_at_k, calculate_average_ndcg_at_k, calculate_average_custom_precision_at_k
