In [1]:
import pickle as pkl
import numpy as np
import os
import torch

from tqdm import tqdm


os.chdir('/home/jcsanguino10/local_citation_model/recommender-fusion-recsys/loaders')
from create_dataloader_sequential import (load_course_encoder)

os.chdir('/home/jcsanguino10/local_citation_model/recommender-fusion-recsys/architectures/Sequence')
from sec_transformer_pytorch import (create_model, load_pytorch_weights)

os.chdir('/home/jcsanguino10/local_citation_model/recommender-fusion-recsys/architectures/Multimodal')
from multimodal import Autoencoder, MultimodalModel


In [2]:
if torch.cuda.is_available():
    # Change to just a particular GPU changing the enviroment variable
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    device = torch.device("cuda")
    # Change to just use a particular GPU via torch
    #torch.cuda.set_device("cuda:3")
    print(torch.cuda.current_device())

0


In [3]:
# Paths from the label encoder dicts 

PATH_TO_LABEL_ENCODER = '/home/jcsanguino10/local_citation_model/data/processed/'

# Path to folder with datasets

PATH_TO_DATASETS = '/home/jcsanguino10/local_citation_model/data/'

# Path to folder with checkpoints best models

PATH_TO_CHECKPOINTS = '/home/jcsanguino10/local_citation_model/models/'

In [4]:
label_encoder, dicts = load_course_encoder('/home', PATH_TO_LABEL_ENCODER)

Loading existing mappings from /home/jcsanguino10/local_citation_model/data/processed


In [5]:
len(label_encoder.classes_)  # List of all course IDs in the dataset

197

In [6]:
import pandas as pd
df_binary = pd.read_pickle(f'{PATH_TO_DATASETS}train_binary_all_vectors_128_01_transe_seqvec.pkl')
df_bpr_df = pd.read_pickle(f'{PATH_TO_DATASETS}train_bpr_all_vectors_128_01_transe_seqvec.pkl')

In [7]:
def concat_columns_to_tensor(df, columns, new_column_name):
    """
    Concatenates specified columns in a DataFrame and creates a tensor.
    The resulting tensor is saved in a new column.

    Args:
        df (pd.DataFrame): The input DataFrame.
        columns (list): List of column names to concatenate.
        new_column_name (str): Name of the new column to store the tensor.

    Returns:
        pd.DataFrame: The DataFrame with the new column containing tensors.
    """
    df[new_column_name] = df[columns].apply(
        lambda row: torch.tensor([item for col in columns for item in row[col]], dtype=torch.float),
        axis=1
    )
    return df

In [8]:
df_binary = concat_columns_to_tensor(df_binary, ['item_bpr_embedding'], 'course_full_embeddings')
df_binary = concat_columns_to_tensor(df_binary, ['user_bpr_embedding'], 'user_full_embeddings')

In [9]:
df_bpr_df = concat_columns_to_tensor(df_bpr_df, ['pos_item_bpr_embedding'], 'pos_course_full_embeddings')
df_bpr_df = concat_columns_to_tensor(df_bpr_df, ['neg_item_bpr_embedding'], 'neg_course_full_embeddings')
df_bpr_df = concat_columns_to_tensor(df_bpr_df, ['user_bpr_embedding'], 'user_full_embeddings')

# Autoencoder training 

In [10]:
def train_autoencoder_and_extract_encoder(data, input_dim, encoding_dims, epochs=50, lr=1e-3, 
                                         save_path=None, device='cuda', verbose=True):
    """
    Train an autoencoder.
    
    Args:
        data (torch.Tensor): Training data tensor of shape (batch_size, input_dim)
        input_dim (int): Dimension of input features
        encoding_dims (list): List of hidden layer dimensions for encoder
                             Example: [512, 256, 128] for 3-layer encoder
        epochs (int): Number of training epochs
        lr (float): Learning rate
        save_path (str): Path to save the best autoencoder model (optional)
        device (str): Device to train on ('cpu' or 'cuda')
        verbose (bool): Whether to print training progress
        
    Returns:
        encoder (nn.Module): The trained autoencoder model
    """
    
    if verbose:
        print(f"Starting autoencoder training...")
        print(f"Input dimension: {input_dim}")
        print(f"Encoding dimensions: {encoding_dims}")
        print(f"Final encoding dimension: {encoding_dims[-1]}")
        print(f"Training data shape: {data.shape}")
    
    # Create autoencoder
    autoencoder = Autoencoder(input_dim=input_dim, encoding_dims=encoding_dims)
    
    if verbose:
        print(f"Autoencoder architecture created")
        print(f"Encoder layers: {len(autoencoder.encoder)}")
        print(f"Decoder layers: {len(autoencoder.decoder)}")
    
    # Train the autoencoder using the enhanced train_autoencoder method
    trained_autoencoder = Autoencoder.train_autoencoder(
        autoencoder=autoencoder,
        data=data,
        epochs=epochs,
        lr=lr,
        save_path=save_path,
        device=device
    )
    
    
    if verbose:
        print(f"Training completed!")
        print(f"Encoder extracted successfully")
        print(f"Encoder output dimension: {encoding_dims[-1]}")
    
    return trained_autoencoder

In [11]:
course_tensor = [torch.tensor(x, dtype=torch.float) for x in df_binary['course_full_embeddings'].values]
embeddings__course_tensor = torch.stack(course_tensor)

user_tensor = [torch.tensor(x, dtype=torch.float) for x in df_binary['user_full_embeddings'].values]
embeddings_user_tensor = torch.stack(user_tensor)

  course_tensor = [torch.tensor(x, dtype=torch.float) for x in df_binary['course_full_embeddings'].values]
  user_tensor = [torch.tensor(x, dtype=torch.float) for x in df_binary['user_full_embeddings'].values]


In [12]:
#course_encoder = train_autoencoder_and_extract_encoder(embeddings__course_tensor, embeddings__course_tensor.shape[1], [720, 562, 432], save_path=f'{PATH_TO_CHECKPOINTS}encoder_course.pth' ,epochs=100, lr=1e-3, verbose=False)

In [13]:
#user_encoder = train_autoencoder_and_extract_encoder(embeddings_user_tensor, embeddings_user_tensor.shape[1], [720, 562, 432], save_path=f'{PATH_TO_CHECKPOINTS}encoder_user.pth' ,epochs=100, lr=1e-3, verbose=False)

# Multimodal training

In [14]:
modality_dims = {
    'course': embeddings__course_tensor.shape[1],
    'user': embeddings_user_tensor.shape[1]
}

In [15]:
#model_encoder = MultimodalModel(modality_dims, use_bpr=True, fusion_method='by_autoencoder',shared_dim=64, layers_per_modality=[312 ,256, 128] ,autoencoders={'course': course_encoder, 'user': user_encoder}, autoencoder_output_dim=432)

In [16]:
# Create a dataloader for BPR training using the df_bpr_df dataframe with the columns: pos_course_full_embeddings, neg_course_full_embeddings, user_full_embeddings
class BPRDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_feat = self.df.iloc[idx]['user_full_embeddings']
        pos_course_feat = self.df.iloc[idx]['pos_course_full_embeddings']
        neg_course_feat = self.df.iloc[idx]['neg_course_full_embeddings']
        return {
            'user': torch.tensor(user_feat, dtype=torch.float),
            'course_positive': torch.tensor(pos_course_feat, dtype=torch.float),
            'course_negative': torch.tensor(neg_course_feat, dtype=torch.float)
        }
bpr_dataset = BPRDataset(df_bpr_df)
bpr_dataloader = torch.utils.data.DataLoader(bpr_dataset, batch_size=64, shuffle=True)

In [17]:
# model_encoder.train_model(
#     train_loader=bpr_dataloader,
#     epochs=50,
#     lr=1e-3,
#     device='cuda',
#     save_path=f'{PATH_TO_CHECKPOINTS}multimodal_encoder_bpr_model.pth'
# )

In [18]:
model_no_encoder = MultimodalModel(modality_dims, use_bpr=True, fusion_method='concat',shared_dim=128, layers_per_modality=[128] ,autoencoders=None, autoencoder_output_dim=None)

User feature layer input dim: 128
Course feature layer input dim: 128


In [19]:
model_no_encoder.train_model(
    train_loader=bpr_dataloader,
    epochs=30,
    lr=1e-3,
    device='cuda',
    save_path=f'{PATH_TO_CHECKPOINTS}multimodal_no_encoder_bpr_model.pth',
    verbose=False
)

  'user': torch.tensor(user_feat, dtype=torch.float),
  'course_positive': torch.tensor(pos_course_feat, dtype=torch.float),
  'course_negative': torch.tensor(neg_course_feat, dtype=torch.float)


In [20]:
## Create dataloaders for binary training using the df_binary dataframe with the columns: course_full_embeddings, user_full_embeddings
class BinaryDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_feat = self.df.iloc[idx]['user_full_embeddings']
        course_feat = self.df.iloc[idx]['course_full_embeddings']
        label = self.df.iloc[idx]['label']
        return {
            'user': torch.tensor(user_feat, dtype=torch.float),
            'course_positive': torch.tensor(course_feat, dtype=torch.float),
            'targets': torch.tensor(label, dtype=torch.float)
        }

In [21]:
## Split df_binary into train and validation sets
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df_binary, test_size=0.2, random_state=42)

binary_dataset = BinaryDataset(train_df)
binary_dataloader = torch.utils.data.DataLoader(binary_dataset, batch_size=64, shuffle=True)

val_dataset = BinaryDataset(val_df)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

In [22]:
#model_binary = MultimodalModel(modality_dims, use_bpr=True, fusion_method='by_autoencoder',shared_dim=64, layers_per_modality=[312 ,256, 128] ,autoencoders={'course': course_encoder, 'user': user_encoder}, autoencoder_output_dim=432)

In [23]:
model_binary_no_encoder = MultimodalModel(modality_dims, use_bpr=False, fusion_method='concat',shared_dim=128, layers_per_modality=[128] ,autoencoders=None, autoencoder_output_dim=None)

User feature layer input dim: 128
Course feature layer input dim: 128


In [24]:
# model_binary.train_model(
#     train_loader=binary_dataloader,
#     val_loader=val_dataloader,
#     epochs=50,
#     lr=1e-3,
#     device='cuda',
#     save_path=f'{PATH_TO_CHECKPOINTS}multimodal_encoder_binary_model.pth'
# )

In [25]:
model_binary_no_encoder.train_model(
    train_loader=binary_dataloader,
    val_loader=val_dataloader,
    epochs=30,
    lr=1e-3,
    device='cuda',
    save_path=f'{PATH_TO_CHECKPOINTS}multimodal_no_encoder_binary_model.pth',
    verbose=False
)

  'user': torch.tensor(user_feat, dtype=torch.float),
  'course_positive': torch.tensor(course_feat, dtype=torch.float),


In [26]:
import pandas as pd
df_test_binary = pd.read_pickle(f'{PATH_TO_DATASETS}test_binary_all_vectors_128_01_transe_seqvec.pkl')
df_test_bpr = pd.read_pickle(f'{PATH_TO_DATASETS}test_bpr_all_vectors_128_01_transe_seqvec.pkl')

In [27]:
temp_df_train = df_binary.drop_duplicates(subset=['user_id']).reset_index(drop=True)

In [28]:
# Generate a mapping of all user IDs to the list of course IDs they have taken using the full_item_seq column in df_bpr_df
user_courses_taken = {}
for _, row in temp_df_train.iterrows():
    user_id = row['user_id']
    courses = row['full_item_seq']
    user_courses_taken[user_id] = courses

#Create a mapping of all user IDs and the user embeddings
user_embedding_mapping = {}
for _, row in temp_df_train.iterrows():
    user_id = row['user_id']
    embedding = row['user_full_embeddings']
    user_embedding_mapping[user_id] = embedding


In [29]:
temp_df_test = df_test_binary.drop_duplicates(subset=['user_id']).reset_index(drop=True)[['user_id', 'full_item_seq']]

#Create a new column in temp_df_test with the list of courses already taken by each user
temp_df_test['courses_taken'] = temp_df_test['user_id'].map(user_courses_taken)
temp_df_test['user_full_embeddings'] = temp_df_test['user_id'].map(user_embedding_mapping)

In [30]:
temp_df_test.head()

Unnamed: 0,user_id,full_item_seq,courses_taken,user_full_embeddings
0,0,[6897],"[6863, 6864]","[tensor(-0.1649), tensor(0.1331), tensor(-0.10..."
1,1,[6920],"[6865, 6866]","[tensor(-0.1568), tensor(0.1479), tensor(-0.14..."
2,2,"[6937, 6961, 6895, 6950, 6964, 6965, 6994, 702...","[6867, 6868, 6869, 6870, 6871, 6872, 6873, 687...","[tensor(-0.1206), tensor(0.1338), tensor(-0.13..."
3,3,[6996],"[6870, 6872]","[tensor(-0.1470), tensor(0.1081), tensor(-0.11..."
4,4,"[6901, 6865, 6912]","[6894, 6895, 6872, 6870]","[tensor(-0.1137), tensor(0.1420), tensor(-0.11..."


In [31]:
#Create a mapping of all course IDs to their course_full_embeddings
course_sequence_mapping = {}
for _, row in df_binary.drop_duplicates(subset=['item_id']).iterrows():
    item_id = row['item_id']
    course = row['course_full_embeddings']
    course_sequence_mapping[item_id] = course

In [32]:
os.chdir('/home/jcsanguino10/local_citation_model/Secuencial SR')
from evaluation_metrics import calculate_average_mrr, calculate_average_precision_at_k, calculate_average_ndcg_at_k, calculate_average_custom_precision_at_k


In [33]:
def generate_recommendations_per_user(df, model, courses_dict, k=5, batch_size=64):

    user_tensors = [torch.tensor(x) if not isinstance(x, torch.Tensor) else x 
                for x in df["user_full_embeddings"].values]

    user_tensors = torch.stack(user_tensors)

    all_user_embs = user_tensors  # shape [num_users, dim]

    courses_already_taken = df["courses_taken"].values
    recommendations = []

    for i in tqdm(range(0, len(all_user_embs), batch_size), desc="Generating recommendations"):
        batch = all_user_embs[i:i+batch_size]
        batch_courses_taken = courses_already_taken[i:i+batch_size]
        # Generate recommendations for the batch
        batch_recs = model.generate_k_recommendations(courses_dict, batch, batch_courses_taken, k=k)
        recommendations.extend(batch_recs)

    df["recommendations"] = recommendations
    return df


In [34]:
def test_model(df, models, courses_dict, k):
    for model in models:
        temp_df = generate_recommendations_per_user(df.drop_duplicates(subset="user_id"), model, courses_dict)
        k=5
        courses_test_dataset = temp_df["full_item_seq"].to_list()
        courses_recommended_list = temp_df["recommendations"].to_list()

        avg_mrr = calculate_average_mrr(courses_test_dataset, courses_recommended_list)
        avg_ndcg_at_k = calculate_average_ndcg_at_k(courses_test_dataset, courses_recommended_list, k)
        avg_precision_at_k = calculate_average_precision_at_k(courses_test_dataset, courses_recommended_list, k)
        avg_custom_precision_at_k = calculate_average_custom_precision_at_k(courses_test_dataset, courses_recommended_list, k)
        print(f"Average MRR: {avg_mrr}")
        print(f"Average NDCG@{k}: {avg_ndcg_at_k}")
        print(f"Average Precision@{k}: {avg_precision_at_k}")
        print(f"Average Custom Precision@{k}: {avg_custom_precision_at_k}") 


In [35]:
# test_model(temp_df_test, models=[model_no_encoder, model_binary_no_encoder], courses_dict=course_sequence_mapping, k=5)
test_model(temp_df_test, models=[model_no_encoder, model_binary_no_encoder], courses_dict=course_sequence_mapping, k=5)

Generating recommendations: 100%|██████████| 108/108 [00:00<00:00, 235.65it/s]


Average MRR: 0.1751153528583239
Average NDCG@5: 0.2395144053322115
Average Precision@5: 0.08917383068628834
Average Custom Precision@5: 0.2731118558453542


Generating recommendations: 100%|██████████| 108/108 [00:04<00:00, 26.54it/s]


Average MRR: 0.16040604206129672
Average NDCG@5: 0.21544790706181466
Average Precision@5: 0.07777939676526072
Average Custom Precision@5: 0.24131575112924358
