In [1]:
import sys
import os
from pathlib import Path
import joblib
import pandas as pd

# Put the directory one level above the current working directory on the
# import path (presumably so the project-local ``utils`` package used in later
# cells resolves — confirm against the repository layout). The membership
# check keeps re-runs of this cell from appending the same entry twice.
module_path = str(Path.cwd().parent)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [1]:
from utils.logger import LhydraLogger, log_function
import logging

# Notebook-wide logger: INFO level, writing under a log directory that sits
# next to (not inside) the notebook's working directory.
logger = LhydraLogger(
    log_dir="../ship_test_1k_logs1",
    log_level=logging.INFO,
)

# Mark the start of the run in the log.
logger.info("Starting 1k data preprocessing script")

def preprocess_sample_data(sample_data_path, preprocessor_path, save_dir=None):
    """
    Load a saved preprocessor and run its preprocessing pipeline on sample data.

    Args:
        sample_data_path (str): Path to the sample data CSV file. It is passed
            straight to ``preprocessor.preprocess_pipeline``; this function
            does not read the file itself.
        preprocessor_path (str): Path to the saved joblib preprocessor file.
        save_dir (str, optional): Forwarded unchanged as the second argument
            of ``preprocess_pipeline``. Defaults to None.

    Returns:
        tuple or None: ``(train_df, val_df, test_df)`` as produced by
        ``preprocess_pipeline``, or ``None`` if any exception was raised.
        Errors are printed rather than propagated, so callers must check for
        ``None`` before unpacking the result.
    """
    try:
        # 1. Load the preprocessor.
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load preprocessor files from a trusted source.
        preprocessor = joblib.load(preprocessor_path)
        print(f"Preprocessor loaded from: {preprocessor_path}")

        # 2. Run the preprocessor's own pipeline on the sample data path.
        # Unpacking here means a pipeline that does not return exactly three
        # items is reported through the generic handler below.
        train_df, val_df, test_df = preprocessor.preprocess_pipeline(sample_data_path, save_dir)

        print("Sample data preprocessed.")
        return train_df, val_df, test_df

    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None
    except Exception as e:
        # Include the exception type: the message alone is often meaningless
        # (e.g. a KeyError prints only the missing key).
        print(f"An unexpected error occurred: {type(e).__name__}: {e}")
        return None

[2025-03-30 21:00:04,926] [INFO] [logger:__init__:77] - Logger initialized. Log directory: c:\Users\mecha\Documents\lhydra-hybrid\ship_test_1k_logs1
[2025-03-30 21:00:04,927] [INFO] [logger:info:172] - Starting 1k data preprocessing script


In [3]:

# # Example usage (the call returns a (train_df, val_df, test_df) tuple, or None on failure):
# sample_data_path = "spotify_1k_sample_data.csv"  # Replace with your sample data path
# preprocessor_path = "../shipping_dataset_4_training/preprocessor.joblib"  # Replace with your preprocessor path

# preprocessed_data = preprocess_sample_data(sample_data_path, preprocessor_path, save_dir="ship_test_1k_data")


In [21]:
from utils.data_utils import (
    load_config, 
    load_data, 
    load_preprocessed_data,
    prepare_user_item_data,
    preprocess_user_features, 
    preprocess_item_features,
    create_interaction_features,
    train_test_split_interactions,
    create_data_loaders,
    HybridRecommenderDataset
)
from utils.logging_utils import setup_logging, log_training_info

In [3]:
# Load the training configuration and display it for inspection.
# NOTE(review): the displayed config shows training.weight_decay as the string
# '1e-5', not a float. If load_config uses PyYAML (not visible here), exponent
# notation is only parsed as a float when the mantissa has a decimal point
# (e.g. 1.0e-5). Confirm the consumer casts it, or fix the YAML.
config = load_config("../training/configs/training_config.yaml")
config

{'model': {'embedding_dim': 32,
  'hidden_dims': [128, 64],
  'prediction_dims': [64, 32],
  'dropout': 0.2,
  'final_layer_size': 16,
  'user_tower': {'hidden_layers': [128, 64],
   'dropout': 0.2,
   'activation': 'relu'},
  'item_tower': {'hidden_layers': [128, 64],
   'dropout': 0.2,
   'activation': 'relu'}},
 'training': {'learning_rate': 0.001,
  'weight_decay': '1e-5',
  'num_epochs': 100,
  'patience': 10,
  'batch_size': 64,
  'num_workers': 4,
  'checkpoint_dir': 'models/checkpoints',
  'log_dir': 'logs',
  'optimizer': 'adam',
  'loss_function': 'binary_cross_entropy',
  'lr_scheduler': {'use': True, 'factor': 0.5, 'patience': 3},
  'early_stopping_patience': 5},
 'data': {'target_column': 'high_engagement',
  'train_path': 'data/interactions.csv',
  'preprocessed_data_dir': 'data/preprocessed',
  'user_features_path': 'data/user_features.csv',
  'item_features_path': 'data/item_features.csv',
  'user_demographic_features': ['age', 'gender', 'country'],
  'user_listening_fe

In [5]:
# Show the random seed stored in the data section of the config.
# NOTE(review): no cell visible here applies this seed to an RNG — confirm it
# is consumed downstream.
config['data']['random_seed']


42

## Loading processed data

In [6]:
# Load the train/validation/test frames and the accompanying preprocessor
# object from "ship_test_1k_data" (a relative path, presumably a directory
# resolved against the notebook's working directory — confirm).
train_df, val_df, test_df, preprocessor = load_preprocessed_data("ship_test_1k_data")


In [10]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,monthly_hours,duration_ms,avg_mode,avg_acousticness,avg_instrumentalness,avg_key,instrumentalness,avg_time_signature,key,mode,...,artist_The Kooks,artist_The Number Twelve Looks Like You,artist_The Verve,artist_Underoath,mood_category_angry,mood_category_happy,mood_category_peaceful,mood_category_sad,gender_Female,gender_Male
0,0.307007,1.310824,0.952471,-0.780132,-0.902954,0.023178,-0.29407,0.554011,0.416387,0.868554,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.470249,1.310824,-0.006353,0.230154,-0.603785,0.409406,-0.29407,0.248153,0.416387,0.868554,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.686707,-0.729951,0.77862,-1.212915,-0.912213,2.422214,-0.29407,0.554011,0.734969,-1.151339,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-0.819999,1.310824,-0.888979,1.07206,-0.879335,0.737489,-0.29407,-0.637225,0.416387,0.868554,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.445888,-0.729951,-2.741866,-0.848112,-0.910682,2.745544,-0.29407,0.554011,0.734969,-1.151339,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [14]:
# Check the (rows, columns) size of the test split.
test_df.shape


(200, 211)

In [15]:
# Stack the three splits into one frame with a fresh 0..n-1 index, then
# preview the first rows of the combined result.
interactions_df = pd.concat(
    (train_df, val_df, test_df),
    ignore_index=True,
)
interactions_df.head()


Unnamed: 0,monthly_hours,duration_ms,avg_mode,avg_acousticness,avg_instrumentalness,avg_key,instrumentalness,avg_time_signature,key,mode,...,artist_The Kooks,artist_The Number Twelve Looks Like You,artist_The Verve,artist_Underoath,mood_category_angry,mood_category_happy,mood_category_peaceful,mood_category_sad,gender_Female,gender_Male
0,0.307007,1.310824,0.952471,-0.780132,-0.902954,0.023178,-0.29407,0.554011,0.416387,0.868554,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.470249,1.310824,-0.006353,0.230154,-0.603785,0.409406,-0.29407,0.248153,0.416387,0.868554,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.686707,-0.729951,0.77862,-1.212915,-0.912213,2.422214,-0.29407,0.554011,0.734969,-1.151339,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-0.819999,1.310824,-0.888979,1.07206,-0.879335,0.737489,-0.29407,-0.637225,0.416387,0.868554,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.445888,-0.729951,-2.741866,-0.848112,-0.910682,2.745544,-0.29407,0.554011,0.734969,-1.151339,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [16]:
# Build the user and item feature frames from the combined interactions using
# the data section of the config.
# NOTE(review): this rebinds ``interactions_df`` to the helper's output, so
# re-running this cell on its own passes the already-processed frame back in.
# Re-run the concat cell first to restore the original input.
interactions_df, user_features_df, item_features_df = prepare_user_item_data(
    interactions_df,
    config['data'],
    use_high_engagement=True,
)

In [17]:
# Preview the first rows of the user feature frame returned by
# prepare_user_item_data.
user_features_df.head()


Unnamed: 0,age,monthly_hours,genre_diversity,avg_danceability,avg_energy,avg_key,avg_loudness,avg_mode,avg_speechiness,avg_acousticness,...,age_group_genre_teen_pop,age_group_genre_teen_rock,age_group_genre_young_adult_electronic,age_group_genre_young_adult_folk,age_group_genre_young_adult_hip_hop,age_group_genre_young_adult_jazz,age_group_genre_young_adult_other,age_group_genre_young_adult_pop,age_group_genre_young_adult_rock,user_id
0,-0.781937,0.307007,-0.000798,0.441511,0.919701,0.023178,1.231868,0.952471,1.014843,-0.780132,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,346
1,-0.241599,0.470249,-0.000798,0.294228,-0.03801,0.409406,0.323455,-0.006353,0.236458,0.230154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,625
2,-0.241599,0.686707,0.797035,1.824603,0.19479,2.422214,1.078535,0.77862,-0.174098,-1.212915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,455
3,1.041705,-0.819999,1.594868,0.817977,-0.758553,0.737489,-0.020488,-0.888979,1.530112,1.07206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,674
4,-0.174056,-0.445888,-0.000798,3.070667,-0.366072,2.745544,-0.272958,-2.741866,0.513441,-0.848112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,786


In [18]:
# Preview the first rows of the item feature frame returned by
# prepare_user_item_data.
item_features_df.head()


Unnamed: 0,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,duration_min,duration_category_long,duration_category_medium,duration_category_short,duration_category_very_long,mood_category_angry,mood_category_happy,mood_category_peaceful,mood_category_sad,track_id
0,-0.796408,1.310824,-0.869455,0.639348,0.416387,0.408878,0.868554,-0.783334,-0.565099,-0.29407,...,1.310824,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20
2,-0.475599,-0.729951,1.239807,-0.028112,0.734969,0.197401,-1.151339,0.32764,0.97781,-0.29407,...,-0.729951,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,17
5,1.449254,-0.708903,1.277281,-0.542133,-1.813687,1.116262,0.868554,1.394716,-0.096792,-0.29407,...,-0.708903,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,9
6,-1.758834,-2.037927,-1.345912,-2.29901,0.734969,-1.438079,0.868554,-1.050103,-0.333083,2.152929,...,-2.037927,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,30
7,0.807636,-0.586251,0.618806,0.478237,0.734969,0.021479,-1.151339,0.67883,-0.323313,-0.293623,...,-0.586251,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,44


In [24]:
# Target column comes from the config, falling back to 'high_engagement'
# when the key is absent.
target_col = config['data'].get('target_column', 'high_engagement')

# Build the loaders from the three splits plus the user/item feature frames;
# batch size is taken from the training section of the config.
dataloaders = create_data_loaders(
    train_df, val_df, test_df,
    user_features_df, item_features_df,
    batch_size=config['training']['batch_size'],
    target_col=target_col,
)
dataloaders

{'train': <torch.utils.data.dataloader.DataLoader at 0x2179d597e20>,
 'validation': <torch.utils.data.dataloader.DataLoader at 0x2179d9c2a30>,
 'test': <torch.utils.data.dataloader.DataLoader at 0x2179d9c2bb0>}

In [25]:
# Display the training DataLoader object (only its repr is shown; no batches
# are drawn here).
dataloaders['train']

<torch.utils.data.dataloader.DataLoader at 0x2179d597e20>