In [1]:
import sys
import os
from pathlib import Path

# Expose the directory one level above the current working directory to the
# import system so the project imports that follow can be resolved from it.
# The membership guard keeps sys.path free of duplicates when the cell is re-run.
module_path = str(Path.cwd().parent)
if module_path not in sys.path:
    sys.path.append(module_path)
    
from data.preprocessor import MusicDataPreprocessor


In [2]:
from utils.logger import LhydraLogger, log_function
import logging



# Notebook-wide logger at INFO level; the log directory is given relative to
# the current working directory.
logger = LhydraLogger(
    log_dir="../shipping_logs",
    log_level=logging.INFO,
)

# Mark the start of the run in the log.
logger.info("Starting data preprocessing script")

[2025-03-30 17:45:42,879] [INFO] [logger:__init__:77] - Logger initialized. Log directory: c:\Users\mecha\Documents\lhydra-hybrid\shipping_logs
[2025-03-30 17:45:42,881] [INFO] [logger:info:172] - Starting data preprocessing script


In [3]:
 # Initialize preprocessor
# NOTE(review): the config path is relative to the current directory ("./"),
# while the data paths below point at the parent directory ("../") -- confirm
# that both resolve as intended from where this notebook is run.
preprocessor = MusicDataPreprocessor(
    config_path="./training/configs/training_config.yaml",
    logger=logger,
)

# Run the preprocessing pipeline on the raw CSV and unpack the three frames it
# returns. The second argument is presumably the output directory -- TODO confirm
# against MusicDataPreprocessor.preprocess_pipeline.
train_df, val_df, test_df = preprocessor.preprocess_pipeline(
    "../data/raw/spotify_complete_dataset.csv",
    "../shipping_dataset_4_training",
)

logger.info("Data preprocessing completed successfully")

[2025-03-30 17:46:30,129] [INFO] [logger:info:172] - Initializing MusicDataPreprocessor
[2025-03-30 17:46:30,132] [INFO] [logger:info:172] - MusicDataPreprocessor initialized with config: {'user_demographic_features': ['age', 'gender', 'country'], 'user_listening_features': ['monthly_hours', 'genre_diversity', 'top_genre'], 'user_audio_preferences': ['avg_danceability', 'avg_energy', 'avg_key', 'avg_loudness', 'avg_mode', 'avg_speechiness', 'avg_acousticness', 'avg_instrumentalness', 'avg_liveness', 'avg_valence', 'avg_tempo', 'avg_time_signature'], 'track_metadata_features': ['artist', 'main_genre', 'year', 'duration_ms'], 'track_audio_features': ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'], 'target_column': 'playcount', 'target_threshold': 5, 'test_size': 0.2, 'validation_size': 0.1, 'random_state': 42, 'embedding_dim': 32, 'categorical_features': ['gender', 'country', 'top_g

In [4]:
# Display the training frame returned by preprocess_pipeline (rich repr of the
# cell's last expression).
train_df

Unnamed: 0,avg_liveness,avg_tempo,loudness,instrumentalness,avg_valence,tempo,user_id,avg_mode,avg_speechiness,avg_loudness,...,top_genre_electronic,top_genre_folk,top_genre_hip_hop,top_genre_jazz,top_genre_latin,top_genre_other,top_genre_pop,top_genre_rb_soul,top_genre_rock,top_genre_unknown
21424,-0.518001,-0.834184,0.476193,2.510766,-1.852639,-0.189835,5942,-0.221305,0.842753,0.726739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
43465,-0.426103,0.311123,0.933148,-0.559421,-0.569596,-1.461766,1652,0.044181,-0.605254,-0.049310,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37009,0.663921,-0.260592,1.185336,-0.382501,0.865575,-0.324868,975,-0.008785,0.347197,1.475843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19095,0.367932,0.043452,1.041998,-0.559411,0.064291,1.610877,3347,0.251646,-0.566118,-0.120418,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12628,-1.325635,0.477335,0.642432,-0.558665,-1.985124,0.236006,6448,-1.630095,-1.153574,1.345107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69820,0.345153,0.096681,-0.483522,-0.445376,0.110198,-0.393403,2741,-0.381766,0.430643,0.394132,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
63417,-0.353702,-0.890806,0.471882,-0.492198,-0.909719,1.471254,1769,0.080579,-0.478838,-1.313864,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
49864,-0.341822,-1.889626,0.813251,-0.559421,-0.284172,-1.655706,2840,1.149749,0.517195,0.654100,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
65513,-1.236843,0.409188,-0.217324,-0.559242,-0.739003,0.571813,6675,1.642497,-1.188045,-0.306678,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
# Inspect the training frame's column labels through the array backing its
# column Index.
train_df.columns.array

<NumpyExtensionArray>
[        'avg_liveness',            'avg_tempo',             'loudness',
     'instrumentalness',          'avg_valence',                'tempo',
              'user_id',             'avg_mode',      'avg_speechiness',
         'avg_loudness',
 ...
 'top_genre_electronic',       'top_genre_folk',    'top_genre_hip_hop',
       'top_genre_jazz',      'top_genre_latin',      'top_genre_other',
        'top_genre_pop',    'top_genre_rb_soul',       'top_genre_rock',
    'top_genre_unknown']
Length: 4096, dtype: object