# Best practice: Set the working directory and import path to the project root


In [1]:
%load_ext autoreload
%autoreload 2
from constants import ROOT_DIR
import os
import sys
# Make the project root importable and the current working directory, so the
# project-absolute imports used below (`configs.*`, `src.*`) resolve from it.
# sys.path entries must be plain strings (other types are ignored by the import
# system), so normalise ROOT_DIR once in case it is a path-like object.
root_dir = os.fspath(ROOT_DIR)
if root_dir not in sys.path:
    sys.path.append(root_dir)
os.chdir(root_dir)

# Load Imports and Configs

In [None]:
from configs.data_config import DataConfig
from configs.processing_config import DataProcessingConfig
import pandas as pd
from src.feature_processing.process_features import ProcessFeatures

# Instantiate the project configuration objects once; later cells read file
# locations from `data_config` and pass `processing_config` to ProcessFeatures.
data_config, processing_config = (
    DataConfig(),
    DataProcessingConfig(),
)

# Load Processed Data

In [37]:
# Read the three processed tables from the locations declared in `data_config`.
users = pd.read_parquet(data_config.processed_users.file_path)
movies = pd.read_parquet(data_config.processed_movies.file_path)
ratings = pd.read_parquet(data_config.processed_ratings.file_path)

# A cell renders only its final expression, so bare `users.head()` and
# `movies.head()` lines would be silently discarded. `display` (injected into
# the namespace by the IPython kernel) renders all three previews.
display(users.head(), movies.head(), ratings.head())

Unnamed: 0,user_id,movie_id,rating,timestamp,interaction_num,qid,label
0,1,1193,5,978300760,42.0,0,1
1,1,661,3,978302109,23.0,1,1
2,1,914,3,978301968,28.0,2,1
3,1,3408,4,978300275,47.0,3,1
4,1,2355,5,978824291,4.0,4,1


# Process Features for Training

In [35]:
# Build the feature processor from the loaded tables and run the training
# pipeline. Later cells read `train_data`, `val_data` and `test_data` from
# `pf_object`, so the name is kept.
pf_object = ProcessFeatures(users, movies, ratings, processing_config)

# The call returns an object whose repr would otherwise be echoed as the cell
# output; the trailing semicolon suppresses that noise.
pf_object.process_for_training();

[32m2025-05-04 16:04:35.718[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mcombine_features[0m:[36m34[0m - [1mCombining features to Ratings[0m
[32m2025-05-04 16:04:39.981[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mcombine_features[0m:[36m58[0m - [1mCompleted combining features to Ratings[0m
[32m2025-05-04 16:04:39.985[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mtrain_test_split[0m:[36m62[0m - [1mCreating train and val data for modelling[0m
[32m2025-05-04 16:04:40.702[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mtrain_test_split[0m:[36m82[0m - [1mCompleted creating train and val data for modelling[0m
[32m2025-05-04 16:04:40.703[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mnumerical_feature_processing_for_training[0m:[36m86[0m - [1mProcessing numerical features[0m
[32m2025-05-04 16:04:42.4

<src.feature_processing.process_features.ProcessFeatures at 0x470f20dd0>

# Save Processed Data for Training

In [36]:
# Write each split held by the feature processor to the parquet location
# declared for it in `data_config`, in train / val / test order.
for split_frame, split_target in (
    (pf_object.train_data, data_config.train),
    (pf_object.val_data, data_config.val),
    (pf_object.test_data, data_config.test),
):
    split_frame.to_parquet(split_target.file_path)

# Test Process Features for Inference

In [38]:
# Exercise the inference path end to end.
# The fresh instance gets its own name so the training-stage `pf_object`
# (whose train/val/test frames the save cell reads) is not overwritten;
# re-running that save cell after this one would otherwise break.
pf_inference = ProcessFeatures(users, movies, ratings, processing_config)

# NOTE(review): `load()` presumably restores processor state persisted during
# training (it logs "Loaded") - confirm against ProcessFeatures.load.
pf_object_loaded = pf_inference.load()
pf_object_loaded.process_for_inference()
master_data = pf_object_loaded.master_data

[32m2025-05-04 16:06:12.135[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mload[0m:[36m16[0m - [1mLoaded[0m
[32m2025-05-04 16:06:12.138[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mcombine_features[0m:[36m34[0m - [1mCombining features to Ratings[0m
[32m2025-05-04 16:06:16.497[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mcombine_features[0m:[36m58[0m - [1mCompleted combining features to Ratings[0m
[32m2025-05-04 16:06:16.499[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mnumerical_feature_processing_for_inference[0m:[36m113[0m - [1mProcessing numerical features[0m
[32m2025-05-04 16:06:18.135[0m | [1mINFO    [0m | [36msrc.feature_processing.process_features[0m:[36mnumerical_feature_processing_for_inference[0m:[36m115[0m - [1mCompleted processing numerical features[0m
[32m2025-05-04 16:06:18.141[0m | [1mINFO    [0m | [

In [41]:
# Persist the inference-ready features as the model's inference input.
# The preceding cell already ran load() + process_for_inference() on the same
# inputs and kept the result in `master_data`, so reuse it instead of
# re-running the whole pipeline (and rebinding `pf_object`) a second time.
# NOTE(review): assumes the pipeline is deterministic and does not mutate
# `users` / `movies` / `ratings` in place - confirm.
master_data.to_parquet(data_config.inference_model_input.file_path)