In [1]:
from tqdm import tqdm
from copy import deepcopy
import pandas as pd
import polars as pl
import numpy as np
import pickle

In [None]:
# Scan lazily instead of `read_parquet(...).lazy()`: the eager read loads every
# column of the file into memory before the frame becomes lazy, so the later
# `select(...)` / `sink_parquet(...)` calls cannot push their column projection
# down to the reader. `scan_parquet` returns a LazyFrame directly.
train_df = pl.scan_parquet(
    "data/raw/train.parquet",
    low_memory=True,
)

In [None]:
# Scan lazily instead of `read_parquet(...).lazy()`: the eager read loads every
# column of the file into memory before the frame becomes lazy, so the later
# `select(...)` / `sink_parquet(...)` calls cannot push their column projection
# down to the reader. `scan_parquet` returns a LazyFrame directly.
test_df = pl.scan_parquet(
    "data/raw/test.parquet",
    low_memory=True,
)

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load selections that were produced by a trusted source.
# The three objects are read as column selections by `preprocessing`.
temp1 = _load_pickle("features/best_pc1_contribution.pkl")
temp2 = _load_pickle("features/best_label_correlation.pkl")
temp3 = _load_pickle("features/best_overall.pkl")

In [5]:
def preprocessing(df):
    """Project ``df`` onto each of the three saved column selections.

    The selections are the notebook-level ``temp1``, ``temp2`` and ``temp3``
    objects. Each is copied into a fresh list, so the originals are never
    mutated. ``"timestamp"`` and ``"label"`` are carried along whenever ``df``
    has them, without being listed twice if a selection already names them.

    Args:
        df: Frame exposing ``collect_schema()`` and ``select()`` (e.g. a
            polars ``LazyFrame``).

    Returns:
        A 3-tuple ``(cleaned_df_1, cleaned_df_2, cleaned_df_3)`` holding the
        result of ``df.select(...)`` for ``temp1``, ``temp2`` and ``temp3``
        respectively.
    """
    # Resolve the schema once instead of once per membership check.
    available = set(df.collect_schema().names())
    extras = [name for name in ("timestamp", "label") if name in available]

    cleaned = []
    for selection in (temp1, temp2, temp3):
        # list(...) copies the selection and also accepts tuples/arrays,
        # which have no usable ``append``.
        columns = list(selection)
        for name in extras:
            # Skip extras the selection already lists: polars rejects a
            # projection that names the same output column twice.
            if name not in columns:
                columns.append(name)
        cleaned.append(df.select(columns))
    return tuple(cleaned)

In [None]:
# Build the three train projections, then write each one to its own parquet file.
cleaned_train_frames = preprocessing(train_df)
cleaned_train_df_1, cleaned_train_df_2, cleaned_train_df_3 = cleaned_train_frames

cleaned_train_paths = (
    "data/cleaned/cleaned_train_1.parquet",
    "data/cleaned/cleaned_train_2.parquet",
    "data/cleaned/cleaned_train_3.parquet",
)
for frame, out_path in zip(cleaned_train_frames, cleaned_train_paths):
    frame.sink_parquet(out_path)

In [None]:
# Build the three test projections, then write each one to its own parquet file.
cleaned_test_frames = preprocessing(test_df)
cleaned_test_df_1, cleaned_test_df_2, cleaned_test_df_3 = cleaned_test_frames

cleaned_test_paths = (
    "data/cleaned/cleaned_test_1.parquet",
    "data/cleaned/cleaned_test_2.parquet",
    "data/cleaned/cleaned_test_3.parquet",
)
for frame, out_path in zip(cleaned_test_frames, cleaned_test_paths):
    frame.sink_parquet(out_path)

In [None]:
# Keep a fixed five-column subset of the train frame as its own dataset.
popular_features_train = train_df.select(
    ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
)
# Write under data/cleaned/ like every other output of this notebook; the
# previous bare "cleaned/" path pointed at a different directory.
# NOTE(review): update any downstream reader that still expects the old path.
popular_features_train.sink_parquet("data/cleaned/popular_features_train.parquet")

In [None]:
# Keep a fixed five-column subset of the test frame as its own dataset.
popular_features_test = test_df.select(
    ["volume", "bid_qty", "ask_qty", "buy_qty", "sell_qty"]
)
# Write under data/cleaned/ like every other output of this notebook; the
# previous bare "cleaned/" path pointed at a different directory.
# NOTE(review): update any downstream reader that still expects the old path.
popular_features_test.sink_parquet("data/cleaned/popular_features_test.parquet")