In [2]:
import polars as pl

# Read the int32 feature table into a Polars DataFrame, using the PyArrow
# Parquet reader (use_pyarrow=True).
data = pl.read_parquet(
    '/home/jupyter/data/int32Features.parquet',
    use_pyarrow=True,
)
# Debug preview (disabled): print(data.head())

In [2]:
# Keep only rows whose date_id is 85 or greater; all earlier rows are dropped
# from `data` for the rest of the notebook.
data = data.filter(pl.col("date_id") >= 85)
# Debug preview (disabled): print(data.head())


In [3]:
# Identifier, weighting, partitioning and target columns that must not be
# treated as model features.
exclude_columns = [
    'date_id',
    'time_id',
    'symbol_id',
    'weight',
    'partition_id',
    "responder_6",
]
# Every remaining column of `data` (in its current column order) is a feature.
feature_columns = [name for name in data.columns if name not in exclude_columns]

In [4]:
# Add a one-step lag column "<feature>_lag_1" for every feature, computed
# separately inside each symbol_id group.
# All lag expressions are passed to a single with_columns call instead of
# rebuilding the frame once per feature in a Python loop.
# NOTE(review): shift(1) follows the current row order within each symbol_id
# group; this assumes rows are already in chronological order — confirm.
data = data.with_columns(
    [
        pl.col(feature).shift(1).over("symbol_id").alias(f"{feature}_lag_1")
        for feature in feature_columns
    ]
)

In [5]:
# Replace every null in every column of the frame with 0. This is applied to
# the whole frame, so it covers the nulls introduced by the shift(1) lag step
# as well as any nulls that were already present in the loaded columns.
# NOTE(review): fill_null targets nulls only; NaN values, if any exist, are
# left untouched — confirm that is intended.
data = data.fill_null(0)

In [None]:
# Add "<feature>_lag_diff_1" = current value minus its one-step lag for every
# feature. Requires the "<feature>_lag_1" columns to exist already.
# All expressions are passed to a single with_columns call instead of
# rebuilding the frame once per feature in a Python loop.
data = data.with_columns(
    [
        (pl.col(feature) - pl.col(f'{feature}_lag_1')).alias(f'{feature}_lag_diff_1')
        for feature in feature_columns
    ]
)

In [None]:
# Add "<feature>_lag_ratio_1" = current value divided by its one-step lag for
# every feature. Requires the "<feature>_lag_1" columns to exist already.
# The 1e-9 added to the denominator avoids dividing by an exact zero.
# NOTE(review): a lag of 0 therefore produces a very large ratio rather than
# an error — confirm downstream models tolerate that.
# All expressions are passed to a single with_columns call instead of
# rebuilding the frame once per feature in a Python loop.
data = data.with_columns(
    [
        (pl.col(feature) / (pl.col(f'{feature}_lag_1') + 1e-9)).alias(f'{feature}_lag_ratio_1')
        for feature in feature_columns
    ]
)

In [7]:
# Persist the current frame (original columns plus whatever derived columns
# have been added so far) as a Snappy-compressed Parquet file for XGBoost.
data.write_parquet(
    '/home/jupyter/data/Features.parquet',
    compression='snappy',
)

In [None]:
# Preview the first rows, then report how many nulls remain in the whole frame.
print(data.head())
# null_count() returns a one-row frame of per-column counts, and calling
# DataFrame.sum() on it would still give a frame (one sum per column), not a
# single number. Adding up that single row yields one integer for the message.
total_nulls = sum(data.null_count().row(0))
print(f"Total null values in dataset: {total_nulls}")

In [3]:
import os
# Split the frame into consecutive row chunks and write each chunk to its own
# ZSTD-compressed Parquet file named XGFeatures_part_<index>.parquet.
output_dir = "/home/jupyter/data/partitioned/"
os.makedirs(output_dir, exist_ok=True)
chunk_size = 1_500_000  # maximum number of rows per output file
for i in range(0, len(data), chunk_size):
    # slice() stops at the end of the frame, so the last chunk may be shorter.
    chunk = data.slice(i, chunk_size)
    # os.path.join avoids the doubled "/" that plain string concatenation
    # produced when output_dir already ends with a separator.
    chunk.write_parquet(
        os.path.join(output_dir, f"XGFeatures_part_{i // chunk_size}.parquet"),
        compression="zstd",
    )

In [None]:
# Per-column null counts for the current frame, printed for inspection.
null_counts = data.null_count()
print(null_counts)

In [None]:
# List every column name currently in the frame.
print(data.columns)

In [None]:
# Write the current frame to a Snappy-compressed Parquet file at a path
# relative to the notebook's working directory.
data.write_parquet(
    '../data/XGFeatures.parquet',
    compression='snappy',
)

In [None]:
from sklearn.preprocessing import MinMaxScaler#for TFT

# Min-max scale the columns listed in feature_columns and save the result.
# NOTE(review): the scaler is fitted on every row currently in `data`; if a
# train/validation split is made afterwards, its statistics include the
# held-out rows — confirm this is acceptable.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data.select(feature_columns).to_numpy())

# The NumPy result has one row per sample and one column per feature, so the
# orientation is stated explicitly rather than left to inference.
scaled_df = pl.DataFrame(scaled_features, schema=feature_columns, orient="row")
# Drop the unscaled columns and append the scaled ones; the scaled features
# therefore end up after all remaining columns.
data = data.drop(feature_columns).hstack(scaled_df)

# A stray trailing ''' used to follow this call and left the cell with an
# unterminated string literal; it has been removed.
data.write_parquet('../data/TFT_Featured_Scaled.parquet', compression='snappy')

In [None]:
import polars as pl #lazy way to fill null values(explored due to memory issues)


# Lazily scan the saved feature file, replace nulls in every column with 0,
# then materialise the result into a DataFrame and preview its first rows.
data = (
    pl.scan_parquet('../data/XGFeatures.parquet')
    .with_columns(pl.col("*").fill_null(0))
    .collect()
)
print(data.head())