In [None]:
import polars as pl
import os
from concurrent.futures import ThreadPoolExecutor

def process_parquet_file(input_path, output_path):
    """Rewrite one Parquet file with every Float64 column downcast to Float32.

    The converted frame is written to a temporary sibling of ``output_path``
    and then moved into place with ``os.replace``. This notebook calls the
    function with identical input and output directories, so writing straight
    to ``output_path`` would overwrite the source file in place, and a failure
    part-way through the write would leave no intact copy of the data.

    Args:
        input_path: Path of the Parquet file to read.
        output_path: Path the converted file ends up at; may be the same file
            as ``input_path``.

    Raises:
        Any exception raised by Polars or the OS while reading, writing or
        renaming. A leftover temporary file is removed before it propagates.
    """
    df = pl.read_parquet(input_path)

    float64_columns = [name for name, dtype in df.schema.items() if dtype == pl.Float64]
    if float64_columns:
        df = df.with_columns([pl.col(name).cast(pl.Float32) for name in float64_columns])

    # The suffix keeps the temporary file from matching the ".parquet" filter
    # used when the directory tree is scanned.
    tmp_path = f"{os.fspath(output_path)}.tmp"
    try:
        df.write_parquet(tmp_path)
        os.replace(tmp_path, output_path)
    finally:
        # After a successful replace the temporary path no longer exists, so
        # this only cleans up after a failed write or rename.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def process_partitioned_data(input_dir, output_dir, num_workers=4):
    """Run ``process_parquet_file`` over every ``.parquet`` file under ``input_dir``.

    The directory layout below ``input_dir`` is mirrored under ``output_dir``
    and the files are converted concurrently on a thread pool.

    Args:
        input_dir: Root directory that is walked recursively for files whose
            name ends with ``.parquet``.
        output_dir: Root directory for the converted files; created if missing.
        num_workers: Maximum number of worker threads.

    Raises:
        The first exception raised by ``process_parquet_file`` for any file.
    """
    os.makedirs(output_dir, exist_ok=True)

    parquet_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(input_dir)
        for file in files
        if file.endswith(".parquet")
    ]
    output_files = [
        os.path.join(output_dir, os.path.relpath(file, input_dir))
        for file in parquet_files
    ]

    # Create each distinct destination directory once.
    for directory in {os.path.dirname(path) for path in output_files}:
        os.makedirs(directory, exist_ok=True)

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved. Draining the iterator makes a failed conversion surface
        # here instead of being silently discarded.
        for _ in executor.map(process_parquet_file, parquet_files, output_files):
            pass

# Source and destination are the same directory, so process_partitioned_data
# is asked to write every converted file back over the file it was read from.
input_directory = "/home/jupyter/data/XGFeatures_partitioned"
output_directory = "/home/jupyter/data/XGFeatures_partitioned"
# num_workers sets the thread-pool size; adjust it to the machine.
process_partitioned_data(input_directory, output_directory, num_workers=8)#change number of workers accordingly


In [None]:
import polars as pl
import numpy as np
# Load the partitioned dataset into a single in-memory DataFrame named `data`,
# which the rest of the notebook works on. The path is a directory.
# NOTE(review): presumably use_pyarrow=True is what allows the whole directory
# to be read in one call — confirm against the installed Polars version.
data = pl.read_parquet("/home/jupyter/data/XGFeatures_partitioned/", use_pyarrow=True)

In [None]:
# Downcast every Float64 column to Float32; all other columns are left as-is.
float32_casts = [
    pl.col(name).cast(pl.Float32)
    for name, dtype in data.schema.items()
    if dtype == pl.Float64
]
data = data.with_columns(float32_casts)


In [None]:
import os

# Re-partition `data` by row count: one zstd-compressed Parquet file per
# window of `chunk_size` rows, numbered from 0.
output_dir = "../data/XGFeatures_row_partitioned/"
chunk_size = 2_000_000
os.makedirs(output_dir, exist_ok=True)

row_offsets = range(0, len(data), chunk_size)
for part_index, row_offset in enumerate(row_offsets):
    part_path = f"{output_dir}/XGFeatures_part_{part_index}.parquet"
    # slice() clamps at the end of the frame, so the last part may be shorter.
    data.slice(row_offset, chunk_size).write_parquet(part_path, compression="zstd")

In [None]:
# Report the total number of nulls across all columns.
# null_count() yields a one-row frame; iterating it gives one Series per
# column, whose first element is that column's null count.
null = data.null_count()
total = sum(column[0] for column in null)

print(f'total null: {total}')

In [None]:
# Summary statistics for the responder_6 column.
print(data.select("responder_6").describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert only the responder_6 column to pandas for plotting.
responder_6_data = data.select(pl.col("responder_6")).to_pandas()

# Histogram with a KDE overlay, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(responder_6_data["responder_6"], bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of responder_6", fontsize=16)
ax.set_xlabel("Responder 6", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.grid(True)
plt.show()

In [None]:
# Find columns that hold a single distinct value.
# All distinct counts are computed in one query instead of issuing a separate
# full query per column.
unique_counts = data.select(pl.all().n_unique()).row(0) if data.columns else ()
constant_columns = [
    name for name, n_distinct in zip(data.columns, unique_counts) if n_distinct == 1
]
print(f"Constant columns: {constant_columns}")


In [None]:
# Summary statistics for every column, printed to eyeball extreme outliers.
summary_stats = data.describe()
print(summary_stats)

In [None]:
# Drop feature columns whose absolute correlation with responder_6 is below
# a threshold. Author's note: ignore for now, seems too tight.
# NOTE(review): the correlations are measured on all of `data`, before the
# train/validation split in a later cell — confirm that is intended.

numerical_columns = [
    col for col in data.columns
    if col not in ['date_id', 'time_id', 'symbol_id', 'weight', 'partition_id']
    and not col.startswith('responder_')  # Exclude all responder columns
]

# Compute every correlation in a single query rather than one query per column.
correlation_with_target = {}
if numerical_columns:
    correlation_row = data.select([
        pl.corr(pl.col(col), pl.col("responder_6")).alias(col)
        for col in numerical_columns
    ]).row(0)
    correlation_with_target = dict(zip(numerical_columns, correlation_row))

low_corr_threshold = 0.05
# A correlation can come back as None or NaN. Those columns are kept rather
# than dropped: None is skipped explicitly (abs(None) would raise), and any
# comparison against NaN is False.
low_corr_features = [
    col for col, corr in correlation_with_target.items()
    if corr is not None and abs(corr) < low_corr_threshold
]

print(f"Low-correlation features: {low_corr_features}")
if low_corr_features:
    # Pass the list positionally: newer Polars releases reject `columns=` as a
    # keyword argument.
    data = data.drop(low_corr_features)
    print(f"Dropped {len(low_corr_features)} features with low correlation to responder_6.")


In [None]:
# Time-based 80/20 split: rows whose date_id is below the 0.8 quantile of
# date_id go to training, the remaining rows to validation.
split_date = data.get_column("date_id").quantile(0.8)

is_before_split = pl.col("date_id") < split_date
is_from_split = pl.col("date_id") >= split_date
train_data = data.filter(is_before_split)
valid_data = data.filter(is_from_split)

print(f"Training set: {train_data.height} rows")
print(f"Validation set: {valid_data.height} rows")


In [None]:
#feature importance
import xgboost as xgb

# Feature-importance probe: fit a shallow XGBoost model on the training split
# and rank features by how many times they are used in a split.
# NOTE(review): only responder_6, date_id and time_id are excluded here, so any
# other responder_* columns (and `weight`, if present) are used as predictors —
# confirm that this is intended and not target leakage.
features = [col for col in train_data.columns if col not in ["responder_6", "date_id", "time_id"]]

dtrain = xgb.DMatrix(
    data=train_data.select(features).to_pandas(),
    label=train_data["responder_6"].to_pandas()
)

# The number of trees is set by num_boost_round. The former 'n_estimators'
# entry in params belongs to the scikit-learn wrapper and has no effect in
# xgb.train, so it was removed to stop it contradicting num_boost_round.
model = xgb.train(
    params={'tree_method': 'hist', 'max_depth': 3},
    dtrain=dtrain,
    num_boost_round=100
)

importance = model.get_score(importance_type='weight')

# Highest split count first.
important_features = sorted(importance.items(), key=lambda x: -x[1])
print("Top important features:", important_features[:10])


In [None]:
#more efficient feature importance test
import xgboost as xgb
import numpy as np

# Sample half of the training rows (50%) to make the importance probe cheaper.
sample_fraction = 0.5
sample_seed = 42  # fixed so the same rows are drawn on every run
sample_size = int(len(train_data) * sample_fraction)
subset = train_data.sample(n=sample_size, seed=sample_seed)

# NOTE(review): only responder_6, date_id and time_id are excluded, so any
# other responder_* columns (and `weight`, if present) are used as predictors —
# confirm that this is intended and not target leakage.
features = [col for col in train_data.columns if col not in ["responder_6", "date_id", "time_id"]]
X_subset = subset.select(features).to_pandas()
y_subset = subset["responder_6"].to_pandas()

dtrain = xgb.DMatrix(data=X_subset, label=y_subset)

# The number of trees is set by num_boost_round. The former 'n_estimators'
# entry in params belongs to the scikit-learn wrapper and has no effect in
# xgb.train, so it was removed.
model = xgb.train(
    params={'tree_method': 'hist', 'max_depth': 3},
    dtrain=dtrain,
    num_boost_round=50
)

importance = model.get_score(importance_type='weight')
# Highest split count first.
important_features = sorted(importance.items(), key=lambda x: -x[1])
print("Top important features:", important_features[:10])


In [None]:
# List the columns whose name contains both "responder" and "lag" so they can
# be reviewed by hand for target leakage. This only prints the names; nothing
# is removed from `data`.
leakage_check = [
    name
    for name in data.columns
    if all(token in name for token in ("responder", "lag"))
]
print("Potential leakage columns:", leakage_check)

In [None]:
# Element-wise comparison of the stored responder_1_lag_1 column with
# responder_1 shifted down by one row; prints the resulting boolean Series.
# NOTE(review): shift(1) moves values by one row of the whole frame — confirm
# that the current row order matches how the lag column was built.
print(data["responder_1_lag_1"] == data["responder_1"].shift(1))

In [None]:
# Visual alignment check: put each responder next to a one-row shift of itself
# (aliased as the corresponding *_lag_1 name) and print the first rows.
data_check0 = data.select([
    "date_id",
    "responder_1",
    pl.col("responder_1").shift(1).alias("responder_1_lag_1")
])

data_check1 = data.select([
    "date_id",
    "responder_2",
    pl.col("responder_2").shift(1).alias("responder_2_lag_1")
])

# The first print previously referenced an undefined name `data_check`, which
# raised NameError; the frame built above is `data_check0`.
print(data_check0.head(10))
print(data_check1.head(10))


In [None]:
# Check for gaps in date_id: keep one row per (date_id, time_id) pair, order
# them, and report the rows where date_id jumps by more than 1 from the row
# before. Only date_id gaps are detected; missing time_id values within a
# date are not checked here.
missing_rows = (
    data.unique(subset=["date_id", "time_id"])
    # unique() does not guarantee row order by default, so sort first;
    # otherwise diff() would compare unrelated rows.
    .sort(["date_id", "time_id"])
    .filter(pl.col("date_id").diff() > 1)
)
print(missing_rows)
