In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the raw training set from Parquet. The path is relative to the
# notebook's own location (the Exploration folder).
train_parquet_path = '../data/jane-street/train.parquet'
train_data = pd.read_parquet(train_parquet_path)

# Display the first few rows
#train_data.head()


In [2]:
# Keep only rows whose date_id is 85 or later (earlier days are discarded).
train_data = train_data.loc[lambda frame: frame['date_id'] >= 85]

# Remove rows in which every single column is missing.
train_data = train_data.dropna(axis=0, how='all')

In [3]:
# Columns that are never treated as features: the fixed identifier/bookkeeping
# columns first, followed by every column whose name starts with 'responder_'.
exclude_columns = ['date_id', 'time_id', 'symbol_id', 'weight', 'partition_id']
exclude_columns.extend(
    name for name in train_data.columns if name.startswith('responder_')
)

# Everything else is handled as a numerical column downstream. Selection is by
# name only; dtypes are not inspected here.
numerical_columns = [
    name for name in train_data.columns
    if name not in exclude_columns
]


In [4]:
# Fill gaps in the numerical columns by linear interpolation down the rows:
# each column is interpolated independently, in the frame's current row order.
# NOTE(review): no grouping is applied, so neighbouring rows are used whatever
# their symbol_id/date_id — confirm this is intended.
train_data[numerical_columns] = (
    train_data[numerical_columns]
    .interpolate(method='linear', axis='index')
)

# Any row that still has a NaN in any column after interpolation is removed.
train_data = train_data.dropna(axis=0, how='any')


In [5]:
# First and third quartile of every numerical column, and their spread (IQR).
q1, q3 = (
    train_data[numerical_columns].quantile(level)
    for level in (0.25, 0.75)
)
iqr = q3 - q1

# Tukey fences: 1.5 * IQR below Q1 and above Q3, one pair of bounds per column.
whisker = 1.5 * iqr
lower_bound = q1 - whisker
upper_bound = q3 + whisker

# Clamp each column to its own fences; axis='columns' aligns the per-column
# bound Series with the frame's column labels.
# NOTE(review): a column whose IQR is 0 has identical lower/upper bounds and is
# therefore clipped to a single value — confirm this is intended.
train_data[numerical_columns] = train_data[numerical_columns].clip(
    lower=lower_bound,
    upper=upper_bound,
    axis='columns',
)


In [None]:
#print(train_data[numerical_columns].head())

In [None]:
# Disabled per-column diagnostic. The loop below is wrapped in a string
# literal, so it is not executed. If re-enabled, it would print each numerical
# column's length and NaN count.
'''for col in numerical_columns:
    length = train_data[col].size  # Size of the column
    nan_count = train_data[col].isna().sum()  # Count of NaN values
    print(f"Column: {col}, Length: {length}, NaN Count: {nan_count}")'''

In [6]:
# Number of missing values in every column of the frame.
nan_counts = train_data.isna().sum()

# Keep only the columns that still contain at least one NaN and report them.
columns_with_nan = nan_counts.loc[nan_counts.gt(0)]
print("Columns with NaN values:\n", columns_with_nan)


Columns with NaN values:
 Series([], dtype: int64)


In [None]:
# A numerical column is "constant" when it holds exactly one distinct value.
constant_columns = list(
    filter(lambda name: train_data[name].nunique() == 1, numerical_columns)
)
print("Constant columns:", constant_columns)

# Remove those columns from the frame and from the feature list alike.
train_data = train_data.drop(constant_columns, axis='columns')
numerical_columns = list(
    filter(lambda name: name not in constant_columns, numerical_columns)
)


In [None]:
# Absolute pairwise correlation between all remaining numerical columns.
corr_matrix = train_data[numerical_columns].corr().abs()

# Collect every ordered pair of distinct columns whose absolute correlation
# exceeds 0.9. Both (a, b) and (b, a) are listed, scanning the matrix row by row.
above_threshold = corr_matrix.to_numpy() > 0.9
np.fill_diagonal(above_threshold, False)  # a column is never paired with itself
feature_names = corr_matrix.columns
correlated_pairs = [
    (feature_names[row], feature_names[col])
    for row, col in zip(*np.nonzero(above_threshold))
]

print("Highly correlated pairs:", correlated_pairs)


In [None]:
# Greedy pruning: walk the pairs in order and, whenever neither member has been
# marked for removal yet, mark the second member of the pair.
columns_to_drop = set()
for first, second in correlated_pairs:
    if first in columns_to_drop or second in columns_to_drop:
        continue
    columns_to_drop.add(second)

# Remove the marked columns from the frame and from the feature list.
train_data = train_data.drop(columns=[*columns_to_drop])
numerical_columns = [
    name for name in numerical_columns
    if name not in columns_to_drop
]


In [None]:
# Persist the cleaned (not yet scaled) frame as a Snappy-compressed Parquet
# file, written through the pyarrow engine.
train_data.to_parquet(
    '../data/XGBoost.parquet',
    engine='pyarrow',
    compression='snappy',
)


In [None]:
# Min-max scaling step, intended for the TFT model only. The feature range is
# spelled out here; (0, 1) is also the scaler's default.
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the numerical columns and overwrite them with the
# rescaled values in one step.
train_data[numerical_columns] = scaler.fit_transform(
    train_data[numerical_columns]
)


In [None]:
# Show the frame's column index. The scaling step only overwrites values in
# existing columns, so these are the same columns that were written to disk.
remaining_columns = train_data.columns
print("Columns in saved data:", remaining_columns)