In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the training parquet into a single in-memory DataFrame.
train_data = pd.read_parquet(
    path='/home/jupyter/data/jane-street/train.parquet',
)
# train_data.head()  # uncomment for a quick preview of the first rows

In [None]:
# Keep only rows with date_id of 85 or later, then discard any row in which
# every column is missing.
train_data = (
    train_data
    .loc[train_data['date_id'].ge(85)]
    .dropna(how='all')
)

In [None]:
# Columns that are kept out of `numerical_columns` (and therefore out of every
# step that operates on that list).
exclude_columns = ['date_id', 'time_id', 'symbol_id', 'weight', 'partition_id', "responder_6"]

# Everything else, in the frame's original column order.
numerical_columns = train_data.columns[~train_data.columns.isin(exclude_columns)].tolist()

In [None]:
# Fill gaps by linear interpolation *within each symbol's own rows*.
# The frame stacks one series per `symbol_id`; interpolating straight down the
# rows of the whole frame would blend values from neighbouring rows that belong
# to different symbols. Grouping first keeps every filled value inside a single
# symbol's series.
# NOTE(review): assumes rows are already in chronological order within each
# symbol -- confirm, or sort by date_id/time_id before this cell.
train_data[numerical_columns] = (
    train_data
    # sort=False: group order is irrelevant for transform; dropna=False keeps
    # rows with a missing symbol_id as their own group instead of dropping them.
    .groupby('symbol_id', sort=False, dropna=False)[numerical_columns]
    .transform(lambda group: group.interpolate(method='linear'))
)

# With pandas' default forward-only fill direction, gaps that precede a
# symbol's first valid observation are not filled. Drop every row that still
# has a missing value in any column.
train_data = train_data.dropna()

In [None]:
# Clip each numerical column to the range [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
# where the quartiles are computed per column over the current rows.
# NOTE(review): a column whose IQR is zero collapses to a single value here.
quartiles = train_data[numerical_columns].quantile([0.25, 0.75])
q1, q3 = quartiles.loc[0.25], quartiles.loc[0.75]
iqr = q3.sub(q1)
lower_bound = q1.sub(iqr.mul(1.5))
upper_bound = q3.add(iqr.mul(1.5))

# axis=1 aligns the per-column bounds with the frame's column labels.
train_data[numerical_columns] = train_data[numerical_columns].clip(
    lower=lower_bound,
    upper=upper_bound,
    axis=1,
)

In [None]:
# Plain-text preview of the first rows of the numerical columns.
print(train_data.loc[:, numerical_columns].head())

In [None]:
# Optional diagnostic (flagged "not needed" by the author): for every numerical
# column, report its length and how many values are missing.
for col in numerical_columns:
    column = train_data[col]
    print(f"Column: {col}, Length: {column.size}, NaN Count: {column.isna().sum()}")

In [None]:
# Optional diagnostic (flagged "not needed" by the author): count missing
# values per column across the whole frame and list only the columns that
# have at least one.
nan_counts = train_data.isna().sum()
columns_with_nan = nan_counts.loc[nan_counts.gt(0)]
print("Columns with NaN values:\n", columns_with_nan)

In [None]:
# Find numerical columns that hold exactly one distinct value, report them,
# and remove them from both the frame and the working column list.
# (The author marked this step "not needed", but it does modify `train_data`
# whenever such a column exists.)
distinct_counts = train_data[numerical_columns].nunique()
constant_columns = distinct_counts.index[distinct_counts.eq(1)].tolist()
print("Constant columns:", constant_columns)

train_data = train_data.drop(columns=constant_columns)
numerical_columns = [name for name in numerical_columns if name not in set(constant_columns)]


In [None]:
# Absolute pairwise correlations between the remaining numerical columns.
corr_matrix = train_data[numerical_columns].corr().abs()

# Collect every off-diagonal pair whose absolute correlation exceeds 0.9.
# Pairs are listed row by row in column order, so each qualifying pair
# appears twice: once as (a, b) and once as (b, a).
labels = corr_matrix.columns
hit_rows, hit_cols = np.nonzero(corr_matrix.gt(0.9).to_numpy())
correlated_pairs = [
    (labels[r], labels[c])
    for r, c in zip(hit_rows, hit_cols)
    if r != c
]
print("Highly correlated pairs:", correlated_pairs)

In [None]:
# Greedy de-duplication: walk the correlated pairs in order and, whenever
# neither member has been marked yet, mark the second member for removal.
columns_to_drop = set()
for first, second in correlated_pairs:
    if first in columns_to_drop or second in columns_to_drop:
        # One side of this pair is already going away; nothing more to do.
        continue
    columns_to_drop.add(second)

# Remove the marked columns from the frame and from the working column list.
train_data = train_data.drop(columns=list(columns_to_drop))
numerical_columns = [name for name in numerical_columns if name not in columns_to_drop]

In [None]:
# Persist the current state of the frame as a snappy-compressed parquet file
# written through the pyarrow engine.
train_data.to_parquet(
    '/home/jupyter/data/TFT.parquet',
    engine='pyarrow',
    compression='snappy',
)

In [None]:
# Min-max scale the numerical columns (the author notes this is for TFT only).
# The fitted `scaler` is kept so the same transform can be reused later.
# NOTE(review): the TFT parquet file is written before this cell runs, so the
# saved file holds unscaled values -- confirm that ordering is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train_data[numerical_columns])
train_data[numerical_columns] = scaled_values