In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the dataset (relative path from the notebook's location in Exploration folder)
train_data = pd.read_parquet('../Data/train.parquet')

# Display the first few rows
train_data.head()


Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
0,0,0,1,3.889038,,,,,,0.851033,...,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504,0
1,0,0,7,1.370613,,,,,,0.676961,...,2.965889,1.190077,-0.523998,3.849921,2.626981,5.0,0.703665,0.216683,0.778639,0
2,0,0,9,2.285698,,,,,,1.056285,...,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828,0
3,0,0,10,0.690606,,,,,,1.139366,...,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516,0
4,0,0,14,0.44057,,,,,,0.9552,...,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.0,-3.57282,-1.089123,-5.0,0


In [2]:
# Filter out the first 85 days
train_data = train_data[train_data['date_id'] >= 85]

# Drop rows that are completely empty
train_data = train_data.dropna(how='all')

In [3]:
# Define columns to exclude
exclude_columns = ['date_id', 'time_id', 'symbol_id', 'weight', 'partition_id'] + \
                  [col for col in train_data.columns if col.startswith('responder_')]

# Select numerical columns
numerical_columns = [col for col in train_data.columns if col not in exclude_columns]


In [4]:
# Interpolate missing values for numerical columns
train_data[numerical_columns] = train_data[numerical_columns].interpolate(method='linear', axis=0)

# Drop rows with any remaining NaN values
train_data = train_data.dropna()


In [5]:
# Calculate IQR thresholds
q1 = train_data[numerical_columns].quantile(0.25)
q3 = train_data[numerical_columns].quantile(0.75)
iqr = q3 - q1

# Calculate lower and upper bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Clip outliers to bounds
train_data[numerical_columns] = train_data[numerical_columns].clip(lower=lower_bound, upper=upper_bound, axis=1)


In [6]:
# Initialize scaler
scaler = MinMaxScaler()

# Scale numerical columns
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])


In [8]:
print(train_data[numerical_columns].head())

         feature_00  feature_01  feature_02  feature_03  feature_04  \
8212377    0.505810    0.724187    0.399453    0.454499    0.641736   
8212378    0.418131    0.665448    0.426027    0.419061    0.495121   
8212379    0.383212    0.825243    0.409935    0.398409    0.469392   
8212380    0.479958    0.756942    0.353602    0.399388    0.480056   
8212381    0.338658    0.795772    0.350866    0.393804    0.513743   

         feature_05  feature_06  feature_07  feature_08  feature_09  ...  \
8212377    0.488392         0.0    0.226021    0.449224      0.1125  ...   
8212378    0.484060         0.0    0.099103    0.432922      0.9875  ...   
8212379    0.495072         0.0    0.225261    0.448839      0.0250  ...   
8212380    0.485809         0.0    0.065998    0.436263      0.0000  ...   
8212381    0.485138         0.0    0.226461    0.459308      0.1125  ...   

         feature_69  feature_70  feature_71  feature_72  feature_73  \
8212377    0.625125    0.120495    1.000000  

In [10]:
for col in numerical_columns:
    length = train_data[col].size  # Size of the column
    nan_count = train_data[col].isna().sum()  # Count of NaN values
    print(f"Column: {col}, Length: {length}, NaN Count: {nan_count}")

Column: feature_00, Length: 38914961, NaN Count: 0
Column: feature_01, Length: 38914961, NaN Count: 0
Column: feature_02, Length: 38914961, NaN Count: 0
Column: feature_03, Length: 38914961, NaN Count: 0
Column: feature_04, Length: 38914961, NaN Count: 0
Column: feature_05, Length: 38914961, NaN Count: 0
Column: feature_06, Length: 38914961, NaN Count: 0
Column: feature_07, Length: 38914961, NaN Count: 0
Column: feature_08, Length: 38914961, NaN Count: 0
Column: feature_09, Length: 38914961, NaN Count: 0
Column: feature_10, Length: 38914961, NaN Count: 0
Column: feature_11, Length: 38914961, NaN Count: 0
Column: feature_12, Length: 38914961, NaN Count: 0
Column: feature_13, Length: 38914961, NaN Count: 0
Column: feature_14, Length: 38914961, NaN Count: 0
Column: feature_15, Length: 38914961, NaN Count: 0
Column: feature_16, Length: 38914961, NaN Count: 0
Column: feature_17, Length: 38914961, NaN Count: 0
Column: feature_18, Length: 38914961, NaN Count: 0
Column: feature_19, Length: 389

In [None]:
# Check for columns with NaN values
nan_counts = train_data.isnull().sum()

# Filter columns with NaN values
columns_with_nan = nan_counts[nan_counts > 0]
print("Columns with NaN values:\n", columns_with_nan)


In [None]:
# Identify constant columns (zero variance)
constant_columns = [col for col in numerical_columns if train_data[col].nunique() == 1]
print("Constant columns:", constant_columns)

# Drop constant columns
train_data = train_data.drop(columns=constant_columns)
numerical_columns = [col for col in numerical_columns if col not in constant_columns]


In [None]:
# Calculate correlation matrix
corr_matrix = train_data[numerical_columns].corr().abs()

# Find highly correlated pairs
correlated_pairs = [(i, j) for i in corr_matrix.columns for j in corr_matrix.columns 
                    if i != j and corr_matrix.loc[i, j] > 0.9]

print("Highly correlated pairs:", correlated_pairs)


In [None]:
# Drop one column from each correlated pair
columns_to_drop = set()
for i, j in correlated_pairs:
    if i not in columns_to_drop and j not in columns_to_drop:
        columns_to_drop.add(j)

train_data = train_data.drop(columns=list(columns_to_drop))
numerical_columns = [col for col in numerical_columns if col not in columns_to_drop]
