In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
# 1. Dataset locations
# Both parquet splits sit in one directory; each file path is derived from it.
BASE_PATH = "/rds/rds-lxu/ml_datasets/hf_ts_forecast"
TRAIN_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, split_file)
    for split_file in ("train.parquet", "test.parquet")
)

In [5]:
# 2. Read both parquet splits fully into memory as pandas DataFrames
print("Loading data... this might take a minute depending on file size.")
train_df = pd.read_parquet(path=TRAIN_PATH)
test_df = pd.read_parquet(path=TEST_PATH)

# 3. Quick sanity check: (rows, columns) of each split, then the first
#    few training rows rendered with the notebook's rich display.
print(
    f"Train Shape: {train_df.shape}",
    f"Test Shape:  {test_df.shape}",
    sep="\n",
)
display(train_df.head())

Loading data... this might take a minute depending on file size.
Train Shape: (5337414, 94)
Test Shape:  (1447107, 92)


Unnamed: 0,id,code,sub_code,sub_category,horizon,ts_index,feature_a,feature_b,feature_c,feature_d,...,feature_ca,feature_cb,feature_cc,feature_cd,feature_ce,feature_cf,feature_cg,feature_ch,y_target,weight
0,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__25__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,25,89,29,16.364093,7.464023,5.966933,...,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.551324,40.982572
1,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__1__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,1,89,53,2.858806,5.050617,15.906651,...,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.315583,150.075406
2,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__3__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,3,89,51,9.585452,1.076268,9.004147,...,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.362894,115.953552
3,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__10__89,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,10,89,44,8.840588,15.034634,4.17078,...,-0.001686,-0.105328,-0.005045,,-0.133697,2.849819,0.112068,1,-0.667023,64.573073
4,W2MW3G2L__J0G2B0KU__PZ9S1Z4V__25__90,W2MW3G2L,J0G2B0KU,PZ9S1Z4V,25,90,28,2.303825,7.696209,12.8961,...,-0.001622,-0.103809,-0.005135,,-0.17466,2.738606,0.109204,1,-0.437398,41.948761


In [None]:
# --- EDA SECTION ---

# 4. Forecast-horizon distribution
# Bar chart of how many training rows exist for each 'horizon' value, which
# shows how the rows are split between the shorter and longer horizons.
horizon_fig, horizon_ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=train_df, x='horizon', ax=horizon_ax)
horizon_ax.set_title("Distribution of Forecast Horizons")
plt.show()

# 5. Visualize a Single Asset (Code) over Time
# Use the first code that appears in the training frame as the example asset.
# `.iloc[0]` returns the same value as `.unique()[0]` (unique() preserves the
# order of first appearance) without de-duplicating the whole column first.
sample_code = train_df['code'].iloc[0]

# All rows for that asset, ordered by ts_index so the frame reads chronologically.
asset_data = train_df[train_df['code'] == sample_code].sort_values('ts_index')

plt.figure(figsize=(15, 5))
# Plot feature_a as an example of time-series movement, one line per horizon.
# NOTE(review): if several rows share the same (ts_index, horizon) pair -- e.g.
# one per sub_code / sub_category -- seaborn aggregates them into a single
# line with an uncertainty band by default. Confirm that is the intended view.
sns.lineplot(data=asset_data, x='ts_index', y='feature_a', hue='horizon')
plt.title(f"Feature A movement for Asset: {sample_code} across all Horizons")
# Anchor the legend outside the axes (to the right) so it does not cover the lines.
plt.legend(title='Horizon', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# 6. Correlation Heatmap (first N feature columns only, to keep it readable)
# This helps identify which features are strongly related.
# The subset size lives in one place so the slice, the comment and the plot
# title cannot drift apart (the comment previously said 10 while the code used 15).
N_HEATMAP_FEATURES = 15

# Every column whose name starts with 'feature_', in the frame's column order.
feature_cols = [c for c in train_df.columns if c.startswith('feature_')]
subset_features = feature_cols[:N_HEATMAP_FEATURES]

plt.figure(figsize=(12, 10))
# Pairwise correlation of the selected columns (DataFrame.corr defaults);
# cell annotations stay off because the grid is too dense for them.
sns.heatmap(train_df[subset_features].corr(), annot=False, cmap='coolwarm')
# Report the number of columns actually plotted, which can be smaller than
# N_HEATMAP_FEATURES if the frame has fewer feature columns.
plt.title(f"Correlation Matrix of First {len(subset_features)} Features")
plt.show()

# 7. Missing-value check
# Count the NaN cells in each column first, then add those per-column counts
# into one grand total for the whole training frame.
missing_per_column = train_df.isna().sum()
nan_counts = missing_per_column.sum()
print(f"Total missing values in training set: {nan_counts}")