In [1]:
import pandas as pd
import numpy as np

# Load the three pre-processed splits. Every file is read the same way:
# the "time" column becomes the index and is parsed into datetimes.
train_df, eval_df, holdout_df = (
    pd.read_csv(csv_path, index_col="time", parse_dates=True)
    for csv_path in (
        "data/processed/train_processed.csv",
        "data/processed/eval_processed.csv",
        "data/processed/holdout_processed.csv",
    )
)

print("Data loaded successfully!")
print("Train shape:", train_df.shape)

Data loaded successfully!
Train shape: (105, 6)


In [2]:
def validate_weather_data(df: pd.DataFrame, dataset_name: str) -> None:
    """Run strict range and consistency checks on a weather dataset.

    The checks are performed in order and the first failing one aborts the
    validation. Failures are raised explicitly rather than via ``assert``
    statements, because ``assert`` is removed when Python runs with ``-O`` /
    ``PYTHONOPTIMIZE`` and the data checks would then silently do nothing.

    Args:
        df: Frame containing the columns ``city``, ``temperature_2m_max``,
            ``temperature_2m_min``, ``precipitation_sum`` and ``weathercode``.
        dataset_name: Label used as the ``[name]`` prefix of error messages
            and in the success message.

    Raises:
        AssertionError: If any check fails. Missing values also fail the range
            checks, since a NaN is never "between" two bounds.
        KeyError: If a required column is absent from ``df``.
    """

    def _require(row_ok: pd.Series, problem: str) -> None:
        # Fail with the dataset-prefixed message unless every row passes.
        if not row_ok.all():
            raise AssertionError(f"[{dataset_name}] {problem}")

    # City Check (Must be 1-10)
    _require(df['city'].between(1, 10), "City ID out of range! Must be 1-10.")

    # Temperature Checks (-30 to 45 degrees)
    _require(df['temperature_2m_max'].between(-30, 45), "Max temp out of logical bounds!")
    _require(df['temperature_2m_min'].between(-30, 45), "Min temp out of logical bounds!")

    # Logic Check: Max temp must always be >= Min temp
    _require(df['temperature_2m_max'] >= df['temperature_2m_min'], "Found Min Temp > Max Temp!")

    # Precipitation Check (0.0 to 60.0)
    _require(df['precipitation_sum'].between(0.0, 60.0), "Precipitation out of bounds!")

    # Valid WMO Weathercodes Check
    valid_codes = [0, 1, 2, 3, 45, 51, 53, 55, 61, 63, 65, 71, 73, 75, 80, 81, 82, 95, 96, 99]
    _require(df['weathercode'].isin(valid_codes), "Found an invalid WMO weathercode!")

    print(f" {dataset_name} passed all validation checks!")

# Validate every split before any feature engineering; the first failing
# check raises and stops the notebook run.
validate_weather_data(df=train_df, dataset_name="Train")
validate_weather_data(df=eval_df, dataset_name="Eval")
validate_weather_data(df=holdout_df, dataset_name="Holdout")

 Train passed all validation checks!
 Eval passed all validation checks!
 Holdout passed all validation checks!


In [3]:
def encode_features(df: pd.DataFrame, city_categories=None) -> pd.DataFrame:
    """One-hot encode the categorical ``city`` column.

    Args:
        df: Input frame containing a ``city`` column. It is not modified.
        city_categories: Optional iterable of every city ID that should get an
            indicator column. When given, the output always contains one
            ``city_<id>`` column per listed ID (all zeros if that city does not
            occur in ``df``), so different splits share one schema. A city in
            ``df`` that is not listed gets zeros in every indicator column.
            When ``None`` (the default), only cities present in ``df`` produce
            columns.

    Returns:
        A new frame in which ``city`` is replaced by integer 0/1 ``city_<id>``
        columns appended after the remaining columns.
    """
    if city_categories is not None:
        # A categorical dtype makes get_dummies emit a column for every
        # declared category, including ones unobserved in this frame.
        df = df.assign(city=pd.Categorical(df['city'], categories=list(city_categories)))

    return pd.get_dummies(df, columns=['city'], drop_first=False, dtype=int)

# Apply encoding
print("Encoding features...")
train_eng = encode_features(train_df)

# Each split is one-hot encoded independently, so a city that is absent from a
# split would silently lose its `city_*` column and the splits would end up
# with different schemas. Align eval/holdout to the training columns: columns
# missing from a split are added as all-zero indicators, and columns the
# training data never produced are dropped.
eval_eng = encode_features(eval_df).reindex(columns=train_eng.columns, fill_value=0)
holdout_eng = encode_features(holdout_df).reindex(columns=train_eng.columns, fill_value=0)

print("Encoding complete.")
print("New Train columns:", train_eng.columns.tolist())

Encoding features...
Encoding complete.
New Train columns: ['weathercode', 'temperature_2m_max', 'temperature_2m_min', 'precipitation_sum', 'month-day', 'city_1', 'city_2', 'city_3', 'city_4', 'city_5']


In [4]:
# Persist the encoded splits next to the processed inputs. The index is
# written out so it can be restored as the index when the files are reloaded.
train_eng.to_csv(path_or_buf="data/processed/train_encoded.csv", index=True)
eval_eng.to_csv(path_or_buf="data/processed/eval_encoded.csv", index=True)
holdout_eng.to_csv(path_or_buf="data/processed/holdout_encoded.csv", index=True)

print("Engineered datasets saved successfully!")

Engineered datasets saved successfully!


In [5]:
import plotly.express as px

# Columns that go into the correlation matrix; 'month-day' is optional and is
# only included when the training frame actually has it.
cols_to_correlate = ['temperature_2m_max', 'temperature_2m_min', 'precipitation_sum', 'weathercode']
cols_to_correlate += [optional for optional in ('month-day',) if optional in train_df.columns]

# Pairwise correlation of the selected training columns.
corr_matrix = train_df.loc[:, cols_to_correlate].corr()

# Heatmap on a fixed [-1, 1] colour range with each coefficient printed to
# two decimals inside its cell.
fig = px.imshow(
    corr_matrix,
    title="Correlation Matrix of Weather Features",
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
    text_auto=".2f",
    aspect="auto",
)

fig.show()