In [1]:
import pandas as pd
import plotly.express as px

train_df = pd.read_csv("data/raw/train.csv")
eval_df = pd.read_csv("data/raw/eval.csv")
holdout_df = pd.read_csv("data/raw/holdout.csv")

def clean_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    """Apply date conversions, feature engineering, and deduplication."""

    if 'time' not in df.columns:
        df = df.reset_index()
        if 'time' not in df.columns:
            df.rename(columns={'index': 'time'}, inplace=True)

    df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d')

    df['month-day'] = df['time'].dt.strftime('%m.%d').astype(float)

    df = df.dropna().drop_duplicates()

    df.set_index('time', inplace=True)
    df = df.sort_index()
    
    return df


print("Cleaning Train...")
train_df = clean_weather_data(train_df)

print("Cleaning Eval...")
eval_df = clean_weather_data(eval_df)

print("Cleaning Holdout...")
holdout_df = clean_weather_data(holdout_df)

print("\n All datasets processed successfully!")
print("Final Train shape:", train_df.shape)

Cleaning Train...
Cleaning Eval...
Cleaning Holdout...

 All datasets processed successfully!
Final Train shape: (105, 6)


In [2]:
print("Cleaning Train...")
train_df = clean_weather_data(train_df)

print("\nCleaning Eval...")
eval_df = clean_weather_data(eval_df)

print("\nCleaning Holdout...")
holdout_df = clean_weather_data(holdout_df)

print("\nAll datasets processed successfully!")
print("Final Train shape:", train_df.shape)
print("Final Eval shape:", eval_df.shape)
print("Final Holdout shape:", holdout_df.shape)

Cleaning Train...

Cleaning Eval...

Cleaning Holdout...

All datasets processed successfully!
Final Train shape: (105, 6)
Final Eval shape: (25, 6)
Final Holdout shape: (25, 6)


In [3]:
plot_df = train_df.reset_index()

fig = px.violin(
    plot_df, 
    y="temperature_2m_max", 
    color="city", 
    box=True, 
    title="Violin Plot of Max Temperatures by City (Training Data)"
)
fig.show()

In [4]:
train_df.to_csv("data/processed/train_processed.csv", index=True)
eval_df.to_csv("data/processed/eval_processed.csv", index=True)
holdout_df.to_csv("data/processed/holdout_processed.csv", index=True)

print("Cleaning and saving complete.")

Cleaning and saving complete.
