In [1]:
# source: http://kaggle.com/datasets/emirhanakku/climate-and-energy-consumption-dataset-20202024

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
csv_path = '/content/global_climate_energy_2020_2024.csv'
df = pd.read_csv(csv_path)

print(df.shape)
print(df.head())

print("1. EDA & DATA CLEANING")

# Data Info
print("Data Info")
df.info()

print("Missing Values")
print(df.isnull().sum())

print("Statistical Summary")
print(df.describe())

print("Column Names")
print(list(df.columns))

# Drop the 'country' label if it exists (it is a string label, not a numeric
# feature). Nothing guarantees the column is constant, so report when several
# countries are being pooled: in that case each date appears more than once in
# the index built below, and row-wise shift/rolling features no longer follow
# a single country's history.
if 'country' in df.columns:
    n_countries = df['country'].nunique()
    if n_countries > 1:
        print(f"Warning: dropping 'country' pools {n_countries} distinct countries; "
              "dates will repeat in the index.")
    df_clean = df.drop(columns=['country'])
else:
    df_clean = df.copy()

# Convert date to datetime and order chronologically.
# A stable sort keeps rows that share a date in their original file order, so
# the row order (and anything derived from it) is reproducible.
df_clean['date'] = pd.to_datetime(df_clean['date'])
df_clean = df_clean.set_index('date').sort_index(kind='mergesort')

print(f"Cleaned DataFrame: {df_clean.shape}")
print(df_clean.head())

print("2. FEATURE ENGINEERING (NO DATA LEAKAGE)")

# Automatically detect target variable: first match from the preferred names,
# otherwise fall back to the last numeric column of the cleaned frame.
possible_targets = ['energy_price', 'price', 'energy_cost']
TARGET = next((name for name in possible_targets if name in df_clean.columns), None)

if TARGET is None:
    # If no standard target found, use the last numeric column
    TARGET = df_clean.select_dtypes(include=[np.number]).columns[-1]

print(f"\nTarget variable: {TARGET}")

# Separate features and target BEFORE any transformations
X = df_clean.drop(columns=[TARGET])
y = df_clean[TARGET]

# Get all numeric columns for feature engineering
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumeric features available: {numeric_cols}")

# TRAIN/TEST SPLIT (CHRONOLOGICAL FOR TIME SERIES)
split_idx = int(len(X) * 0.8)

# When index labels repeat (several rows per date), an 80% row cut can land in
# the middle of one date, putting that date in both train and test. Snap the
# cut back to the first row carrying the boundary label so every label belongs
# to exactly one side. With a unique index this leaves split_idx unchanged.
if not X.index.is_unique and X.index.is_monotonic_increasing:
    boundary_start = int(X.index.searchsorted(X.index[split_idx], side='left'))
    if boundary_start > 0:  # never snap to an empty training set
        split_idx = boundary_start

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"\nTrain set: {X_train.shape}, Test set: {X_test.shape}")
print(f"  Train period: {X_train.index[0]} to {X_train.index[-1]}")
print(f"  Test period: {X_test.index[0]} to {X_test.index[-1]}")
# FEATURE ENGINEERING FUNCTIONS
def create_time_features(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``dayofyear``, ``month``, ``dayofweek`` and
        ``isocalendar()`` (i.e. a ``DatetimeIndex``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``day_of_year``, ``month``, ``day_of_week`` and
        ``week_of_year`` added. The input frame is not modified.
    """
    df = df.copy()
    df['day_of_year'] = df.index.dayofyear
    df['month'] = df.index.month
    df['day_of_week'] = df.index.dayofweek
    # isocalendar().week is a nullable UInt32 Series; cast to a plain integer
    # (as the deployment helper does) and assign by position so the column has
    # an ordinary NumPy dtype and no index alignment is involved.
    df['week_of_year'] = df.index.isocalendar().week.astype(int).to_numpy()
    return df

def create_lag_features(df, columns, lags=[1, 3, 7]):
    """Add row-shifted copies of selected columns.

    For every name in ``columns`` that exists in the frame, and every ``lag``
    in ``lags``, a column ``<name>_lag<lag>`` holding ``df[name].shift(lag)``
    is added. Names not present in the frame are skipped. Works on a copy; the
    input frame is left untouched.
    """
    lagged = df.copy()
    # Lazy generator: membership is checked when each column is reached.
    wanted = (
        (name, step)
        for name in columns
        if name in lagged.columns
        for step in lags
    )
    for name, step in wanted:
        shifted = lagged[name].shift(step)
        lagged[f'{name}_lag{step}'] = shifted
    return lagged

def create_rolling_features(df, columns, windows=[3, 7]):
    """Add rolling mean/std columns computed on the one-row-shifted series.

    For each existing column ``c`` in ``columns`` and each ``w`` in
    ``windows``, adds ``<c>_roll_mean<w>`` and ``<c>_roll_std<w>``. The series
    is shifted by one row before the window is applied, so the current row's
    own value never enters its statistics. Returns a new frame.
    """
    enriched = df.copy()
    for name in columns:
        if name not in enriched.columns:
            continue
        for span in windows:
            mean_key = f'{name}_roll_mean{span}'
            std_key = f'{name}_roll_std{span}'
            enriched[mean_key] = enriched[name].shift(1).rolling(window=span).mean()
            enriched[std_key] = enriched[name].shift(1).rolling(window=span).std()
    return enriched

def create_interaction_features(df, columns):
    """Add pairwise products of the first (up to) three listed columns.

    With at least two names in ``columns`` the product of the first two is
    added; with three or more, the remaining two pairings among the first
    three are added as well. Each new column is named ``<left>_<right>``.
    Operates on a copy of ``df``.
    """
    out = df.copy()
    leading = columns[:3]
    # Index pairs come out as (0, 1), (0, 2), (1, 2).
    for i in range(len(leading)):
        for j in range(i + 1, len(leading)):
            left, right = leading[i], leading[j]
            out[f'{left}_{right}'] = out[left] * out[right]
    return out

# Select columns for lag/rolling (use first few numeric columns)
lag_cols = numeric_cols[:3]
rolling_cols = numeric_cols[:2]

# Combine train and test so lag/rolling values at the start of the test span
# can look back into the end of the training span.
X_combined = pd.concat([X_train, X_test])
y_combined = pd.concat([y_train, y_test])
n_train_rows = len(X_train)  # the first n_train_rows rows of X_combined are train

# Apply all feature engineering to combined data
# NOTE(review): shift()/rolling() step over consecutive ROWS. If several rows
# share one date (e.g. multiple entities pooled together), a "lag" is the
# previous row, not the previous day of the same entity - confirm this is the
# intended meaning.
X_combined_fe = create_time_features(X_combined)
X_combined_fe = create_lag_features(X_combined_fe, lag_cols)
X_combined_fe = create_rolling_features(X_combined_fe, rolling_cols)
X_combined_fe = create_interaction_features(X_combined_fe, numeric_cols)

# Remove rows with NaN from combined dataset (positional boolean mask)
valid_idx = X_combined_fe.notna().all(axis=1).to_numpy()
X_combined_clean = X_combined_fe[valid_idx]
y_combined_clean = y_combined[valid_idx]

print(f"After NaN removal - Combined: {X_combined_clean.shape}")

# Split back into train and test BY ROW POSITION.
# Selecting by index label (.loc) is unsafe when labels repeat: every row
# carrying a label present on both sides of the cut would be pulled into both
# sets. Positional masks keep each row on exactly one side.
is_train_row = np.arange(len(X_combined_fe)) < n_train_rows
train_rows = valid_idx & is_train_row
test_rows = valid_idx & ~is_train_row

X_train_fe = X_combined_fe[train_rows]
y_train = y_combined[train_rows]
X_test_fe = X_combined_fe[test_rows]
y_test = y_combined[test_rows]

# Every surviving row must be in exactly one of the two sets.
assert len(X_train_fe) + len(X_test_fe) == len(X_combined_clean)
assert len(X_train_fe) == len(y_train) and len(X_test_fe) == len(y_test)

print(f"Final split - Train: {X_train_fe.shape}, Test: {X_test_fe.shape}")
print(f"  Train period: {X_train_fe.index[0]} to {X_train_fe.index[-1]}")
print(f"  Test period: {X_test_fe.index[0]} to {X_test_fe.index[-1]}")
print(f"\nTotal features created: {X_train_fe.shape[1]}")

# SCALING - statistics are learned from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

print("\nFeatures scaled using StandardScaler")

print("3. MODEL TRAINING & EVALUATION")

# Candidate regressors, keyed by the label used in all reports below.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
    'Ridge Regression': Ridge(alpha=1.0)
}

# label -> metrics dict (plus the fitted estimator under 'model')
results = {}

for name, model in models.items():
    print(f"\n--- Training {name} ---")

    # Fit on the scaled training matrix, then predict both partitions.
    model.fit(X_train_scaled, y_train)
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Collect the scores directly into the result record.
    scores = {
        'train_r2': r2_score(y_train, y_train_pred),
        'test_r2': r2_score(y_test, y_test_pred),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'test_mae': mean_absolute_error(y_test, y_test_pred),
        'model': model
    }
    results[name] = scores

    print(f"  Train R2: {scores['train_r2']:.4f}")
    print(f"  Test R2:  {scores['test_r2']:.4f}")
    print(f"  Test RMSE: {scores['test_rmse']:.4f}")
    print(f"  Test MAE:  {scores['test_mae']:.4f}")

# FEATURE IMPORTANCE (Random Forest)
print("4. FEATURE IMPORTANCE (Top 15)")

rf_model = results['Random Forest']['model']
feature_importance = pd.DataFrame({
    'feature': X_train_fe.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print(feature_importance.head(15))

# SUMMARY
print("5. MODEL COMPARISON SUMMARY")

# One row per model, columns in reporting order.
summary_df = pd.DataFrame({
    'Model': list(results),
    'Train R2': [entry['train_r2'] for entry in results.values()],
    'Test R2': [entry['test_r2'] for entry in results.values()],
    'Test RMSE': [entry['test_rmse'] for entry in results.values()],
    'Test MAE': [entry['test_mae'] for entry in results.values()]
})

print(summary_df.to_string(index=False))

# The winner is the label with the highest test-set R2.
best_model_name = max(results, key=lambda label: results[label]['test_r2'])
print(f"\nBest Model: {best_model_name} (Test R2 = {results[best_model_name]['test_r2']:.4f})")

(36540, 10)
         date  country  avg_temperature  humidity  co2_emission  \
0  2020-01-01  Germany            28.29     31.08        212.63   
1  2020-01-02  Germany            28.38     37.94        606.05   
2  2020-01-03  Germany            28.74     57.67        268.72   
3  2020-01-04  Germany            26.66     51.34        167.32   
4  2020-01-05  Germany            26.81     65.38        393.89   

   energy_consumption  renewable_share  urban_population  \
0            11348.75            14.42             76.39   
1             4166.64             5.63             86.26   
2             4503.80            14.20             75.92   
3             3259.13            13.84             63.15   
4             7023.72             6.93             76.02   

   industrial_activity_index  energy_price  
0                      51.22         83.93  
1                      78.27        110.40  
2                      48.96        173.58  
3                      97.42         89.13  

In [5]:
import joblib
from sklearn.base import clone

BEST_MODEL_NAME = 'Ridge Regression'

# The deployment UI only supplies the raw numeric inputs plus a date, so the
# saved model and scaler must be fitted on exactly those columns. The
# estimators in `results` and the existing `scaler` were fitted on the full
# engineered matrix (lags, rolling stats, interactions); saving them next to a
# shorter feature list would make SCALER.transform() reject the deploy-time
# input. A fresh scaler and a clone of the chosen estimator are therefore
# fitted here on the simple feature set.

# The original columns used for raw input (before complex features)
original_numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Build the simple feature frame: raw numeric columns + calendar features.
X_original_with_time = X[original_numeric_cols].copy()
# NOTE: We need to ensure the index is datetime for the time features to work
X_original_with_time.index = pd.to_datetime(X_original_with_time.index)
X_original_with_time['day_of_year'] = X_original_with_time.index.dayofyear
X_original_with_time['month'] = X_original_with_time.index.month
X_original_with_time['day_of_week'] = X_original_with_time.index.dayofweek
X_original_with_time['week_of_year'] = (
    X_original_with_time.index.isocalendar().week.astype(int).to_numpy()
)

FINAL_FEATURES_SIMPLE = X_original_with_time.columns.tolist()

# Reuse the chronological cut made earlier: the first len(X_train) rows of X
# are the training span, the remainder is the hold-out span.
n_train_rows = len(X_train)
X_simple_train = X_original_with_time.iloc[:n_train_rows]
X_simple_test = X_original_with_time.iloc[n_train_rows:]
y_simple_train = y.iloc[:n_train_rows]
y_simple_test = y.iloc[n_train_rows:]

# Fit the scaler on training rows only, then an unfitted copy of the chosen
# estimator (same hyper-parameters) on the scaled simple features.
SIMPLE_SCALER = StandardScaler().fit(X_simple_train)
BEST_MODEL = clone(results[BEST_MODEL_NAME]['model'])
BEST_MODEL.fit(SIMPLE_SCALER.transform(X_simple_train), y_simple_train)

if len(X_simple_test) > 0:
    simple_test_r2 = r2_score(
        y_simple_test, BEST_MODEL.predict(SIMPLE_SCALER.transform(X_simple_test))
    )
    print(f"Simple-feature {BEST_MODEL_NAME} Test R2: {simple_test_r2:.4f}")

# 7. SAVE NECESSARY ARTIFACTS FOR DEPLOYMENT
joblib.dump(BEST_MODEL, 'simple_climate_model.joblib')
print(f"Saved Best Model: {BEST_MODEL_NAME}")

joblib.dump(SIMPLE_SCALER, 'simple_climate_scaler.joblib')
print("Saved Scaler")

joblib.dump(original_numeric_cols, 'original_numeric_cols.joblib')
print("Saved Original Numeric Columns")

# The list of final features the saved model expects (raw + time features)
joblib.dump(FINAL_FEATURES_SIMPLE, 'final_features_simple.joblib')
print("Saved Final Simple Feature Columns")

Saved Best Model: Ridge Regression
Saved Scaler
Saved Original Numeric Columns
Saved Final Simple Feature Columns


In [6]:
import gradio as gr
import pandas as pd
import numpy as np
import joblib
from datetime import datetime, timedelta

# Load the four artifacts written by the artifact-saving cell. A missing file
# is reported and the error is re-raised so the rest of the cell does not run.
try:
    MODEL = joblib.load('simple_climate_model.joblib')
    SCALER = joblib.load('simple_climate_scaler.joblib')
    ORIGINAL_COLS = joblib.load('original_numeric_cols.joblib')
    FINAL_FEATURES = joblib.load('final_features_simple.joblib')
except FileNotFoundError as missing_file:
    # NOTE: This error means you need to run the artifact saving cell above first.
    print(f"Deployment Error: Could not load required file: {missing_file}")
    raise
else:
    TARGET = 'energy_price' # Assuming this target for display purposes
    print("Simplified Deployment Artifacts Loaded Successfully.")


# Function to create time features (copied from original notebook)
def create_time_features_deploy(input_df, target_date):
    """Stamp a one-row frame with ``target_date`` and add calendar columns.

    A copy of ``input_df`` gets ``[target_date]`` as its index, then
    ``day_of_year``, ``month``, ``day_of_week`` and ``week_of_year`` are read
    from that index. The caller's frame is not modified.
    """
    stamped = input_df.copy()
    stamped.index = [target_date]  # single label, so the frame must have one row
    when = stamped.index
    calendar_fields = (
        ('day_of_year', 'dayofyear'),
        ('month', 'month'),
        ('day_of_week', 'dayofweek'),
    )
    for column, attribute in calendar_fields:
        stamped[column] = getattr(when, attribute)
    # Cast the ISO week number to a plain integer.
    stamped['week_of_year'] = when.isocalendar().week.astype(int)
    return stamped

# Prediction function: Simple inputs only
def predict_climate_simple(target_date_str, *raw_inputs):
    """Predict the target for one day from raw same-day inputs.

    Parameters
    ----------
    target_date_str : str
        Date in ``YYYY-MM-DD`` form.
    *raw_inputs
        One value per entry of the module-level ``ORIGINAL_COLS``, in the same
        order.

    Returns
    -------
    str
        A Markdown-formatted prediction line, or a message starting with
        ``"Error:"`` when the date cannot be parsed, the number of values does
        not match ``ORIGINAL_COLS``, or a value is missing (``None``).
    """
    # A non-string date (e.g. None) raises TypeError rather than ValueError;
    # both mean the same thing to the user.
    try:
        target_date = datetime.strptime(target_date_str, '%Y-%m-%d')
    except (ValueError, TypeError):
        return "Error: Invalid date format. Please use YYYY-MM-DD."

    # zip() would silently drop surplus or missing values, so check the count.
    if len(raw_inputs) != len(ORIGINAL_COLS):
        return (f"Error: Expected {len(ORIGINAL_COLS)} feature values, "
                f"received {len(raw_inputs)}.")

    # A missing value arrives as None, which the scaler cannot handle.
    missing = [str(col) for col, val in zip(ORIGINAL_COLS, raw_inputs) if val is None]
    if missing:
        return f"Error: Missing value(s) for: {', '.join(missing)}."

    # 1. Create a one-row DataFrame from the raw inputs
    input_data = pd.DataFrame(
        {col: [val] for col, val in zip(ORIGINAL_COLS, raw_inputs)}
    )

    # 2. Add time features (the helper works on its own copy)
    input_data_fe = create_time_features_deploy(input_data, target_date)

    # 3. Align features to the saved column order (Crucial step)
    X_predict = input_data_fe[FINAL_FEATURES]

    # 4. Scale the features (using the pre-fitted scaler)
    scaled_features = SCALER.transform(X_predict)

    # 5. Predict
    prediction = MODEL.predict(scaled_features)[0]

    # 6. Return formatted result
    return f"Predicted {TARGET.replace('_', ' ').title()} for {target_date_str}: **{prediction:,.4f}**"

# Build the Gradio inputs: one date textbox followed by one numeric field per
# raw feature, in ORIGINAL_COLS order (the order predict_climate_simple zips
# its positional values against).
date_box = gr.Textbox(label="Target Date (YYYY-MM-DD)", value=datetime.now().strftime('%Y-%m-%d'))
feature_boxes = [
    gr.Number(label=f"Current Day's {col.replace('_', ' ').title()}", value=1.0)
    for col in ORIGINAL_COLS
]
input_components = [date_box, *feature_boxes]


# GRADIO INTERFACE SETUP
iface = gr.Interface(
    fn=predict_climate_simple,
    inputs=input_components,
    outputs=gr.Markdown(),
    title=f"Simplified Climate/Energy Price Prediction (Model: {MODEL.__class__.__name__})",
    description="This demo predicts energy prices using only current-day raw data and time features. This results in a cleaner interface but may slightly reduce model accuracy compared to the full time series model."
)

# Launch the interface; share=True requests a publicly reachable link.
iface.launch(share=True)

Simplified Deployment Artifacts Loaded Successfully.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5a1bded7e0b186e510.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


