# In [None]:
### 1.2 with plot

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from prophet import Prophet
from statsmodels.tsa.arima.model import ARIMA

# Helper functions defined below: mean_absolute_percentage_error, load_and_clean_data, add_lag_and_rolling_features

# Function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Positions where ``y_true`` equals zero are left out of the average so the
    relative error never divides by zero. When no non-zero actual value
    remains, ``np.nan`` is returned. The result is expressed in percent.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Boolean mask of the positions whose actual value is usable as a divisor
    usable = actual != 0
    actual_kept = actual[usable]
    predicted_kept = predicted[usable]

    if len(actual_kept) == 0:
        return np.nan

    relative_errors = np.abs((actual_kept - predicted_kept) / actual_kept)
    return np.mean(relative_errors) * 100

# Load and clean data
def load_and_clean_data(file_path):
    """Read the Excel workbook at ``file_path`` and return its usable rows.

    The 'DateAndHour' column is parsed as datetimes and used to derive the
    calendar columns 'Hour', 'DayOfWeek' and 'Month'. Rows with a missing
    'Temperature' or 'Load_data' value are dropped from the returned frame.
    """
    frame = pd.read_excel(file_path, parse_dates=['DateAndHour'])

    # Calendar features taken from the timestamp column
    stamp = frame['DateAndHour'].dt
    frame['Hour'] = stamp.hour
    frame['DayOfWeek'] = stamp.dayofweek
    frame['Month'] = stamp.month

    # Keep only rows where both the temperature and the load are present
    return frame.dropna(subset=['Temperature', 'Load_data'])

# Adding lag, rolling features, and temperature bins
def add_lag_and_rolling_features(data, lags=(1, 24, 168), window=3):
    """Return a copy of ``data`` extended with lag, rolling and temperature features.

    The input frame is not modified; all new columns are added to a copy.

    Args:
        data: DataFrame containing at least 'Load_data' and 'Temperature'.
        lags: Iterable of row offsets. For every offset ``n`` the columns
            'Load_data_lag_n' and 'Temperature_lag_n' are added.
        window: Number of rows in the rolling window used for
            'Load_data_rolling_<window>' (mean) and
            'Load_data_rolling_std_<window>' (standard deviation).

    Returns:
        A new DataFrame with the lag columns, the rolling columns,
        'Temperature_change' (row-to-row difference) and the dummy columns
        'Temperature_bins_moderate' / 'Temperature_bins_hot'. The 'cold' bin
        is the dropped reference level.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    data = data.copy()

    # Shifts are row-based: lag n is the value n rows earlier, whatever the
    # time gap between those rows is.
    for lag in lags:
        data[f'Load_data_lag_{lag}'] = data['Load_data'].shift(lag)
        data[f'Temperature_lag_{lag}'] = data['Temperature'].shift(lag)

    # Rolling features for load data (mean and standard deviation).
    # NOTE(review): the rolling window ends at the current row, so both
    # columns include the current 'Load_data' value. If 'Load_data' is also
    # the prediction target, confirm that this is intended.
    load_window = data['Load_data'].rolling(window=window)
    data[f'Load_data_rolling_{window}'] = load_window.mean()
    data[f'Load_data_rolling_std_{window}'] = load_window.std()

    # Temperature change relative to the previous row
    data['Temperature_change'] = data['Temperature'].diff()

    # Temperature binning: (-inf, 0] -> cold, (0, 15] -> moderate, (15, inf) -> hot
    data['Temperature_bins'] = pd.cut(
        data['Temperature'],
        bins=[-np.inf, 0, 15, np.inf],
        labels=['cold', 'moderate', 'hot'],
    )

    # Convert the categorical bins to dummy variables; drop_first removes 'cold'
    return pd.get_dummies(data, columns=['Temperature_bins'], drop_first=True)

# Read the cleaned workbook and attach the engineered feature columns
data = add_lag_and_rolling_features(
    load_and_clean_data('/kaggle/input/baseline2/Hackathon_Data_Cleaned.xlsx'),
    lags=[1, 24, 168],
    window=3,
)

# Rows whose calendar date is one of these days form the test set;
# every other row is used for training
test_dates = ['2024-02-01', '2024-03-02', '2024-03-26', '2024-03-30']
test_mask = data['DateAndHour'].dt.date.astype(str).isin(test_dates)
train_data, test_data = data[~test_mask], data[test_mask]

# Feature columns fed to the tree models; the target is 'Load_data'
features = [
    # calendar and weather
    'Hour',
    'DayOfWeek',
    'Month',
    'Temperature',
    'Temperature_change',
    # lagged load
    'Load_data_lag_1',
    'Load_data_lag_24',
    'Load_data_lag_168',
    # lagged temperature
    'Temperature_lag_1',
    'Temperature_lag_24',
    'Temperature_lag_168',
    # rolling load statistics
    'Load_data_rolling_3',
    'Load_data_rolling_std_3',
    # temperature bin dummies
    'Temperature_bins_moderate',
    'Temperature_bins_hot',
]

X_train, y_train = train_data[features], train_data['Load_data']
X_test, y_test = test_data[features], test_data['Load_data']

# Handle missing values and scale features
# Both the imputation values and the scaling statistics come from the training
# rows only. The test rows are filled and transformed with those same training
# statistics, so no information from the test set enters the preprocessing.
train_feature_means = X_train.mean()
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train.fillna(train_feature_means)), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test.fillna(train_feature_means)), columns=X_test.columns)

# XGBoost model
def xgboost_model(X_train, y_train, X_test):
    """Fit an XGBoost regressor on the training data and predict ``X_test``.

    The regressor is configured with n_estimators=200, learning_rate=0.05,
    max_depth=6 and random_state=42. Returns the predictions for ``X_test``.
    """
    settings = {
        'n_estimators': 200,
        'learning_rate': 0.05,
        'max_depth': 6,
        'random_state': 42,
    }
    regressor = xgb.XGBRegressor(**settings)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return predictions

# Prophet model
def prophet_model(data_train, data_test):
    """Fit Prophet with a temperature regressor and predict the test rows.

    ``data_train`` is passed to ``Prophet.fit`` as is. From ``data_test`` only
    the 'ds' and 'Temperature' columns are used to build the frame that is
    predicted. Returns the 'yhat' column of the forecast as an array.
    """
    forecaster = Prophet(daily_seasonality=True)
    forecaster.add_regressor('Temperature')
    forecaster.fit(data_train)

    # Prediction frame: test timestamps plus the matching temperature values
    horizon = data_test[['ds']].copy()
    horizon['Temperature'] = data_test['Temperature'].values

    return forecaster.predict(horizon)['yhat'].values

# ARIMA model
def arima_model(y_train, n_test):
    """Fit an ARIMA model of order (5, 1, 2) on ``y_train`` and forecast ahead.

    Returns the forecast for the ``n_test`` steps that follow the end of
    ``y_train``.
    """
    fitted = ARIMA(y_train, order=(5, 1, 2)).fit()
    return fitted.forecast(steps=n_test)

# Random Forest model
def random_forest_model(X_train, y_train, X_test):
    """Fit a random forest regressor on the training data and predict ``X_test``.

    The forest is configured with n_estimators=200, max_depth=15,
    min_samples_split=5 and random_state=42. Returns the predictions for
    ``X_test``.
    """
    settings = {
        'n_estimators': 200,
        'max_depth': 15,
        'min_samples_split': 5,
        'random_state': 42,
    }
    forest = RandomForestRegressor(**settings)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return predictions

# Base-model predictions for the test rows

# prophet_model reads the timestamp from a 'ds' column; the load is renamed to 'y'
prophet_columns = ['DateAndHour', 'Load_data', 'Temperature']
prophet_renames = {'DateAndHour': 'ds', 'Load_data': 'y'}
prophet_data_train = train_data[prophet_columns].rename(columns=prophet_renames)
prophet_data_test = test_data[prophet_columns].rename(columns=prophet_renames)

# Each base model is trained on the training rows and predicts the test rows
xgb_pred = xgboost_model(X_train_scaled, y_train, X_test_scaled)
prophet_pred = prophet_model(prophet_data_train, prophet_data_test)
rf_pred = random_forest_model(X_train_scaled, y_train, X_test_scaled)
arima_pred = arima_model(y_train, len(y_test))

# Meta-learner
# The meta-learner must never be fit on the test targets: doing so and then
# scoring on the same rows reports in-sample error, not forecast error.
# Instead, the last META_HOLDOUT_FRACTION of the training rows is held out,
# the base models are refit on the remaining rows, and their predictions for
# the held-out rows (which they have not seen) become the meta-learner's
# training inputs.
# NOTE(review): the positional split assumes the training rows are in
# chronological order — confirm against the source file.
META_HOLDOUT_FRACTION = 0.2
holdout_size = max(1, int(len(y_train) * META_HOLDOUT_FRACTION))
fit_end = len(y_train) - holdout_size

X_fit, X_holdout = X_train_scaled.iloc[:fit_end], X_train_scaled.iloc[fit_end:]
y_fit, y_holdout = y_train.iloc[:fit_end], y_train.iloc[fit_end:]
prophet_fit = prophet_data_train.iloc[:fit_end]
prophet_holdout = prophet_data_train.iloc[fit_end:]

# Column order must match meta_X below: xgboost, prophet, random forest, arima
meta_X_train = np.column_stack((
    xgboost_model(X_fit, y_fit, X_holdout),
    prophet_model(prophet_fit, prophet_holdout),
    random_forest_model(X_fit, y_fit, X_holdout),
    arima_model(y_fit, len(y_holdout)),
))

meta_X = np.column_stack((xgb_pred, prophet_pred, rf_pred, arima_pred))
meta_learner = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
meta_learner.fit(meta_X_train, y_holdout)

# Final predictions for the test rows, produced without access to y_test
final_pred = meta_learner.predict(meta_X)

# Score the ensemble predictions against the actual test load
ensemble_mae = mean_absolute_error(y_test, final_pred)
ensemble_mape = mean_absolute_percentage_error(y_test, final_pred)

print(f"Ensemble Model MAE: {ensemble_mae:.2f}")
print(f"Ensemble Model MAPE: {ensemble_mape:.2f}%")

# Write the predictions into the full dataset; only the test rows get a value
data.loc[test_mask, 'Predicted_Load'] = final_pred

# Print actual and predicted load for each test date
date_labels = data['DateAndHour'].dt.date.astype(str)
report_columns = ['DateAndHour', 'Load_data', 'Predicted_Load']
for date in test_dates:
    print(f"\nPredictions for {date}:")
    day_data = data[date_labels == date]
    print(day_data[report_columns])

# Export the dataset, including the new prediction column, as CSV
data.to_csv('load_data_with_predictions.csv', index=False)
print("\nUpdated dataset saved as 'load_data_with_predictions.csv'")

# Plotting
import matplotlib.pyplot as plt

# One stacked subplot per test date. The grid is sized from the number of
# dates: a fixed 2-row grid raises ValueError as soon as a third date is
# plotted, because the subplot index then exceeds the grid size. The minimum
# of two rows and the 5-inch height per row keep the layout for one or two
# dates at 2 rows on a 15x10 figure.
n_rows = max(2, len(test_dates))
plot_dates = data['DateAndHour'].dt.date.astype(str)

plt.figure(figsize=(15, 5 * n_rows))
for i, date in enumerate(test_dates):
    plt.subplot(n_rows, 1, i + 1)
    day_data = data[plot_dates == date]
    plt.plot(day_data['DateAndHour'], day_data['Load_data'], label='Actual Load', marker='o')
    plt.plot(day_data['DateAndHour'], day_data['Predicted_Load'], label='Predicted Load', marker='x')
    plt.title(f'Load Prediction for {date}')
    plt.xlabel('Hour')
    plt.ylabel('Load')
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)

plt.tight_layout()
plt.savefig('load_predictions.png')
# Close the figure after saving so it is released and not displayed
plt.close()
print("Prediction plots saved as 'load_predictions.png'")