In [None]:
#!/usr/bin/env python3
import logging
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from arhmm import ARHMM  # assuming you have an ARHMM implementation
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from statsmodels.tsa.arima.model import ARIMA

import models.PCA_model as pca_model
import models.hmm_model as hmm_model
import train_models as train_models_utils
import utils.forecast as forecast_utils
import utils.plotting_eda as plot_eda_utils
import utils.plotting_models as plot_model
import utils.preprocessing as pr_utils
from models.hmm_model import EnergyHMM


In [None]:
# Directory holding the dataset files read by the loader below (Atrain.dat, Atest.dat, ...).
# Set the ENERGY_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset or empty, the original author's local path is used.
root_dir = (
    os.environ.get("ENERGY_DATA_DIR")
    or 'C:/Users/irmak/Desktop/datascience/lulzern/energysystems/great-energy-predictor-shootout-i'
)



In [None]:
    # Load data
loader = pr_utils.EnergyDataLoader(root_dir)
df_train_A = loader.load_A_data('Atrain.dat')
df_test_A = loader.load_A_data('Atest.dat')
df_train_B = loader.load_B_data('Btrain.dat', is_train=True)
df_test_B = loader.load_B_data('Btest.dat', is_train=False)

    # Visualize energy time series
visualizer = plot_eda_utils.EnergyVisualizer()
for col in ['WBE', 'WBCW', 'WBHW']:
    visualizer.plot_time_series(df_train_A, col, save_path=f'{col}.png')
visualizer.plot_correlation_matrix(df_train_A, save_path='correlation_matrix.png')

    # PCA analysis
energy_cols = ['WBE', 'WBCW', 'WBHW']
analyzer = pca_model.PCAAnalyzer(df_train_A, target_cols=energy_cols)
analyzer.run_pca()
analyzer.plot_variable_importance()
analyzer.plot_explained_variance()
analyzer.plot_loadings_heatmap(n_components=5)

    # Top related variables per target
for target in energy_cols:
        print(f"Top variables related to {target}:")
        print(analyzer.top_related_variables(target))

## Hidden Markov Model (HMM)

In [None]:
# Columns of df_train_A handed to the HMM as its feature set.
features = [
    'TEMP', 'HUMID', 'SOLAR', 'WIND',
    'WBE', 'WBCW', 'WBHW',
    'hourOfDay', 'is_weekend'
]

# Build the model from the directly imported class rather than through the
# `hmm_model` module alias. This assignment rebinds `hmm_model` to the instance
# (later cells call methods on it under that name), so going through the alias
# made the cell fail with AttributeError on any re-run.
hmm_model = EnergyHMM(df_train_A, feature_cols=features)

# Pick the number of hidden states; 2 and 9 bound the candidate range
# (NOTE(review): whether the upper bound is inclusive is decided inside
# select_optimal_states — confirm there).
best_n, bic_scores = hmm_model.select_optimal_states(2, 9)
print("Optimal number of states:", best_n)




In [None]:
# Plot the scores returned by the state-selection step, marking the chosen count.
hmm_model.plot_bic(bic_scores, best_n)

# Fit the HMM with the selected number of hidden states.
hmm_model.fit(best_n)

# Tabulate per-state means and print them.
state_means_df = hmm_model.compute_state_means()
print(state_means_df)

# Diagnostic plots of the fitted model.
hmm_model.plot_hidden_states()
hmm_model.plot_state_means_heatmap()
# The original call was missing its closing parenthesis (SyntaxError).
# NOTE(review): threshold presumably hides low-probability transitions — confirm in plot_transition_diagram.
hmm_model.plot_transition_diagram(threshold=0.05)

In [None]:
# Run the project training pipeline on training set A. It returns four values:
# the processed train and validation frames, a per-model results mapping, and
# an artifacts mapping that later cells read by key.
(
    df_train_processed,
    df_val_processed,
    results,
    artifacts,
) = train_models_utils.pipeline(df_train_A)

# From here on `hmm_model` refers to the HMM object produced by the pipeline.
hmm_model = artifacts["hmm_model"]

In [None]:

# Print every model's metrics: the model name on its own line, then one
# indented "name: value" line per metric (4 decimals), then a blank separator.
for model_name, metrics in results.items():
    print(model_name)
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")
    print()

In [None]:
# Validation targets produced by the training pipeline.
y_val = artifacts["y_val"]

# Window length used to trim the LSTM output. It is read from the pipeline
# artifacts with a fallback of 24, the same lookup the residuals cell uses.
# The name referenced here before (`train_lstm_seq_len`) is not defined anywhere
# in this notebook and raised NameError whenever an "LSTM" entry was present.
# NOTE(review): confirm the pipeline stores "lstm_seq_len"; otherwise 24 is assumed.
seq_len = artifacts.get("lstm_seq_len", 24)

# One forecast-vs-actual plot per model.
for model_name, y_pred in artifacts["predictions"].items():
    if model_name == "LSTM":
        # Drop the first seq_len positions from targets, index and predictions
        # (the original author notes these LSTM predictions are NaN).
        y_trimmed = y_val[seq_len:]
        x_index = df_val_processed.index[seq_len:]
        y_pred_trimmed = y_pred[seq_len:]
    else:
        y_trimmed = y_val
        x_index = df_val_processed.index
        y_pred_trimmed = y_pred

    plot_model.plot_forecast(x_index, y_trimmed, y_pred_trimmed, model_name)



In [None]:
# Feature names used to label the importance plots.
# NOTE(review): assumed to match the columns (and their order) the models were trained on — confirm in the pipeline.
feature_cols = ["TEMP", "HUMID", "SOLAR", "WIND", "WBCW", "WBHW", "hourOfDay"]

for model_name, model in artifacts["models"].items():
    # Only tree-based models have feature_importances_
    if model_name in ["Random Forest", "XGBoost", "HMM + RF", "HMM + XGBoost"]:
        plot_model.plot_feature_importance(model, feature_cols, model_name)
    # Linear Regression: use coefficient magnitudes as proxy
    elif model_name == "Linear Regression" and hasattr(model, "coef_"):
        # Flatten first: a model fitted on a 2-D single-column target exposes
        # coef_ with shape (1, n_features), which would otherwise make argsort
        # and the bar plot operate on a 2-D array. Multi-target fits are not handled.
        coefs = np.abs(np.ravel(model.coef_))
        # Ascending order, so the largest coefficient ends up as the top bar.
        idx = np.argsort(coefs)
        plt.figure(figsize=(8, 6))
        plt.barh(np.array(feature_cols)[idx], coefs[idx])
        plt.title(f"{model_name}: Coefficient Magnitude as Feature Importance")
        plt.xlabel("Coefficient magnitude")
        plt.show()





In [None]:
# Overlay the residuals (actual minus predicted) of every model on one figure.
plt.figure(figsize=(15, 6))

# Use LSTM seq_len from artifacts or default to 24
seq_len = artifacts.get("lstm_seq_len", 24)

# Choose dataset index: processed validation set if available
x_index_full = df_val_processed.index if "df_val_processed" in globals() else df_processed.index

# Prefer the validation targets and only look up "y" when "y_val" is absent.
# The previous `artifacts.get("y_val", artifacts["y"])` evaluated artifacts["y"]
# eagerly and raised KeyError whenever only "y_val" was stored.
y_full = artifacts["y_val"] if "y_val" in artifacts else artifacts["y"]

for model_name, y_pred in artifacts["predictions"].items():
    if model_name == "LSTM":
        # Skip the first seq_len positions for the LSTM so targets, predictions
        # and index stay aligned after trimming.
        y_trimmed = y_full[seq_len:]
        y_pred_trimmed = y_pred[seq_len:]
        x_index = x_index_full[seq_len:]
    else:
        y_trimmed = y_full
        y_pred_trimmed = y_pred
        x_index = x_index_full

    residuals = y_trimmed - y_pred_trimmed
    plt.plot(x_index, residuals, label=model_name)

plt.legend()
plt.title("Residuals of All Models")
plt.xlabel("Time")
plt.ylabel("Residual")
plt.grid(True)
plt.show()


