In [10]:
import os
import kagglehub

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')
import mlflow
import mlflow.sklearn

from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Execise 1

In this exercise, do the following:
1. Load the dataset used in the time series example - Energy consumption data. You can find it in the notebook "TSA_Example" in Time Series folder in Moodle.
2. Setup a nested MLFlow loop where different modelling experiments can be tracked and then use the dataset in point 1 to experiment and track models. You should do following combinations:
    1. At least 3 model types
    2. At least 3 different feature combinations
    3. At least 3 different options for 3 different hyperparameters
    4. At least 3 different time splits for train test
3. For each option in the combination, you should calculate & log the following in MLFlow:
    1. RMSE
    2. MAE
    3. Plot of actual vs predicted for 1 month data
    4. Plot of actual vs predicted for 1 week of data
    5. All of the combination info in point 2, such as which model, what feature combindation, what hyperparameter, what train test split has been used
4. Turn on MLFlow UI and track your experiments

### 1. Load the dataset used in the time series example - Energy consumption data. You can find it in the notebook "TSA_Example" in Time Series folder in Moodle.

In [2]:
# Download latest version
path = kagglehub.dataset_download("robikscube/hourly-energy-consumption")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/robikscube/hourly-energy-consumption?dataset_version_number=3...


100%|█████████████████████████████████████████████████████████████████████████████| 11.4M/11.4M [00:01<00:00, 11.8MB/s]

Extracting model files...





Path to dataset files: C:\Users\Bruger\.cache\kagglehub\datasets\robikscube\hourly-energy-consumption\versions\3


In [4]:
dataset_path = "C:/Users/bruger/.cache/kagglehub/datasets/robikscube/hourly-energy-consumption/versions/3"

files = os.listdir(dataset_path)
print(files)

['AEP_hourly.csv', 'COMED_hourly.csv', 'DAYTON_hourly.csv', 'DEOK_hourly.csv', 'DOM_hourly.csv', 'DUQ_hourly.csv', 'EKPC_hourly.csv', 'est_hourly.paruqet', 'FE_hourly.csv', 'NI_hourly.csv', 'PJME_hourly.csv', 'PJMW_hourly.csv', 'pjm_hourly_est.csv', 'PJM_Load_hourly.csv']


In [7]:
file_path = "C:/Users/bruger/.cache/kagglehub/datasets/robikscube/hourly-energy-consumption/versions/3/PJME_hourly.csv"

In [8]:
df = pd.read_csv(file_path)
df = df.set_index('Datetime')
df.index = pd.to_datetime(df.index)
df = df.sort_index()

In [63]:
%run mlflow_startup_interface.ipynb # it won't let me import it

In [67]:
start_mlflow()


Starting MLFlow UI...


[Open MLFlow UI](http://localhost:5000)

MLFlow UI is running at http://localhost:5000. Press Ctrl+C in the terminal to stop it.


In [9]:
df

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2002-01-01 01:00:00,30393.0
2002-01-01 02:00:00,29265.0
2002-01-01 03:00:00,28357.0
2002-01-01 04:00:00,27899.0
2002-01-01 05:00:00,28057.0
...,...
2018-08-02 20:00:00,44057.0
2018-08-02 21:00:00,43256.0
2018-08-02 22:00:00,41552.0
2018-08-02 23:00:00,38500.0


### 2. Setup a nested MLFlow loop where different modelling experiments can be tracked and then use the dataset in point 1 to experiment and track models. You should do following combinations:

    At least 3 model types
    At least 3 different feature combinations
    At least 3 different options for 3 different hyperparameters
    At least 3 different time splits for train test


In [11]:
def create_features(df):
    """
    Create time series features and lag features based on time series index.
    """
    df = df.copy()

    # Basic time-based features
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['quarter'] = df.index.quarter
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['dayofmonth'] = df.index.day
    df['weekofyear'] = df.index.isocalendar().week

    # Lag features
    df['lag_1d'] = df['PJME_MW'].shift(1)   # 1 day lag
    df['lag_1w'] = df['PJME_MW'].shift(7)   # 1 week lag
    df['lag_1m'] = df['PJME_MW'].shift(30)  # 1 month lag (approx. 30 days)
    df['lag_1y'] = df['PJME_MW'].shift(365) # 1 year lag

    # Rolling statistics features
    df['rolling_mean_3d'] = df['PJME_MW'].rolling(window=3).mean()  # Last 3 days rolling mean
    df['rolling_mean_30d'] = df['PJME_MW'].rolling(window=30).mean()  # Last month rolling mean
    df['rolling_mean_same_month_last_year'] = df['PJME_MW'].shift(365).rolling(window=30).mean()  # Same month previous year rolling mean
    df['rolling_mean_same_week_last_year'] = df['PJME_MW'].shift(365).rolling(window=7).mean()  # Same week previous year rolling mean

    return df

df = create_features(df)

In [13]:
df = df.dropna()

In [14]:
df

Unnamed: 0_level_0,PJME_MW,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,lag_1d,lag_1w,lag_1m,lag_1y,rolling_mean_3d,rolling_mean_30d,rolling_mean_same_month_last_year,rolling_mean_same_week_last_year
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2002-01-17 11:00:00,34115.0,11,3,1,1,2002,17,17,3,34638.0,25708.0,26174.0,30748.0,34485.333333,32294.733333,30465.500000,28444.000000
2002-01-17 12:00:00,33835.0,12,3,1,1,2002,17,17,3,34115.0,26130.0,28361.0,34725.0,34196.000000,32477.200000,30609.900000,29181.428571
2002-01-17 13:00:00,33368.0,13,3,1,1,2002,17,17,3,33835.0,28123.0,32443.0,37313.0,33772.666667,32508.033333,30878.166667,30494.571429
2002-01-17 14:00:00,33152.0,14,3,1,1,2002,17,17,3,33368.0,32359.0,34902.0,37322.0,33451.666667,32449.700000,31177.000000,31906.714286
2002-01-17 15:00:00,32662.0,15,3,1,1,2002,17,17,3,33152.0,34860.0,34752.0,37035.0,33060.666667,32380.033333,31481.533333,33297.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-08-02 20:00:00,44057.0,20,3,3,8,2018,214,2,31,45641.0,45372.0,45313.0,42771.0,45486.000000,41515.666667,38934.233333,38711.142857
2018-08-02 21:00:00,43256.0,21,3,3,8,2018,214,2,31,44057.0,46534.0,46430.0,43742.0,44318.000000,41409.866667,38948.666667,40083.428571
2018-08-02 22:00:00,41552.0,22,3,3,8,2018,214,2,31,43256.0,47154.0,47867.0,44607.0,42955.000000,41199.366667,38906.000000,41346.428571
2018-08-02 23:00:00,38500.0,23,3,3,8,2018,214,2,31,41552.0,46989.0,48855.0,45057.0,41102.666667,40854.200000,38805.866667,42449.000000


In [None]:
# Split the data
X = df
y = df['PJME_MW']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)

# Define different feature combinations
feature_combinations = [
    iris.feature_names[:2],  # First two features
    iris.feature_names[2:],  # Last two features
    iris.feature_names       # All features
]

time_split = ['01-01-2015', '01-01-2016', '01-01-2017']

In [None]:

# Start MLFlow experiment
mlflow.set_experiment("MLFlow Energy Consumption data")

results = []

# Train models with different feature combinations
for features in feature_combinations:
    for model_name, model in zip(["RandomForest", "LogisticRegression"], 
                                 [RandomForestClassifier(random_state=42), LogisticRegression(max_iter=200)]):
        with mlflow.start_run():
            # Log feature combination and model type
            mlflow.log_param("features", features)
            mlflow.log_param("model_type", model_name)

            for split in time_split:
                    
                # Train test split
                train = X.loc[X.index < split]
                test = X.loc[X.index >= split]
                
                X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.2, random_state=42)
                
                # Train the model
                model.fit(X_train[features], y_train)
                predictions = model.predict(X_test[features])
                
                # Calculate metrics
                accuracy = accuracy_score(y_test, predictions)
                precision = precision_score(y_test, predictions, average='weighted')
                recall = recall_score(y_test, predictions, average='weighted')
                f1 = f1_score(y_test, predictions, average='weighted')
                
                # Log metrics
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("f1_score", f1)
                
                # Create and log a plot of the metrics
                fig, ax = plt.subplots(figsize=(8, 4))
                metrics = [accuracy, precision, recall, f1]
                metric_names = ["Accuracy", "Precision", "Recall", "F1 Score"]
                ax.bar(metric_names, metrics, color='skyblue')
                ax.set_title(f"{model_name} Metrics for Feature Set: {features}")
                ax.set_ylim(0, 1)
                
                # Save the plot to a temporary file and log it as an artifact
                temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
                plt.savefig(temp_file.name)
                mlflow.log_artifact(temp_file.name, artifact_path="plots")
                temp_file.close()
                
                # Log the model
                mlflow.sklearn.log_model(model, model_name)
                
                # Store results for summary
                results.append({
                    "features": features,
                    "model": model_name,
                    "accuracy": accuracy,
                    "precision": precision,
                    "recall": recall,
                    "f1_score": f1
                })

# Print summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Experiment Results:")
print(results_df)

### 3. For each option in the combination, you should calculate & log the following in MLFlow: