In [1]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.10.14


In [None]:
import os
import torch
import matplotlib.pyplot as plt
import glob
import pandas as pd
from tqdm.autonotebook import tqdm
import matplotlib.dates as mdates
from itertools import islice
from collections import defaultdict
from datetime import datetime
import numpy as np

  from tqdm.autonotebook import tqdm


In [None]:
from autogluon.timeseries.metrics import TimeSeriesScorer
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [4]:
# This assignment targets the AutoGluon base class itself, so it applies to
# every scorer that inherits the attribute without overriding it, for the
# rest of the kernel session.
# NOTE(review): presumably done so that evaluate() reports error metrics as
# positive numbers rather than sign-flipped scores - confirm this is intended.
TimeSeriesScorer.greater_is_better_internal = True


class NRMSE(TimeSeriesScorer):
    """RMSE of the mean forecast, normalised by the mean of the actual values."""

    greater_is_better_internal = True
    optimum = 0.0

    def compute_metric(self, data_future, predictions, target, **kwargs):
        """Return ``RMSE(actual, predictions["mean"]) / mean(actual)``.

        ``data_future[target]`` holds the actual values and
        ``predictions["mean"]`` the point forecasts. No guard is applied for
        a zero mean of the actuals.
        """
        actual = data_future[target]
        squared_error = np.square(actual - predictions["mean"])
        rmse = np.sqrt(np.mean(squared_error))
        return rmse / actual.mean()

In [5]:
# Data pipelining
def get_batched_data_fn(sub_df,
    batch_size: int = 128,
    context_len: int = 168,
    horizon_len: int = 24):
    """Cut ``sub_df["y"]`` into sliding (context, horizon) windows.

    Window starts advance by ``horizon_len`` rows, so consecutive horizons do
    not overlap while consecutive contexts do. Rows are selected by position.

    Args:
        sub_df: DataFrame with a ``"y"`` column.
        batch_size: Unused; kept so callers that pass it keep working.
        context_len: Number of rows in each input window.
        horizon_len: Number of rows in each output window, and the stride
            between window starts.

    Returns:
        A ``defaultdict(list)`` holding four parallel lists:
        ``"inputs"`` / ``"outputs"`` (window values as Python lists) and
        ``"inputs_ts"`` / ``"outputs_ts"`` (the matching index slices).
        No entries are added when ``sub_df`` has fewer than
        ``context_len + horizon_len`` rows.
    """
    examples = defaultdict(list)
    values = sub_df["y"]
    timestamps = sub_df.index
    window_len = context_len + horizon_len

    # "+ 1" keeps the window that ends exactly on the last row; without it the
    # final full window is dropped whenever the series length lines up with
    # the stride.
    for start in range(0, len(sub_df) - window_len + 1, horizon_len):
        context_end = start + context_len
        horizon_end = context_end + horizon_len

        examples["inputs"].append(values.iloc[start:context_end].tolist())
        examples["outputs"].append(values.iloc[context_end:horizon_end].tolist())
        examples["inputs_ts"].append(timestamps[start:context_end])
        examples["outputs_ts"].append(timestamps[context_end:horizon_end])

    return examples

In [None]:
def forecast_building(df):
    """Fit AutoARIMA on all windows of one building and score the forecasts.

    Each ``item_id`` in ``df`` is treated as an independent series. The last
    24 rows of every series are held out, the model is fitted on the rest,
    and the held-out rows are forecast and evaluated.

    Args:
        df: Long-format frame whose index becomes a regular column via
            ``reset_index()`` before conversion to ``TimeSeriesDataFrame``.
            The code below relies on ``item_id``, ``timestamp`` and
            ``target`` being present after that step. The caller's frame is
            not modified.

    Returns:
        A tuple ``(res_all_df, agg_metrics)``: ``res_all_df`` has the columns
        ``timestamp``, ``y_true`` and ``y_pred`` sorted by ``timestamp``;
        ``agg_metrics`` is whatever ``predictor.evaluate`` returns for the
        requested metrics.
    """
    # Cast every non-string column to float32. ``astype`` returns a new frame,
    # so the DataFrame passed in by the caller is left untouched.
    float_cols = [
        col for col in df.columns
        if df[col].dtype != 'object' and not pd.api.types.is_string_dtype(df[col])
    ]
    df = df.astype({col: 'float32' for col in float_cols})

    ts_data = TimeSeriesDataFrame(df.reset_index())

    # 24 steps: the data is hourly, so this is a one-day-ahead forecast.
    prediction_length = 24
    train_data, test_data = ts_data.train_test_split(prediction_length)

    predictor = TimeSeriesPredictor(prediction_length=prediction_length).fit(
        train_data,
        hyperparameters={
            "AutoARIMA": {
                "seasonal_period": 168
            }
        },
        skip_model_selection=True,
        verbosity=0,
    )
    predictions = predictor.predict(train_data)

    # "MSE" used to be listed twice; the duplicate is dropped.
    # NOTE(review): the second entry may have been meant as "MASE" - confirm.
    agg_metrics = predictor.evaluate(
        ts_data,
        metrics=["RMSE", "MSE", "MAE", "MAPE", "SMAPE", NRMSE(), "SQL"],
    )

    # Keep only the actuals that have a forecast, then line the forecasts up
    # next to them (``insert`` aligns on the shared index).
    has_forecast = test_data.index.isin(predictions.index)
    res_all = pd.DataFrame(test_data[has_forecast].target)
    res_all.columns = ['y_true']
    res_all.insert(1, 'y_pred', predictions['mean'])
    res_all_df = res_all.reset_index().drop('item_id', axis = 1).sort_values('timestamp')

    return res_all_df, agg_metrics


In [None]:
def process_building(df):
    """Window one building's series and forecast every window.

    Args:
        df: Single-column DataFrame; the column label is taken as the
            building name. The column is renamed to ``'y'`` on the frame
            that was passed in.

    Returns:
        The ``(res, agg_metrics)`` pair produced by ``forecast_building``.
    """
    building_name = df.columns[0]
    df.columns = ['y']
    windows = get_batched_data_fn(df, batch_size=500)

    window_fields = zip(windows['inputs_ts'],
                        windows['inputs'],
                        windows['outputs_ts'],
                        windows['outputs'])

    # One frame per window: context rows followed by horizon rows, tagged
    # with an item id that is unique per window (numbering starts at 1).
    window_frames = []
    for window_no, (ctx_ts, ctx_values, hor_ts, hor_values) in enumerate(window_fields, start=1):
        context_part = pd.DataFrame({'timestamp': ctx_ts, 'target': ctx_values})
        horizon_part = pd.DataFrame({'timestamp': hor_ts, 'target': hor_values})

        window_df = pd.concat([context_part, horizon_part], axis=0)
        window_df['item_id'] = f'{building_name!s}_{window_no}'
        window_df['item_id_no'] = window_no
        window_frames.append(window_df)

    long_df = pd.concat(window_frames)
    long_df['timestamp'] = pd.to_datetime(long_df['timestamp'])
    long_df = long_df.set_index('timestamp')

    return forecast_building(long_df)

In [None]:
def _combine_forecasts(forecast_frames):
    """Stack per-building forecast frames, leaving exactly one "timestamp" column."""
    combined = pd.concat(forecast_frames).round(6)
    if 'timestamp' in combined.columns:
        # The timestamps are already a column; promoting the old row index
        # would add a second, meaningless column that also ends up being
        # called "timestamp".
        return combined.reset_index(drop=True)
    combined = combined.reset_index()
    return combined.rename(columns={combined.columns[0]: "timestamp"})


def _save_outputs(filename, forecast_frames, metric_frames):
    """Write the combined forecast and metric CSVs and return both frames."""
    res_all_df = _combine_forecasts(forecast_frames)
    res_all_df.to_csv(f'forecasts/{dataset}/{os.path.basename(filename)}', index=False)

    agg_metrics_all_df = pd.concat(metric_frames).round(6)
    agg_metrics_all_df.to_csv(f'results/{dataset}/agg_metrics_{os.path.basename(filename)}', index=False)

    return res_all_df, agg_metrics_all_df


def process_file(filename):
    """Forecast every building column of one CSV file and save the results.

    The CSV is read with its ``timestamp`` column as index; every remaining
    column is treated as one building. Output goes to
    ``forecasts/{dataset}/<basename>`` and
    ``results/{dataset}/agg_metrics_<basename>``, where ``dataset`` is a
    module-level variable that must be defined before this is called.

    Args:
        filename: Path of the input CSV.

    Returns:
        ``None`` when the file has fewer than two building columns, otherwise
        the tuple ``(res_all_df, agg_metrics_all_df)`` that was written.
    """
    df = pd.read_csv(filename)
    df = df.set_index(['timestamp'])

    if df.shape[1] < 2:
        return None

    print(datetime.now(), df.shape, flush=True)

    res_all = []
    agg_metrics_all = []

    for i, building_name in enumerate(df.columns):
        print(datetime.now(), i, '/', len(df.columns), building_name, flush=True)
        df1 = df[[building_name]]
        print(datetime.now(), i, '/', len(df.columns), building_name, df1.shape, flush=True)
        # Drop the leading rows before the first non-missing reading.
        df1 = df1.loc[df1.first_valid_index():]
        print(datetime.now(), i, '/', len(df.columns), building_name, df1.shape, flush=True)

        res, agg_metrics = process_building(df1)
        res['building'] = building_name
        res['filename'] = filename
        res_all.append(res)

        agg_metrics_df = pd.DataFrame([agg_metrics])
        agg_metrics_df.insert(0, 'building', building_name)
        agg_metrics_df.insert(0, 'filename', filename)
        agg_metrics_all.append(agg_metrics_df)

        # Checkpoint after every fifth building so a crash loses little work.
        if (i + 1) % 5 == 0:
            print(datetime.now(), 'Saving...')
            _save_outputs(filename, res_all, agg_metrics_all)

    return _save_outputs(filename, res_all, agg_metrics_all)

In [10]:
# Every path in the processed Bareilly folder whose name ends in "csv".
files_list = glob.glob('/home/user/New_Buildings_Datasets/Mathura_and_Bareilly/dataverse_files/processed/Bareilly/*csv')

# Output sub-folder name; process_file() reads this module-level variable.
dataset = 'Bareilly-arima'
for out_root in ('forecasts', 'results'):
    os.makedirs(f'{out_root}/{dataset}/', exist_ok = True)

# process_file() writes its own CSVs, so the return value is not saved here.
for filename in files_list:
    print(datetime.now(), filename)
    results = process_file(filename)
    print('')

2024-11-11 16:33:48.056449 /home/user/New_Buildings_Datasets/Mathura_and_Bareilly/dataverse_files/processed/Bareilly/Bareilly_2021.csv
2024-11-11 16:33:48.088808 (7296, 38)
2024-11-11 16:33:48.089369 0 / 38 BR02
2024-11-11 16:33:48.090418 0 / 38 BR02 (7296, 1)
2024-11-11 16:33:48.091319 0 / 38 BR02 (7296, 1)
2024-11-11 16:34:01.626317 1 / 38 BR04
2024-11-11 16:34:01.628320 1 / 38 BR04 (7296, 1)
2024-11-11 16:34:01.629273 1 / 38 BR04 (7296, 1)
2024-11-11 16:34:05.879654 2 / 38 BR05
2024-11-11 16:34:05.881390 2 / 38 BR05 (7296, 1)
2024-11-11 16:34:05.882570 2 / 38 BR05 (7296, 1)
2024-11-11 16:34:07.895644 3 / 38 BR06
2024-11-11 16:34:07.896785 3 / 38 BR06 (7296, 1)
2024-11-11 16:34:07.897883 3 / 38 BR06 (7296, 1)
2024-11-11 16:34:12.018921 4 / 38 BR08
2024-11-11 16:34:12.020288 4 / 38 BR08 (7296, 1)
2024-11-11 16:34:12.021152 4 / 38 BR08 (7296, 1)
2024-11-11 16:34:13.867837 Saving...
2024-11-11 16:34:14.023776 5 / 38 BR09
2024-11-11 16:34:14.025927 5 / 38 BR09 (7296, 1)
2024-11-11 16:34: