In [1]:
# -----********************-----

# Created Time: 2024/12/09

# Last updated: 2024/12/11

# Author: Tara Liu, Yiyi He

### Use Case

# This notebook explores the application of autoregressive models
# 1. 

# -----********************-----

# Libraries

In [10]:
# Import libraries
import os
import warnings
warnings.filterwarnings("ignore")

# Stats
from statsmodels.tsa.api import ARDL
import statsmodels.api as sm
import numpy as np
from statsmodels.tsa.ardl import ardl_select_order
from statsmodels.tsa.stattools import grangercausalitytests

# Geo
from shapely.geometry import Point, Polygon
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 1000

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Processing
from tqdm import tqdm

In [17]:
# Define functions
def find_lag(df, maxlag):

    time_start = pd.Timestamp.now()
    
    sel_res = ardl_select_order(
        df['pct_blackout'],
        exog=df[['t2m', 'wind_speed', 'tp']],
        maxlag=maxlag,
        ic='aic',
        maxorder=maxlag,
        causal=True,
        trend='ct'
        )
    
    time_selected = pd.Timestamp.now()
    print(f'time elapsed for selecting order: {time_selected-time_start}')
    
    return sel_res.model.ardl_order

In [5]:
# Load input dataframe
df_518 = pd.read_csv("/Users/yiyi/Desktop/df_for_model.csv")
df_518.head(3)

Unnamed: 0,datetime,pct_blackout,wind_forest_cover,t2m,tp,wind_speed,station_id,climate_zone_code
0,2014-11-17 17:00:00,0.0,0.0,295.09723,0.0005645638,1.31093,495,Aw
1,2014-11-29 17:00:00,0.0,0.0,293.0487,0.0,2.215274,495,Aw
2,2014-11-30 22:00:00,0.0,0.0,289.83447,4.351137e-07,2.000125,495,Aw


In [48]:
s_id = 100
station_df = df_518[df_518['station_id'] == s_id]
max_lag = 5
sel_res = ardl_select_order(
        station_df['pct_blackout'],
        exog=station_df[['t2m', 'wind_speed', 'tp']],
        maxlag=maxlag,
        ic='aic',
        maxorder=maxlag,
        causal=True,
        trend='ct'
        )
sel_res.model.ardl_order

(3, 1, 1)

# ARDL find lag

In [11]:
# Load input dataframe
df_518 = pd.read_csv("/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_forest_perc_50kmBu_district_climate_dummy_df.csv",
                     index_col=0)
df_518.head(3)

Unnamed: 0,datetime,pct_blackout,wind_forest_cover,t2m,tp,wind_speed,station_id,climate_zone_code,forest_perc_50kmbuffer,forest_perc_district,wind_forest_cover_50kmbu,wind_forest_cover_district,Am,Aw,BSh,Cw
0,2014-11-17 17:00:00,0.0,0.0,295.09723,0.0005645638,1.31093,495,Aw,0.0006,0.0006,0.000787,0.000787,0,1,0,0
1,2014-11-29 17:00:00,0.0,0.0,293.0487,0.0,2.215274,495,Aw,0.0006,0.0006,0.001329,0.001329,0,1,0,0
2,2014-11-30 22:00:00,0.0,0.0,289.83447,4.351137e-07,2.000125,495,Aw,0.0006,0.0006,0.0012,0.0012,0,1,0,0


In [12]:
# Create a list of unique station ids
station_id_lst = list(set(df_518.station_id.unique()))
# Initiate an empty dictionary
station_id_lag_dic = {}
# Set max lag
maxlag = 24
# Iterate through all stations
for s_id in tqdm(station_id_lst):
    # Subset station data
    station_df = df_518[df_518['station_id'] == s_id].sort_values(by='datetime')
    # Find optimum lag and store station id with optimum lag in dictionary
    sel_res = ardl_select_order(
        station_df['pct_blackout'],
        exog=station_df[['t2m', 'wind_speed', 'tp']],
        maxlag=maxlag,
        ic='bic',
        maxorder=maxlag,
        causal=True,
        trend='ct'
        )
    # optimum lag values for endogenous variables
    endo_res_lst = list(sel_res.aic.values[0][1].values())
    # insert the optimum lag value for exdogenous variable
    endo_res_lst.insert(0, sel_res.aic.values[0][0])
    # store station id and optimum lag values in dictionary
    station_id_lag_dic[s_id] = endo_res_lst # key: station id, value:[pct_blackout_lag, t2m_lag, wind_speed_lag, tp_lag]

station_id_lag_dic_copy = station_id_lag_dic
station_id_lag_df = pd.DataFrame.from_dict(station_id_lag_dic_copy, orient='index')

station_id_optimum_lag_bic_df = station_id_lag_df.reset_index().rename(columns={
                                                          'index': 'station_id',
                                                          0: 'pct_blackout_lag',
                                                          1: 't2m_lag',
                                                          2: 'wind_speed_lag',
                                                          3: 'tp_lag'
                                                          })
station_id_optimum_lag_bic_df.to_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_516_ARDL_optimum_lag_bic_max24_df.csv')

  8%|▊         | 43/516 [111:32:11<1226:54:08, 9337.95s/it] 


LinAlgError: SVD did not converge in Linear Least Squares

# Granger Causality

In [None]:
# https://www.machinelearningplus.com/time-series/granger-causality-test-in-python/
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False):    
    """Check Granger Causality of all possible combinations of the Time series.
    The rows are the response variable, columns are predictors. The values in the table 
    are the P-Values. P-Values lesser than the significance level (0.05), implies 
    the Null Hypothesis that the coefficients of the corresponding past values is 
    zero, that is, the X does not cause Y can be rejected.

    data      : pandas dataframe containing the time series variables
    variables : list containing names of the time series variables.
    """
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in df.columns:
        for r in df.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            df.loc[r, c] = min_p_value
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]
    return df

In [2]:
def select_p(train_df):
    aic, bic, fpe, hqic = [], [], [], []
    model = VAR(train_df) 
    p = np.arange(1,60)
    for i in p:
        result = model.fit(i)
        aic.append(result.aic)
        bic.append(result.bic)
        fpe.append(result.fpe)
        hqic.append(result.hqic)
    lags_metrics_df = pd.DataFrame({'AIC': aic, 
                                    'BIC': bic, 
                                    'HQIC': hqic,
                                    'FPE': fpe}, 
                                   index=p)    
    fig, ax = plt.subplots(1, 4, figsize=(15, 3), sharex=True)
    lags_metrics_df.plot(subplots=True, ax=ax, marker='o')
    plt.tight_layout()
    print(lags_metrics_df.idxmin(axis=0))

In [3]:
# Load input dataframe
df_518 = pd.read_csv("/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_forest_perc_50kmBu_district_climate_dummy_df.csv",
                     index_col=0)
df_518.head(3)

Unnamed: 0,datetime,pct_blackout,wind_forest_cover,t2m,tp,wind_speed,station_id,climate_zone_code,forest_perc_50kmbuffer,forest_perc_district,wind_forest_cover_50kmbu,wind_forest_cover_district,Am,Aw,BSh,Cw
0,2014-11-17 17:00:00,0.0,0.0,295.09723,0.0005645638,1.31093,495,Aw,0.0006,0.0006,0.000787,0.000787,0,1,0,0
1,2014-11-29 17:00:00,0.0,0.0,293.0487,0.0,2.215274,495,Aw,0.0006,0.0006,0.001329,0.001329,0,1,0,0
2,2014-11-30 22:00:00,0.0,0.0,289.83447,4.351137e-07,2.000125,495,Aw,0.0006,0.0006,0.0012,0.0012,0,1,0,0


In [None]:
# Create a list of unique station ids
station_id_lst = list(set(df_518.station_id.unique()))
# Initiate an empty dictionary
station_id_lag_dic = {}
# Set max lag
maxlag = 12
# Iterate through all stations
for s_id in tqdm(station_id_lst):
    # Subset station data
    station_df = df_518[df_518['station_id'] == s_id].sort_values(by='datetime')

    

    break

# VAR

In [26]:
def testMaxLagAndTime_VAR(df, maxlag):
    '''
    This function is used to test max lag selection for VAR model and record time.
    
    Args:
    df: pd.DataFrame, the dataframe with a cerntain station_id
    maxlag: int, the maximum lag to be tested
    
    Params:
    See var_select_order's parameters
    
    Returns:
    test_model: VAR model, the model with the best maxlag
    '''

    print('Begin testing...')
    print(f'testing maxlag: {maxlag}')
    time_start = pd.Timestamp.now()

    # select order with VAR
    order_selected = VAR(df[['pct_blackout', 't2m', 'wind_speed', 'tp']]).select_order(
        maxlags=maxlag)
    
    # fit VAR model
    time_selected = pd.Timestamp.now()
    print(f'time elapsed for selecting order: {time_selected-time_start}')
    
    test_model = VAR(df[['pct_blackout', 't2m', 'wind_speed', 'tp']]).fit(
        maxlags=maxlag, method='ols', ic='aic', verbose=True, trend='ctt')
    
    # print results
    print(f'order selected: {order_selected}')
    # print aic
    print(f'AIC: {test_model.aic}')
    
    
    time_fit = pd.Timestamp.now()
    print(f'time elapsed for fitting model: {time_fit-time_selected}')

    time_end = pd.Timestamp.now()
    print(f'time elapsed for the whole process: {time_end-time_start}')
    print('End testing...')
    print('model summary:')
    print(test_model.summary())

    return test_model

In [4]:
# Read input csv
df_518 = pd.read_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_forest_perc_50kmBu_district_climate_dummy_df.csv',
                     index_col=0)

In [5]:
s_id = 100
station_df = df_518[df_518['station_id'] == s_id]
max_lag = 5

In [None]:
df.head(10)

In [27]:
testMaxLagAndTime_VAR(station_df, 4)

Begin testing...
testing maxlag: 4
time elapsed for selecting order: 0 days 00:00:00.012770
<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 1, BIC -> 0, FPE -> 1, HQIC ->  0>
Using 1 based on aic criterion
order selected: <statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 1, BIC -> 0, FPE -> 1, HQIC ->  0>
AIC: -15.533333109884271
time elapsed for fitting model: 0 days 00:00:00.010178
time elapsed for the whole process: 0 days 00:00:00.022983
End testing...
model summary:
  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 16, Dec, 2024
Time:                     19:46:21
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                   -15.3590
Nobs:                     740.000    HQIC:                  -15.4661
Log likelihood:           1575.28    FPE:                1.794

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


<statsmodels.tsa.vector_ar.var_model.VARResultsWrapper at 0x48483bc40>