In [1]:
# -----********************-----

# Created Time: 2024/12/09

# Last updated: 2024/12/11

# Author: Yiyi He, Tara Liu

### Use Case

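# This notebook explores the application of autoregressive models to station-level blackout data:
# 1. ARDL lag-order selection (hourly and daily data) with weather variables as regressors
# 2. Granger causality tests (hourly and daily data) between weather variables and blackouts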

# -----********************-----

# Libraries

In [10]:
# Import libraries
import functools as ft
import os
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

# Stats
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.api import ARDL
from statsmodels.tsa.ardl import ardl_select_order
from statsmodels.tsa.stattools import grangercausalitytests

# Geo
from shapely.geometry import Point, Polygon
# import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 1000

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Processing
# tqdm wraps every per-station loop in this notebook, so it must be imported
from tqdm import tqdm

In [2]:
# Define functions
def find_lag(df, maxlag, target='pct_blackout',
             exog_cols=('t2m', 'wind_speed', 'tp'), ic='aic', trend='ct'):
    """Select the ARDL lag order for one time series and report the run time.

    Runs ``ardl_select_order`` with ``maxlag`` as the upper bound for both the
    autoregressive lags of ``target`` and the distributed lags of every column
    in ``exog_cols``. The defaults reproduce the original hard-coded setup.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target`` and every column in ``exog_cols``. Rows are
        used in the order given, so sort chronologically before calling.
    maxlag : int
        Passed as both ``maxlag`` (AR lags) and ``maxorder`` (regressor lags).
    target : str, default 'pct_blackout'
        Name of the dependent-variable column.
    exog_cols : sequence of str, default ('t2m', 'wind_speed', 'tp')
        Names of the regressor columns.
    ic : str, default 'aic'
        Information criterion handed to ``ardl_select_order``.
    trend : str, default 'ct'
        Trend specification handed to ``ardl_select_order``
        ('ct' is constant plus time trend in statsmodels).

    Returns
    -------
    The ``ardl_order`` attribute of the selected model.

    Raises
    ------
    KeyError
        If ``target`` or any of ``exog_cols`` is not a column of ``df``.
    """
    exog_cols = list(exog_cols)
    # Fail early with a readable message instead of deep inside pandas/statsmodels
    missing = [col for col in [target, *exog_cols] if col not in df.columns]
    if missing:
        raise KeyError(f'columns missing from df: {missing}')

    time_start = pd.Timestamp.now()

    sel_res = ardl_select_order(
        df[target],
        exog=df[exog_cols],
        maxlag=maxlag,
        ic=ic,
        maxorder=maxlag,
        # causal=True: per the statsmodels docs, lag 0 of the regressors is excluded
        causal=True,
        trend=trend,
    )

    time_selected = pd.Timestamp.now()
    print(f'time elapsed for selecting order: {time_selected-time_start}')

    return sel_res.model.ardl_order

In [5]:
# Read the modelling table from disk and preview its first three rows
model_input_csv = "/Users/yiyi/Desktop/df_for_model.csv"
df_518 = pd.read_csv(model_input_csv)
df_518.head(3)

Unnamed: 0,datetime,pct_blackout,wind_forest_cover,t2m,tp,wind_speed,station_id,climate_zone_code
0,2014-11-17 17:00:00,0.0,0.0,295.09723,0.0005645638,1.31093,495,Aw
1,2014-11-29 17:00:00,0.0,0.0,293.0487,0.0,2.215274,495,Aw
2,2014-11-30 22:00:00,0.0,0.0,289.83447,4.351137e-07,2.000125,495,Aw


In [48]:
# Select the ARDL order for one station as a quick check
s_id = 100
# Sort chronologically so that lags refer to earlier observations
# (the all-station loop further down applies the same ordering)
station_df = df_518[df_518['station_id'] == s_id].sort_values(by='datetime')
max_lag = 5
sel_res = ardl_select_order(
    station_df['pct_blackout'],
    exog=station_df[['t2m', 'wind_speed', 'tp']],
    # Use the bound defined in this cell; the previous `maxlag` is not defined
    # here and only resolved when another cell had left it in the kernel
    maxlag=max_lag,
    ic='aic',
    maxorder=max_lag,
    causal=True,
    trend='ct',
)
sel_res.model.ardl_order

(3, 1, 1)

# ARDL find lag

## Hourly

In [11]:
# Load the hourly station table; the first CSV column is used as the index
hourly_station_csv = "/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_forest_perc_50kmBu_district_climate_dummy_df.csv"
df_518 = pd.read_csv(hourly_station_csv, index_col=0)
df_518.head(3)

Unnamed: 0,datetime,pct_blackout,wind_forest_cover,t2m,tp,wind_speed,station_id,climate_zone_code,forest_perc_50kmbuffer,forest_perc_district,wind_forest_cover_50kmbu,wind_forest_cover_district,Am,Aw,BSh,Cw
0,2014-11-17 17:00:00,0.0,0.0,295.09723,0.0005645638,1.31093,495,Aw,0.0006,0.0006,0.000787,0.000787,0,1,0,0
1,2014-11-29 17:00:00,0.0,0.0,293.0487,0.0,2.215274,495,Aw,0.0006,0.0006,0.001329,0.001329,0,1,0,0
2,2014-11-30 22:00:00,0.0,0.0,289.83447,4.351137e-07,2.000125,495,Aw,0.0006,0.0006,0.0012,0.0012,0,1,0,0


In [35]:
# Unique station ids, sorted so the output rows come out in a reproducible order
station_id_lst = sorted(df_518['station_id'].unique())
# Station id -> [pct_blackout_lag, t2m_lag, wind_speed_lag, tp_lag]
station_id_lag_dic = {}
# Set max lag
maxlag = 24
# Iterate through all stations
for s_id in tqdm(station_id_lst):
    # Subset station data in chronological order
    station_df = df_518[df_518['station_id'] == s_id].sort_values(by='datetime')
    # Search lag orders for this station using BIC
    sel_res = ardl_select_order(
        station_df['pct_blackout'],
        exog=station_df[['t2m', 'wind_speed', 'tp']],
        maxlag=maxlag,
        ic='bic',
        maxorder=maxlag,
        causal=True,
        trend='ct',
    )
    # Read the top entry of the BIC ranking so it matches ic='bic' and the
    # "bic" output file; the previous code read the AIC ranking here.
    best_bic_entry = sel_res.bic.values[0]
    # Entry [1] maps each exogenous (weather) variable to its selected lags
    endo_res_lst = list(best_bic_entry[1].values())
    # Entry [0] holds the lags of the endogenous variable (pct_blackout); put it first
    endo_res_lst.insert(0, best_bic_entry[0])
    # key: station id, value: [pct_blackout_lag, t2m_lag, wind_speed_lag, tp_lag]
    station_id_lag_dic[s_id] = endo_res_lst

# Shallow copy, so later edits to the working dictionary do not affect the saved table
station_id_lag_dic_copy = dict(station_id_lag_dic)
station_id_lag_df = pd.DataFrame.from_dict(station_id_lag_dic_copy, orient='index')

station_id_optimum_lag_bic_df = station_id_lag_df.reset_index().rename(columns={
    'index': 'station_id',
    0: 'pct_blackout_lag',
    1: 't2m_lag',
    2: 'wind_speed_lag',
    3: 'tp_lag',
})
station_id_optimum_lag_bic_df.to_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_516_ARDL_optimum_lag_bic_max24_df.csv')

## Daily

### 8 variables

#### Correlation

In [13]:
# Daily station-level aggregates; the first CSV column is used as the index
daily_agg_csv = '/Users/yiyihe/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_weather_daily_agg.csv'
station_daily_agg = pd.read_csv(daily_agg_csv, index_col=0)

# Blackout minutes plus every daily weather summary considered for the model
correlation_columns = [
    'blackout_minutes',
    't2m_min', 't2m_median', 't2m_max',
    'wind_speed_median', 'wind_speed_max',
    'tp_median', 'tp_max',
]
correlation_matrix = station_daily_agg.loc[:, correlation_columns].corr()
# Colour the cells so strongly correlated pairs stand out
correlation_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,blackout_minutes,t2m_min,t2m_median,t2m_max,wind_speed_median,wind_speed_max,tp_median,tp_max
blackout_minutes,1.0,0.019081,0.025478,0.022975,-0.003872,0.003098,0.034876,0.040627
t2m_min,0.019081,1.0,0.909367,0.71937,0.261768,0.353571,0.194946,0.220751
t2m_median,0.025478,0.909367,1.0,0.919263,0.181428,0.295282,0.007708,0.024257
t2m_max,0.022975,0.71937,0.919263,1.0,0.094203,0.227821,-0.172067,-0.171307
wind_speed_median,-0.003872,0.261768,0.181428,0.094203,1.0,0.871425,0.092445,0.105082
wind_speed_max,0.003098,0.353571,0.295282,0.227821,0.871425,1.0,0.068095,0.088638
tp_median,0.034876,0.194946,0.007708,-0.172067,0.092445,0.068095,1.0,0.812369
tp_max,0.040627,0.220751,0.024257,-0.171307,0.105082,0.088638,0.812369,1.0


In [14]:
# Correlation restricted to blackout minutes and the daily-maximum weather variables
max_weather_columns = ['blackout_minutes', 't2m_max', 'wind_speed_max', 'tp_max']
correlation_matrix = station_daily_agg.loc[:, max_weather_columns].corr()
correlation_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,blackout_minutes,t2m_max,wind_speed_max,tp_max
blackout_minutes,1.0,0.022975,0.003098,0.040627
t2m_max,0.022975,1.0,0.227821,-0.171307
wind_speed_max,0.003098,0.227821,1.0,0.088638
tp_max,0.040627,-0.171307,0.088638,1.0


#### ARDL

In [1]:
# Read in dataframe
station_daily_agg = pd.read_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_weather_daily_agg.csv',
                                index_col=0)

# Set max lag
maxlag = 5
# Target
target = "blackout_minutes"
predictors_lst = [
    't2m_min',
    't2m_median',
    't2m_max',
    'wind_speed_median',
    'wind_speed_max',
    'tp_median',
    'tp_max',
]

# Unique station ids, sorted so the output rows come out in a reproducible order
station_id_lst = sorted(station_daily_agg['station_id'].unique())

# Stations with fewer than 92 dated records are skipped.
# The count is computed once and reused for both the mask and the index.
groupby_station = station_daily_agg.groupby('station_id')
records_per_station = groupby_station['date'].count()
fewer90_station_ids = records_per_station[records_per_station < 92].index.values  # np array

# Station id -> [blackout lags, then one entry per predictor in predictors_lst order]
station_id_daily_lag_bic = {}

# Iterate through all stations
for s_id in tqdm(station_id_lst):
    if s_id in fewer90_station_ids:
        continue
    # Subset station data in chronological order
    station_df = station_daily_agg[station_daily_agg['station_id'] == s_id].sort_values(by='date')
    # Search lag orders for this station using BIC
    sel_res = ardl_select_order(
        station_df[target],
        exog=station_df[predictors_lst],
        maxlag=maxlag,
        ic='bic',
        maxorder=maxlag,
        causal=True,
        trend='ct',
    )
    # Top entry of the BIC ranking: (target lags, {predictor: lags})
    best_bic_entry = sel_res.bic.values[0]
    # Lags selected for the exogenous (weather) predictors
    endo_res_lst = list(best_bic_entry[1].values())
    # Lags selected for the endogenous variable (blackout minutes) go first
    endo_res_lst.insert(0, best_bic_entry[0])
    station_id_daily_lag_bic[s_id] = endo_res_lst

# Build the table from the DAILY results of this cell. The previous code used
# the hourly objects (station_id_lag_dic_copy / station_id_lag_df), so the
# daily CSV received hourly lags or raised NameError on a fresh kernel.
station_id_daily_lag_bic_df = pd.DataFrame.from_dict(station_id_daily_lag_bic, orient='index')

station_id_optimum_lag_bic_df = station_id_daily_lag_bic_df.reset_index().rename(columns={
    'index': 'station_id',
    0: 'blackout_m_lag',
    1: 't2m_min_lag',
    2: 't2m_median_lag',
    3: 't2m_max_lag',
    4: 'wind_speed_median_lag',
    5: 'wind_speed_max_lag',
    6: 'tp_median_lag',
    7: 'tp_max_lag',
})
station_id_optimum_lag_bic_df.to_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_id_daily_lag5_bic_df.csv')

### 3 variables

In [4]:
# Read in dataframe
station_daily_agg = pd.read_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_weather_daily_agg.csv',
                                index_col=0)

# Set max lag
maxlag = 5
# Target
target = "blackout_minutes"
# Only the daily-maximum summaries are used here; min and median are left out
predictors_lst = [
    't2m_max',
    'wind_speed_max',
    'tp_max',
]

# Unique station ids, sorted so the output rows come out in a reproducible order
station_id_lst = sorted(station_daily_agg['station_id'].unique())

# Stations with fewer than 92 dated records are skipped.
# The count is computed once and reused for both the mask and the index.
groupby_station = station_daily_agg.groupby('station_id')
records_per_station = groupby_station['date'].count()
fewer90_station_ids = records_per_station[records_per_station < 92].index.values  # np array

# Station id -> [blackout lags, t2m_max lags, wind_speed_max lags, tp_max lags]
station_id_daily_lag_bic = {}

# Iterate through all stations
for s_id in tqdm(station_id_lst):
    if s_id in fewer90_station_ids:
        continue
    # Subset station data in chronological order
    station_df = station_daily_agg[station_daily_agg['station_id'] == s_id].sort_values(by='date')
    # Search lag orders for this station using BIC
    sel_res = ardl_select_order(
        station_df[target],
        exog=station_df[predictors_lst],
        maxlag=maxlag,
        ic='bic',
        maxorder=maxlag,
        causal=True,
        trend='ct',
    )
    # Top entry of the BIC ranking: (target lags, {predictor: lags})
    best_bic_entry = sel_res.bic.values[0]
    # Lags selected for the exogenous (weather) predictors
    endo_res_lst = list(best_bic_entry[1].values())
    # Lags selected for the endogenous variable (blackout minutes) go first
    endo_res_lst.insert(0, best_bic_entry[0])
    station_id_daily_lag_bic[s_id] = endo_res_lst

In [7]:
# One row per station, one positional column per entry of the stored lag list
station_id_daily_lag_bic_df = pd.DataFrame.from_dict(station_id_daily_lag_bic, orient='index')

# Positional columns -> readable names (order follows the predictor list of the loop cell)
lag_column_names = {
    'index': 'station_id',
    0: 'blackout_m_lag',
    1: 't2m_max_lag',
    2: 'wind_speed_max_lag',
    3: 'tp_max_lag',
}
station_id_optimum_lag_bic_df = (
    station_id_daily_lag_bic_df
    .reset_index()
    .rename(columns=lag_column_names)
)

daily_lag_csv = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_id_daily_lag5_max_weather_bic_df.csv'
station_id_optimum_lag_bic_df.to_csv(daily_lag_csv)

#### Output

In [17]:
# How many stations have a non-missing lag entry for each variable
saved_lag_csv = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_id_daily_lag5_max_weather_bic_df.csv'
station_id_optimum_lag_bic_df = pd.read_csv(saved_lag_csv, index_col=0)

lag_columns = ["blackout_m_lag", "t2m_max_lag", "wind_speed_max_lag", "tp_max_lag"]
# 1 where a lag entry is present, 0 where it is missing
lag_present = station_id_optimum_lag_bic_df[lag_columns].notna().astype(int)
lag_present.sum()

blackout_m_lag        397
t2m_max_lag            81
wind_speed_max_lag     28
tp_max_lag             98
dtype: int64

In [23]:
# Distribution of how many variables have a non-missing lag entry per station
lag_present_flags = station_id_optimum_lag_bic_df[
    ["blackout_m_lag", "t2m_max_lag", "wind_speed_max_lag", "tp_max_lag"]
].notna().astype(int)
variables_per_station = lag_present_flags.sum(axis=1)
variables_per_station.value_counts()

1    289
2    112
0     38
3     29
4      1
Name: count, dtype: int64

# Granger Causality

## Hourly

In [68]:
# Stations to test
station_id_lst = list(set(df_518.station_id.unique()))
# Every lag from 1 up to this value is tested
maxlag = 72
# Series being explained
target = "pct_blackout"
# Series whose past values are tested as an explanation of the target
predictor = "tp"

# Results keyed by station id
station_id_tp_gc_max72_dic = {}

for s_id in tqdm(station_id_lst):
    # Stations 518 and 563 are deliberately left out of this run
    # NOTE(review): the reason is not recorded in the notebook; confirm and document it
    if s_id in (518, 563):
        continue

    # Subset station data in chronological order
    station_df = df_518[df_518['station_id'] == s_id].sort_values(by='datetime')
    test_result = grangercausalitytests(
        station_df[[target, predictor]], maxlag=maxlag, addconst=True, verbose=False)

    # test_result is keyed by lag (1..maxlag); keep the rounded p-value of each test
    F_test_p_values = [
        round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in range(1, maxlag + 1)
    ]
    Chi_squared_p_values = [
        round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, maxlag + 1)
    ]
    # Smallest p-value over both tests and all lags
    p_values_min = np.min(F_test_p_values + Chi_squared_p_values)

    # Value: [minimum p-value, F-test p-values per lag, Chi-square p-values per lag]
    station_id_tp_gc_max72_dic[s_id] = [p_values_min, F_test_p_values, Chi_squared_p_values]

100%|██████████| 516/516 [2:03:30<00:00, 14.36s/it]  


In [86]:
# Tabulate the tp Granger results: one row per station with the minimum
# p-value and the per-lag p-values of both tests
tp_gc_column_names = {
    'index': 'station_id',
    0: 'tp_p-value_min',
    1: 'tp_f_p-value',
    2: 'tp_Chi_p-value',
}
station_id_tp_gc_max72_df = (
    pd.DataFrame.from_dict(station_id_tp_gc_max72_dic, orient='index')
    .reset_index()
    .rename(columns=tp_gc_column_names)
)

# Join the per-predictor tables on station id
# NOTE(review): the t2m and wind-speed frames are not built in the visible part
# of this notebook; they must already exist in the kernel for this cell to run
dfs = [station_id_t2m_gc_max72_df, station_id_windspeed_gc_max72_df, station_id_tp_gc_max72_df]
df_joined = dfs[0]
for right_df in dfs[1:]:
    df_joined = pd.merge(df_joined, right_df, on='station_id')
# df_joined.to_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_514_gc_max72_pvalue_df.csv')

Summary of the Granger causality results

In [106]:
# Keep the per-station minimum p-value of each weather predictor
df_pvalues_max72 = df_joined[['station_id', 't2m_p-value_min', 'windspeed_p-value_min', 'tp_p-value_min']]
# True where the minimum p-value is below 0.05.
# The assignment target was a garbled name, which left
# `df_pvalues_max72_binary` undefined for the lines below.
df_pvalues_max72_binary = df_pvalues_max72.set_index('station_id') < 0.05
# Count significant / non-significant stations per predictor
summary = pd.DataFrame({
    'True': df_pvalues_max72_binary.sum(),
    'False': (~df_pvalues_max72_binary).sum(),
})
summary
# df_pvalues_max72.to_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_514_gc_max72_pvalue_reduced.csv')

## Daily

In [181]:
# Read in dataframe
station_daily_agg = pd.read_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_518_weather_daily_agg.csv',
                                index_col=0)

# Output folder path
out_folder = (
    Path.home()
    / "Library/CloudStorage/"
    / "OneDrive-GeorgiaInstituteofTechnology/"
    / "Research/Energy_resilience/01_data/"
    / "2024_9_10_Tingyu/processed/daily_maxlag30_dfs/"
)
# Create the folder up front so the per-predictor CSV writes cannot fail on a missing directory
out_folder.mkdir(parents=True, exist_ok=True)

# Stations with fewer than 92 dated records are skipped.
# The count is computed once and reused for both the mask and the index.
groupby_station = station_daily_agg.groupby('station_id')
records_per_station = groupby_station['date'].count()
fewer90_station_ids = records_per_station[records_per_station < 92].index.values

# Create a list of unique station ids
station_id_lst = sorted(station_daily_agg['station_id'].unique())

# Set max lag
maxlag = 30
# Target
target = "blackout_minutes"
# Predictors: every weather variable combined with every daily aggregation
weather_variables = ['t2m', 'wind_speed', 'tp']
aggregations = ['min', 'median', 'mean', 'max']
predictor_lst = [w + "_" + a for w in weather_variables for a in aggregations]

for predictor in predictor_lst:
    # Station id -> [min p-value, F-test p-values, Chi-square p-values]
    station_dic = {}
    # Station id -> error text, so failures are reported instead of silently becoming NaN
    failed_stations = {}

    for s_id in tqdm(station_id_lst, desc=predictor):
        if s_id in fewer90_station_ids:
            continue
        # Subset station data in chronological order
        station_df = station_daily_agg[station_daily_agg['station_id'] == s_id].sort_values(by='date')
        try:
            test_result = grangercausalitytests(
                station_df[[target, predictor]], maxlag=maxlag, addconst=True, verbose=False)
            F_test_p_values = [
                round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in range(1, maxlag + 1)
            ]
            Chi_squared_p_values = [
                round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, maxlag + 1)
            ]
            p_values_min = np.min(F_test_p_values + Chi_squared_p_values)
            station_dic[s_id] = [p_values_min, F_test_p_values, Chi_squared_p_values]
        except Exception as err:
            # Best effort: keep the station as NaN, but remember why it failed.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
            station_dic[s_id] = [np.nan, np.nan, np.nan]
            failed_stations[s_id] = repr(err)

    if failed_stations:
        first_error = next(iter(failed_stations.values()))
        print(f'{predictor}: test failed for {len(failed_stations)} station(s); '
              f'first error: {first_error}')

    station_gc_max30_df = (
        pd.DataFrame.from_dict(station_dic, orient='index')
        .reset_index()
        .rename(columns={
            'index': 'station_id',
            0: f'{predictor}_p-value_min',
            1: f'{predictor}_f_p-value',
            2: f'{predictor}_Chi_p-value',
        })
    )
    station_gc_max30_df.to_csv(out_folder / f'station_gc_max30_{predictor}.csv')


 16%|█▋        | 84/516 [00:19<01:34,  4.59it/s]

In [34]:
out_folder = (
    Path.home()
    / "Library/CloudStorage/"
    / "OneDrive-GeorgiaInstituteofTechnology/"
    / "Research/Energy_resilience/01_data/"
    / "2024_9_10_Tingyu/processed/daily_maxlag30_dfs/"
)

weather_variables = ['t2m', 'wind_speed', 'tp']
aggregations = ['min', 'median', 'mean', 'max']
pmin_col_names = [w + "_" + a + "_p-value_min"
                  for w in weather_variables
                  for a in aggregations]

# Read only the per-predictor result files. The summary CSV written at the end
# of this cell is stored in the same folder and has no station_id column, so
# matching every "*.csv" made the merge fail when the cell was run a second time.
# Sorting keeps the merge order independent of the directory listing order.
result_files = sorted(out_folder.glob('station_gc_max30_*.csv'))
if not result_files:
    raise FileNotFoundError(f'no station_gc_max30_*.csv files found in {out_folder}')

station_daily_gc_max30_df_lst = [
    pd.read_csv(result_file, index_col=0) for result_file in result_files
]
station_daily_gc_max30_df_joined = ft.reduce(
    lambda left, right: pd.merge(left, right, on='station_id'),
    station_daily_gc_max30_df_lst)

# Keep the minimum p-value columns and flag those below 0.05
station_daily_gc_max30_df_pmin = station_daily_gc_max30_df_joined[['station_id'] + pmin_col_names]
station_daily_gc_max30_df_pmin_binary = station_daily_gc_max30_df_pmin.set_index('station_id') < 0.05
# NOTE: a missing (NaN) p-value compares as not below 0.05, so it is counted under 'False'
summary = pd.DataFrame({
    'True': station_daily_gc_max30_df_pmin_binary.sum(),
    'False': (~station_daily_gc_max30_df_pmin_binary).sum(),
})
summary.to_csv(out_folder / 'station_daily_gc_max30_p_value_summary.csv')

## Other

In [24]:
# https://www.machinelearningplus.com/time-series/granger-causality-test-in-python/
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, max_lag=None):
    """Build a matrix of minimum Granger-causality p-values for all variable pairs.

    Rows are the response variable (suffix ``_y``), columns are the predictor
    (suffix ``_x``). Each cell holds the smallest p-value, rounded to four
    decimals, over lags ``1..max_lag`` for the chosen test. A p-value below the
    significance level (e.g. 0.05) means the null hypothesis "X does not
    Granger-cause Y" can be rejected.

    Parameters
    ----------
    data : pandas.DataFrame
        Contains the time series named in ``variables``.
    variables : list of str
        Names of the time-series columns to test against each other.
    test : str, default 'ssr_chi2test'
        Key of the test to read from the ``grangercausalitytests`` result.
    verbose : bool, default False
        If True, print the per-lag p-values of every pair.
    max_lag : int, optional
        Largest lag to test. When omitted, the notebook-level ``maxlag``
        variable is used, which keeps calls written before this parameter
        existed working unchanged.

    Returns
    -------
    pandas.DataFrame
        Square frame of minimum p-values, shape ``(len(variables), len(variables))``.
    """
    # Explicit argument wins; otherwise fall back to the global the original relied on
    n_lags = maxlag if max_lag is None else max_lag

    p_matrix = pd.DataFrame(np.zeros((len(variables), len(variables))),
                            columns=variables, index=variables)
    for c in p_matrix.columns:
        for r in p_matrix.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=n_lags, verbose=False)
            # test_result is keyed by lag (1..n_lags)
            p_values = [round(test_result[lag][0][test][1], 4) for lag in range(1, n_lags + 1)]
            if verbose:
                print(f'Y = {r}, X = {c}, P Values = {p_values}')
            p_matrix.loc[r, c] = np.min(p_values)
    p_matrix.columns = [var + '_x' for var in variables]
    p_matrix.index = [var + '_y' for var in variables]
    return p_matrix

In [25]:
# Pairwise Granger p-value matrix for the station currently held in `station_df`
# NOTE(review): relies on `station_df` (with hourly columns) and `maxlag` left in the kernel by earlier cells
granger_variables = ['pct_blackout', 't2m', 'wind_speed']
grangers_causation_matrix(station_df, granger_variables)

Unnamed: 0,pct_blackout_x,t2m_x,wind_speed_x
pct_blackout_y,1.0,0.0271,0.2261
t2m_y,0.0472,1.0,0.0
wind_speed_y,0.0023,0.0,1.0
