In [None]:
# -----********************-----

# Created Time: 2025/07/13

# Last updated: 2025/07/13

# Author: Yiyi He

### Use Case

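# This notebook explores the application of autoregressive models
# 1. Granger causality tests on hourly station data: do the weather predictors (t2m, tp, wind_speed) help predict pct_blackout at each station?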

# -----********************-----

# Libraries

In [1]:
# Import libraries
import os
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

# Stats
from statsmodels.tsa.api import ARDL
import statsmodels.api as sm
import numpy as np
from statsmodels.tsa.ardl import ardl_select_order
from statsmodels.tsa.stattools import grangercausalitytests

# Geo
from shapely.geometry import Point, Polygon
# import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 1000

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Processing
from tqdm import tqdm
import functools as ft

# Granger Causality

## GC with hourly data

In [5]:
# Root folder of the Energy_resilience project on the local machine.
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'

# Processed hourly table (presumably 519 stations x 3 weather variables, going
# by the file name -- TODO confirm). The first CSV column is used as the index.
hourly_csv_path = home_dir + "01_data/processed/csv/hourly_519station_3weather.csv"
hourly_df = pd.read_csv(hourly_csv_path, index_col=0)

In [57]:
def run_gc_hourly(maxlag, target, predictor, station_lst, hourly_df,
                  skip_stations=(518, 563)):
    """Run Granger causality tests of ``predictor`` on ``target`` for each station.

    For every station id in ``station_lst`` that is not listed in
    ``skip_stations``, the station's rows are selected from ``hourly_df``,
    sorted by the ``'datetime'`` column, and the ``[target, predictor]``
    columns are passed to ``grangercausalitytests`` for lags ``1..maxlag``.

    Parameters
    ----------
    maxlag : int
        Highest lag to test; p-values are collected for every lag 1..maxlag.
    target : str
        Name of the response column (first column handed to the test).
    predictor : str
        Name of the candidate-cause column (second column handed to the test).
        Also used as the prefix of the result column names.
    station_lst : iterable
        Station ids to process.
    hourly_df : pandas.DataFrame
        Must contain ``'station_id'``, ``'datetime'``, ``target`` and
        ``predictor`` columns.
    skip_stations : collection, optional
        Station ids to leave out. Defaults to ``(518, 563)``, the stations
        this notebook excludes for having too few records to work with.

    Returns
    -------
    pandas.DataFrame
        One row per tested station with the columns ``station_id``,
        ``{predictor}_p-value_min`` (smallest p-value over both tests and all
        lags), ``{predictor}_f_p-value`` (list of SSR F-test p-values, one per
        lag) and ``{predictor}_Chi_p-value`` (list of SSR chi-square test
        p-values, one per lag). All p-values are rounded to 4 decimals. The
        four columns are present even when no station was tested.
    """
    skipped = tuple(skip_stations) if skip_stations is not None else ()
    columns = [
        'station_id',
        f'{predictor}_p-value_min',
        f'{predictor}_f_p-value',
        f'{predictor}_Chi_p-value',
    ]
    lags = range(1, maxlag + 1)

    # Keyed by station id so a repeated id in station_lst yields a single row.
    gc_dic = {}
    for s_id in tqdm(station_lst):
        if s_id in skipped:
            continue

        # Subset station data in chronological order
        station_df = hourly_df[hourly_df['station_id'] == s_id].sort_values(by='datetime')
        test_result = grangercausalitytests(
            station_df[[target, predictor]], maxlag=maxlag, addconst=True, verbose=False)

        # test_result[lag][0] maps test name -> tuple whose element 1 is the p-value
        f_test_p_values = [round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in lags]
        chi_squared_p_values = [round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in lags]
        p_values_min = np.min(f_test_p_values + chi_squared_p_values)

        gc_dic[s_id] = (p_values_min, f_test_p_values, chi_squared_p_values)

    # Explicit column names keep the schema stable even for an empty result.
    records = [(s_id, *values) for s_id, values in gc_dic.items()]
    return pd.DataFrame(records, columns=columns)

In [None]:
# Root folder of the Energy_resilience project on the local machine.
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'

# Hourly input table; the first CSV column is used as the index.
hourly_df = pd.read_csv(home_dir + "01_data/processed/csv/hourly_519station_3weather.csv",
                        index_col=0)

# Test settings: response column, highest lag, stations and candidate predictors.
target = 'pct_blackout'
maxlag = 72
station_lst = list(set(hourly_df.station_id.unique()))
predictor_lst = ['t2m', 'tp', 'wind_speed']

# One result frame per predictor, keyed by predictor name.
res_dic = {}
for predictor in predictor_lst:
    df_res = run_gc_hourly(maxlag, target, predictor, station_lst, hourly_df)
    res_dic[predictor] = df_res

# Join the per-predictor frames side by side on station_id.
dfs = [res_dic[p] for p in predictor_lst]
df_joined = ft.reduce(lambda left, right: pd.merge(left, right, on='station_id'), dfs)

# The file name must be an f-string: without the prefix the file was written
# with a literal "{maxlag}" in its name instead of the lag value.
df_joined.to_csv(home_dir + f"01_data/processed/csv/granger_max{maxlag}_pvalue.csv")

  2%|▏         | 9/519 [01:10<53:16,  6.27s/it]  

In [None]:
# Run settings: response column, candidate predictor and highest lag tested
target = "pct_blackout"
predictor = "tp" # t2m, wind_speed
maxlag = 72

# Unique station ids present in the hourly table
station_id_lst = list(set(hourly_df.station_id.unique()))

# Results keyed by station id. Each value is a list of:
#   1. minimum p-value over both tests and all lags
#   2. F-test p-values for all lags
#   3. Chi-square test p-values for all lags
tp_gc_max72_dic = {}

for s_id in tqdm(station_id_lst):
    # Stations 518 and 563 have too few records to work with
    if s_id in (518, 563):
        continue

    # Station rows in chronological order
    station_df = hourly_df[hourly_df['station_id'] == s_id].sort_values(by='datetime')
    test_result = grangercausalitytests(
        station_df[[target, predictor]], maxlag=maxlag, addconst=True, verbose=False)

    # test_result is keyed by lag (1..maxlag); element 1 of each test tuple is the p-value
    F_test_p_values = [
        round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in range(1, maxlag + 1)
    ]
    Chi_squared_p_values = [
        round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, maxlag + 1)
    ]
    p_values_min = np.min([*F_test_p_values, *Chi_squared_p_values])

    tp_gc_max72_dic[s_id] = [p_values_min, F_test_p_values, Chi_squared_p_values]

100%|██████████| 519/519 [1:31:05<00:00, 10.53s/it]


In [10]:
# Unique station ids present in the hourly table
station_id_lst = list(set(hourly_df.station_id.unique()))

# Highest lag tested, response column and candidate predictor for this run
maxlag = 72
target = "pct_blackout"
predictor = "t2m" # t2m, wind_speed

# station id -> [minimum F/Chi p-value, F-test p-values per lag, Chi-square p-values per lag]
t2m_gc_max72_dic = {}

for s_id in tqdm(station_id_lst):
    # Stations 518 and 563 have too few records to work with
    too_few_records = s_id == 518 or s_id == 563
    if not too_few_records:
        # Station rows in chronological order
        station_df = hourly_df[hourly_df['station_id'] == s_id].sort_values(by='datetime')
        test_result = grangercausalitytests(
            station_df[[target, predictor]], maxlag=maxlag, addconst=True, verbose=False)

        # Walk the lags once, collecting the p-value (tuple element 1) of both tests
        F_test_p_values = []
        Chi_squared_p_values = []
        for lag in range(1, maxlag + 1):
            lag_tests = test_result[lag][0]
            F_test_p_values.append(round(lag_tests['ssr_ftest'][1], 4))
            Chi_squared_p_values.append(round(lag_tests['ssr_chi2test'][1], 4))

        p_values_min = np.min(F_test_p_values + Chi_squared_p_values)
        t2m_gc_max72_dic[s_id] = [p_values_min, F_test_p_values, Chi_squared_p_values]

100%|██████████| 519/519 [1:31:24<00:00, 10.57s/it]


In [11]:
# Response column and candidate predictor for this run
target = "pct_blackout"
predictor = "wind_speed" # t2m, wind_speed
# Highest lag tested
maxlag = 72
# Unique station ids present in the hourly table
station_id_lst = list(set(hourly_df.station_id.unique()))

# Per-station results:
#   key   -> station id
#   value -> [minimum F/Chi p-value, F-test p-values for all lags,
#             Chi-square test p-values for all lags]
wind_speed_gc_max72_dic = {}

for s_id in tqdm(station_id_lst):
    if s_id == 518 or s_id == 563:
        # stations with too few records to work with
        continue

    # Rows of this station only, in chronological order
    station_mask = hourly_df['station_id'] == s_id
    station_df = hourly_df.loc[station_mask].sort_values(by='datetime')

    test_result = grangercausalitytests(
        station_df[[target, predictor]],
        maxlag=maxlag,
        addconst=True,
        verbose=False,
    )

    # Element 1 of each test tuple is the p-value; results are keyed by lag 1..maxlag
    F_test_p_values = [
        round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in range(1, maxlag + 1)
    ]
    Chi_squared_p_values = [
        round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, maxlag + 1)
    ]
    p_values_min = np.min(F_test_p_values + Chi_squared_p_values)

    wind_speed_gc_max72_dic[s_id] = [p_values_min, F_test_p_values, Chi_squared_p_values]

100%|██████████| 519/519 [1:33:30<00:00, 10.81s/it]


In [8]:
# Turn the per-station tp results into a dataframe: one row per station holding
# the minimum p-value and the per-lag p-value lists of the two tests.
tp_column_names = {
    'index': 'station_id',
    0: 'tp_p-value_min',
    1: 'tp_f_p-value',
    2: 'tp_Chi_p-value',
}
tp_gc_max72_df = (
    pd.DataFrame.from_dict(tp_gc_max72_dic, orient='index')
    .reset_index()
    .rename(columns=tp_column_names)
)
# Join dataframes
# dfs = [station_id_t2m_gc_max72_df, station_id_windspeed_gc_max72_df, station_id_tp_gc_max72_df]
# df_joined= ft.reduce(lambda left, right: pd.merge(left, right, on='station_id'), dfs)
# df_joined.to_csv('/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/Research/Energy_resilience/01_data/2024_9_10_Tingyu/processed/station_514_gc_max72_pvalue_df.csv')