In [None]:
# -----********************-----

# Created Time: 2025/07/13

# Last updated: 2025/07/13

# Author: Yiyi He

### Use Case

# This notebook explores the application of autoregressive models:
# 1. Granger causality tests between hourly weather variables and blackout percentage, run per station

# -----********************-----

# Libraries

In [1]:
# Import libraries
import os
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

# Stats
from statsmodels.tsa.api import ARDL
import statsmodels.api as sm
import numpy as np
from statsmodels.tsa.ardl import ardl_select_order
from statsmodels.tsa.stattools import grangercausalitytests

# Geo
from shapely.geometry import Point, Polygon
# import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.options.display.max_rows = 1000

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Processing
from tqdm import tqdm
import functools as ft

# Granger Causality

## GC with hourly data

In [5]:
# Root folder of the project data on the local machine.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'

# Location of the processed hourly station/weather table
hourly_csv = home_dir + "01_data/processed/csv/hourly_519station_3weather.csv"

# Load the table, using the first CSV column as the index
hourly_df = pd.read_csv(hourly_csv, index_col=0)

Define a function that runs the Granger causality test on hourly station data. \
For each station, the test checks whether a predictor variable Granger-causes the target variable, with the following setup:\
 \
**Target variable**: \
'pct_blackout' \
**Predictor variables**: 
1. 't2m' (temperature) 
2. 'tp' (precipitation) 
3. 'wind_speed' (wind speed)

In [82]:
def run_gc_hourly(maxlag, target, predictor, station_lst, hourly_df):
    """Run a per-station Granger causality test of `predictor` on `target`.

    For every station id in `station_lst`, the rows of `hourly_df` belonging to
    that station are sorted by 'datetime' and the two columns
    `[target, predictor]` are passed to statsmodels' `grangercausalitytests`
    (which tests whether the second column Granger-causes the first) for all
    lags from 1 to `maxlag`.

    Parameters
    ----------
    maxlag : int
        Highest lag to test; lags 1..maxlag are all evaluated.
    target : str
        Name of the column holding the dependent series.
    predictor : str
        Name of the column holding the candidate causal series.
    station_lst : iterable
        Station ids to process. Duplicate ids are only reported once.
    hourly_df : pandas.DataFrame
        Table containing at least 'station_id', 'datetime', `target` and
        `predictor` columns.

    Returns
    -------
    pandas.DataFrame
        One row per successfully tested station with the columns
        'station_id', '<predictor>_p-value_min' (smallest rounded p-value over
        both tests and all lags), '<predictor>_f_p-value' (list of rounded
        ssr_ftest p-values per lag) and '<predictor>_Chi_p-value' (list of
        rounded ssr_chi2test p-values per lag). These four columns are always
        present, even when no station could be tested.

    Notes
    -----
    Stations that cannot be tested are skipped and a message is printed. A
    ValueError raised by the test itself is reported with its own message so
    that causes other than a short series (e.g. non-finite values) are not
    mislabelled as "insufficient observations".
    """
    out_columns = [
        'station_id',
        f'{predictor}_p-value_min',
        f'{predictor}_f_p-value',
        f'{predictor}_Chi_p-value',
    ]
    # Cheap pre-filter: maxlag + num_variables (target, predictor) + 1.
    # The test itself may require more rows; that case is caught below.
    min_obs = maxlag + 2 + 1
    lags = range(1, maxlag + 1)

    # Key: station id, Value: [min p-value, F-test p-values, Chi-square p-values]
    gc_dic = {}

    for s_id in tqdm(station_lst):
        # Time-ordered observations of this single station
        station_df = hourly_df[hourly_df['station_id'] == s_id].sort_values(by='datetime')

        if len(station_df) <= min_obs:
            print(f"Skipping Station {s_id} due to insufficient observations for maxlag={maxlag}")
            continue

        # Only the statsmodels call is guarded, so unrelated errors are not swallowed
        try:
            test_result = grangercausalitytests(
                station_df[[target, predictor]], maxlag=maxlag, addconst=True, verbose=False)
        except ValueError as err:
            # Surface the real reason instead of assuming a short series
            print(f"Skipping Station {s_id} for maxlag={maxlag}: {err}")
            continue

        # test_result is keyed by lag; element [0] holds the dict of test statistics,
        # and index 1 of each statistic tuple is the p-value
        F_test_p_values = [round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in lags]
        Chi_squared_p_values = [round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in lags]
        p_values_min = np.min(F_test_p_values + Chi_squared_p_values)
        gc_dic[s_id] = [p_values_min, F_test_p_values, Chi_squared_p_values]

    # Explicit columns keep the schema stable even if every station was skipped
    records = [[s_id, *values] for s_id, values in gc_dic.items()]
    gc_df = pd.DataFrame(records, columns=out_columns)
    return gc_df

In [83]:
# --- Configuration -----------------------------------------------------------
# Project root on the local machine
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
home_dir = '/Users/yiyi/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology(2)/Research/Energy_resilience/'
# Dependent series tested in every Granger causality run
target = 'pct_blackout'
# Highest lag evaluated for every variable
maxlag = 120
# Candidate causal series, each tested separately against the target
predictor_lst = ['t2m', 'tp', 'wind_speed']

# --- Input -------------------------------------------------------------------
# Hourly station table; the first CSV column becomes the index
hourly_df = pd.read_csv(
    home_dir + "01_data/processed/csv/hourly_519station_3weather.csv",
    index_col=0,
)
# Distinct station ids present in the table
station_lst = list(set(hourly_df['station_id'].unique()))

# --- Tests -------------------------------------------------------------------
# predictor name -> dataframe of per-station test values
res_dic = {}
for predictor in predictor_lst:
    # One full pass over all stations for this predictor
    df_res = run_gc_hourly(maxlag, target, predictor, station_lst, hourly_df)
    res_dic[predictor] = df_res

# --- Combine and save --------------------------------------------------------
# Per-predictor frames in the same order as predictor_lst
dfs = [res_dic[p] for p in predictor_lst]
# Successively merge the frames on the station id (default inner join)
df_joined = dfs[0]
for right in dfs[1:]:
    df_joined = pd.merge(df_joined, right, on='station_id')
# Write the combined p-values to a new csv file
df_joined.to_csv(home_dir + f"01_data/processed/csv/granger_hourly_max{maxlag}_pvalue.csv")

 10%|█         | 54/519 [30:14<4:09:05, 32.14s/it]

Skipping Station 58 due to insufficient observations for maxlag=120


 18%|█▊        | 94/519 [48:09<2:24:04, 20.34s/it]

Skipping Station 103 due to insufficient observations for maxlag=120


 23%|██▎       | 119/519 [59:00<3:34:41, 32.20s/it]

Skipping Station 130 due to insufficient observations for maxlag=120


 24%|██▎       | 123/519 [1:00:10<2:32:05, 23.04s/it]

Skipping Station 134 due to insufficient observations for maxlag=120


 68%|██████▊   | 354/519 [2:57:32<39:47, 14.47s/it]  

Skipping Station 386 due to insufficient observations for maxlag=120


 90%|████████▉ | 466/519 [3:55:57<12:56, 14.64s/it]  

Skipping Station 518 due to insufficient observations for maxlag=120


 98%|█████████▊| 509/519 [4:37:21<03:18, 19.81s/it]   

Skipping Station 563 due to insufficient observations for maxlag=120


100%|██████████| 519/519 [4:40:29<00:00, 32.43s/it]
 10%|█         | 54/519 [27:08<3:25:05, 26.46s/it]

Skipping Station 58 due to insufficient observations for maxlag=120


 18%|█▊        | 94/519 [45:17<2:45:21, 23.35s/it]

Skipping Station 103 due to insufficient observations for maxlag=120


 23%|██▎       | 119/519 [56:08<3:30:32, 31.58s/it]

Skipping Station 130 due to insufficient observations for maxlag=120


 24%|██▎       | 123/519 [57:07<2:15:26, 20.52s/it]

Skipping Station 134 due to insufficient observations for maxlag=120


 68%|██████▊   | 354/519 [2:52:20<43:25, 15.79s/it]  

Skipping Station 386 due to insufficient observations for maxlag=120


 90%|████████▉ | 466/519 [3:44:38<10:34, 11.98s/it]  

Skipping Station 518 due to insufficient observations for maxlag=120


 98%|█████████▊| 509/519 [4:22:50<03:14, 19.45s/it]   

Skipping Station 563 due to insufficient observations for maxlag=120


100%|██████████| 519/519 [4:25:41<00:00, 30.71s/it]
 10%|█         | 54/519 [27:42<3:04:59, 23.87s/it]

Skipping Station 58 due to insufficient observations for maxlag=120


 18%|█▊        | 94/519 [46:13<2:36:52, 22.15s/it]

Skipping Station 103 due to insufficient observations for maxlag=120


 23%|██▎       | 119/519 [55:56<2:44:53, 24.73s/it]

Skipping Station 130 due to insufficient observations for maxlag=120


 24%|██▎       | 123/519 [56:43<1:48:52, 16.50s/it]

Skipping Station 134 due to insufficient observations for maxlag=120


 68%|██████▊   | 354/519 [2:47:28<39:23, 14.33s/it]  

Skipping Station 386 due to insufficient observations for maxlag=120


 90%|████████▉ | 466/519 [3:39:14<08:09,  9.23s/it]  

Skipping Station 518 due to insufficient observations for maxlag=120


 98%|█████████▊| 509/519 [4:17:31<03:34, 21.41s/it]   

Skipping Station 563 due to insufficient observations for maxlag=120


100%|██████████| 519/519 [4:20:58<00:00, 30.17s/it]
