In [1]:
from statsmodels.tsa.ardl import ardl_select_order
from statsmodels.tsa.api import ARDL
from statsmodels.tsa.vector_ar.var_model import VAR
import statsmodels.api as sm

from shapely.geometry import Point, Polygon
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import geopandas as gpd

import matplotlib.pyplot as plt
import seaborn as sns
# Raise pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever darkgrid variant is installed.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither name is listed: fall back to the original call so any failure
    # surfaces as Matplotlib's own error instead of being silently skipped.
    plt.style.use('seaborn-darkgrid')


In [2]:
# Directory holding the final datasets, relative to this notebook.
input_dir = '../../data/final'

# Read the modelling table into memory.
df = pd.read_csv(
    os.path.join(input_dir, 'df_for_model.csv')
)

In [3]:
# Print a structural summary of the loaded frame: index range, column names,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7044379 entries, 0 to 7044378
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   datetime           object 
 1   pct_blackout       float64
 2   wind_forest_cover  float64
 3   t2m                float64
 4   tp                 float64
 5   wind_speed         float64
 6   station_id         int64  
 7   climate_zone_code  object 
dtypes: float64(5), int64(1), object(2)
memory usage: 430.0+ MB


In [15]:

def testMaxLagAndTime_VAR(df, maxlag):
    '''
    This function is used to test max lag selection for VAR model and record time.
    
    Args:
    df: pd.DataFrame, the dataframe with a cerntain station_id
    maxlag: int, the maximum lag to be tested
    
    Params:
    See var_select_order's parameters
    
    Returns:
    test_model: VAR model, the model with the best maxlag
    
    '''

    print('Begin testing...')
    print(f'testing maxlag: {maxlag}')
    time_start = pd.Timestamp.now()

    # select order with VAR
    order_selected = VAR(df[['pct_blackout', 't2m', 'wind_speed', 'tp']]).select_order(
        maxlags=maxlag)
    
    # fit VAR model

    time_selected = pd.Timestamp.now()
    print(f'time elapsed for selecting order: {time_selected-time_start}')
    
    test_model = VAR(df[['pct_blackout', 't2m', 'wind_speed', 'tp']]).fit(
        maxlags=maxlag, method='ols', ic='aic', verbose=True, trend='ctt')
    
    # print results
    print(f'order selected: {order_selected}')
    # print aic
    print(f'AIC: {test_model.aic}')
    
    
    time_fit = pd.Timestamp.now()
    print(f'time elapsed for fitting model: {time_fit-time_selected}')

    time_end = pd.Timestamp.now()
    print(f'time elapsed for the whole process: {time_end-time_start}')
    print('End testing...')
    print('model summary:')
    print(test_model.summary())

    return test_model

In [5]:

# Keep only the rows recorded for station 377 (original row labels are retained).
df_377 = df.loc[df['station_id'].eq(377)]


In [16]:
# Run lag selection and fitting for station 377, searching lags up to 10.
model_377 = testMaxLagAndTime_VAR(df_377, maxlag=10)

Begin testing...
testing maxlag: 10


  self._init_dates(dates, freq)


time elapsed for selecting order: 0 days 00:00:00.224898


  self._init_dates(dates, freq)


<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 5, BIC -> 0, FPE -> 5, HQIC ->  2>
Using 5 based on aic criterion
order selected: <statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 5, BIC -> 0, FPE -> 5, HQIC ->  2>
AIC: -11.542988699140455
time elapsed for fitting model: 0 days 00:00:00.262463
time elapsed for the whole process: 0 days 00:00:00.487455
End testing...
model summary:
  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 16, Dec, 2024
Time:                     15:12:57
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                   -11.4307
Nobs:                     5405.00    HQIC:                  -11.5038
Log likelihood:           609.476    FPE:                9.70384e-06
AIC:                     -11.5430    Det(Omega_mle):     9.54041e-06
----------------