In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime

In [5]:
# Load the first imputed wind dataset, parse DATE into datetimes and use it
# as the index, all in one chain so the cell has no intermediate state.
df1 = (
    pd.read_csv("../input/wind_dataset_imputed1.csv")
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
    .set_index('DATE')
)
df1.head()

Unnamed: 0_level_0,WIND,IND,RAIN,IND.1,T.MAX,IND.2,T.MIN,T.MIN.G
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1961-01-01,13.67,0,0.2,0.0,9.5,0.0,3.7,-1.0
1961-01-02,11.5,0,5.1,0.0,7.2,0.0,4.2,1.1
1961-01-03,11.25,0,0.4,0.0,5.5,0.0,0.5,-0.5
1961-01-04,8.63,0,0.2,0.0,5.6,0.0,0.4,-3.2
1961-01-05,11.92,0,10.4,0.0,7.2,1.0,-1.5,-7.5


In [6]:
# Load the second imputed wind dataset, parse DATE into datetimes and use it
# as the index, all in one chain so the cell has no intermediate state.
df2 = (
    pd.read_csv("../input/wind_dataset_imputed2.csv")
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
    .set_index('DATE')
)
df2.head()

Unnamed: 0_level_0,WIND,IND,RAIN,IND.1,T.MAX,IND.2,T.MIN,T.MIN.G
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1961-01-01,13.67,0,0.2,0.0,9.5,0.0,3.7,-1.0
1961-01-02,11.5,0,5.1,0.0,7.2,0.0,4.2,1.1
1961-01-03,11.25,0,0.4,0.0,5.5,0.0,0.5,-0.5
1961-01-04,8.63,0,0.2,0.0,5.6,0.0,0.4,-3.2
1961-01-05,11.92,0,10.4,0.0,7.2,1.0,-1.5,-7.5


In [15]:
from statsmodels.tsa.stattools import adfuller

def adfuller_test(series, sig=0.05, name=''):
    """Run an Augmented Dickey-Fuller test on ``series`` and print the verdict.

    Parameters
    ----------
    series : array-like
        The observations passed straight to ``adfuller``.
    sig : float, default 0.05
        Significance level; a p-value at or below it is reported as stationary.
    name : str, default ''
        Label printed in front of the result.

    Returns
    -------
    None
        The result is only printed.
    """
    res = adfuller(series, autolag='AIC')
    raw_p_value = res[1]
    # Round for display only. The decision below uses the unrounded value:
    # comparing the rounded one would turn e.g. p = 0.0504 into 0.05 and
    # wrongly report the series as stationary at sig = 0.05.
    p_value = round(raw_p_value, 3)

    if raw_p_value <= sig:
        print(f" {name} : P-Value = {p_value} => Stationary. ")
    else:
        print(f" {name} : P-Value = {p_value} => Non-stationary.")

### df1 stationarity

In [16]:
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column label, Series) pairs, so the label can be passed on directly.
for name, column in df1.items():
    adfuller_test(column, name=name)

 WIND : P-Value = 0.0 => Stationary. 
 IND : P-Value = 0.0 => Stationary. 
 RAIN : P-Value = 0.0 => Stationary. 
 IND.1 : P-Value = 0.0 => Stationary. 
 T.MAX : P-Value = 0.0 => Stationary. 
 IND.2 : P-Value = 0.0 => Stationary. 
 T.MIN : P-Value = 0.0 => Stationary. 
 T.MIN.G : P-Value = 0.0 => Stationary. 


### df2 stationarity

In [17]:
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column label, Series) pairs, so the label can be passed on directly.
for name, column in df2.items():
    adfuller_test(column, name=name)

 WIND : P-Value = 0.0 => Stationary. 
 IND : P-Value = 0.0 => Stationary. 
 RAIN : P-Value = 0.0 => Stationary. 
 IND.1 : P-Value = 0.0 => Stationary. 
 T.MAX : P-Value = 0.0 => Stationary. 
 IND.2 : P-Value = 0.0 => Stationary. 
 T.MIN : P-Value = 0.0 => Stationary. 
 T.MIN.G : P-Value = 0.0 => Stationary. 


### df1 causality

In [11]:
from statsmodels.tsa.stattools import grangercausalitytests

# Pairwise Granger-causality screen for df1.
# Each cell holds the smallest 'ssr_chi2test' p-value over lags 1..max_lag for
# the pair df1[[row, col]]; per the statsmodels docs the test asks whether the
# second column (col, labelled _x) helps predict the first (row, labelled _y).
# NOTE: taking the minimum over many lags is optimistic (multiple testing), so
# treat small values as a screening hint rather than a formal result.
max_lag = 20  # single source of truth for both the test and the lag loop
variables = df1.columns
# Start from 1.0 everywhere: the diagonal (a series against itself) is not
# tested, because the extra lags are identical to the ones already in the
# model and the p-value is 1.0 by construction.
matrix = pd.DataFrame(np.ones((len(variables), len(variables))), columns=variables, index=variables)
for col in variables:
    for row in variables:
        if row == col:
            continue
        test_result = grangercausalitytests(df1[[row, col]], maxlag=max_lag, verbose=False)
        # test_result is keyed by lag number, starting at 1.
        p_values = [round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, max_lag + 1)]
        matrix.loc[row, col] = np.min(p_values)
matrix.columns = [var + '_x' for var in variables]
matrix.index = [var + '_y' for var in variables]
print(matrix)



           WIND_x   IND_x  RAIN_x  IND.1_x  T.MAX_x  IND.2_x  T.MIN_x  \
WIND_y     1.0000  0.0007  0.0000   0.0019   0.0000   0.0000   0.0000   
IND_y      0.2700  1.0000  0.0468   0.1626   0.0001   0.0931   0.0022   
RAIN_y     0.0007  0.3440  1.0000   0.2500   0.1410   0.4058   0.3219   
IND.1_y    0.0014  0.5768  0.2055   1.0000   0.3845   0.0000   0.5088   
T.MAX_y    0.0000  0.0537  0.0000   0.1235   1.0000   0.0490   0.0000   
IND.2_y    0.0000  0.0509  0.0011   0.0000   0.0000   1.0000   0.0000   
T.MIN_y    0.0000  0.0000  0.0000   0.7312   0.0000   0.1945   1.0000   
T.MIN.G_y  0.0000  0.0000  0.0000   0.8479   0.0000   0.1386   0.0000   

           T.MIN.G_x  
WIND_y        0.0000  
IND_y         0.0001  
RAIN_y        0.6413  
IND.1_y       0.3098  
T.MAX_y       0.0000  
IND.2_y       0.0000  
T.MIN_y       0.0000  
T.MIN.G_y     1.0000  


### df2 causality

In [13]:
from statsmodels.tsa.stattools import grangercausalitytests

# Pairwise Granger-causality screen for df2.
# Each cell holds the smallest 'ssr_chi2test' p-value over lags 1..max_lag for
# the pair df2[[row, col]]; per the statsmodels docs the test asks whether the
# second column (col, labelled _x) helps predict the first (row, labelled _y).
# NOTE: taking the minimum over many lags is optimistic (multiple testing), so
# treat small values as a screening hint rather than a formal result.
max_lag = 20  # single source of truth for both the test and the lag loop
variables = df2.columns
# Start from 1.0 everywhere: the diagonal (a series against itself) is not
# tested, because the extra lags are identical to the ones already in the
# model and the p-value is 1.0 by construction.
matrix = pd.DataFrame(np.ones((len(variables), len(variables))), columns=variables, index=variables)
for col in variables:
    for row in variables:
        if row == col:
            continue
        test_result = grangercausalitytests(df2[[row, col]], maxlag=max_lag, verbose=False)
        # test_result is keyed by lag number, starting at 1.
        p_values = [round(test_result[lag][0]['ssr_chi2test'][1], 4) for lag in range(1, max_lag + 1)]
        matrix.loc[row, col] = np.min(p_values)
matrix.columns = [var + '_x' for var in variables]
matrix.index = [var + '_y' for var in variables]
print(matrix)

           WIND_x   IND_x  RAIN_x  IND.1_x  T.MAX_x  IND.2_x  T.MIN_x  \
WIND_y     1.0000  0.0007  0.0000   0.0018   0.0000   0.0000   0.0000   
IND_y      0.2700  1.0000  0.0468   0.1594   0.0001   0.1955   0.0087   
RAIN_y     0.0007  0.3440  1.0000   0.2339   0.0696   0.4787   0.3655   
IND.1_y    0.0010  0.6031  0.1924   1.0000   0.0546   0.0000   0.3571   
T.MAX_y    0.0000  0.3840  0.0001   0.0104   1.0000   0.0184   0.0000   
IND.2_y    0.0000  0.0568  0.0011   0.0000   0.0000   1.0000   0.0000   
T.MIN_y    0.0000  0.0000  0.0000   0.5146   0.0000   0.1601   1.0000   
T.MIN.G_y  0.0000  0.0000  0.0000   0.7561   0.0000   0.0968   0.0000   

           T.MIN.G_x  
WIND_y        0.0000  
IND_y         0.0001  
RAIN_y        0.6777  
IND.1_y       0.1344  
T.MAX_y       0.0000  
IND.2_y       0.0000  
T.MIN_y       0.0000  
T.MIN.G_y     1.0000  


### cointegration test

In [19]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def cointegration_test(df, alpha=0.05, det_order=-1, k_ar_diff=5):
    """Run a Johansen cointegration test on ``df`` and print a summary table.

    Parameters
    ----------
    df : DataFrame-like
        Passed to ``coint_johansen``; its ``columns`` are used as row labels.
    alpha : float, default 0.05
        Significance level. Must be 0.10, 0.05 or 0.01, the three levels for
        which critical values are looked up.
    det_order : int, default -1
        Forwarded to ``coint_johansen`` as its second argument.
    k_ar_diff : int, default 5
        Forwarded to ``coint_johansen`` as its third argument.

    Raises
    ------
    KeyError
        If ``alpha`` is not one of the supported levels.

    Notes
    -----
    NOTE(review): rows are labelled with column names, but per the statsmodels
    docs the i-th trace statistic tests a cointegration rank of at most i; it
    does not refer to that particular column. Confirm before interpreting.
    """
    out = coint_johansen(df, det_order, k_ar_diff)

    # Column index into out.cvt for each confidence level. Keys are floats and
    # 1 - alpha is rounded to strip float noise: the old string lookup failed
    # for alpha=0.1 because str(0.9) is '0.9', not '0.90'.
    crit_columns = {0.90: 0, 0.95: 1, 0.99: 2}
    confidence = round(1 - alpha, 10)
    try:
        crit_col = crit_columns[confidence]
    except KeyError:
        raise KeyError(f"alpha must be one of 0.10, 0.05 or 0.01, got {alpha!r}") from None

    traces = out.lr1
    cvts = out.cvt[:, crit_col]

    def adjust(val, length=6):
        """Left-justify ``str(val)`` to ``length`` characters."""
        return str(val).ljust(length)

    # Summary (the header shows the confidence level actually used)
    print(f'Name   ::  Test Stat > C({confidence:.0%})    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(df.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace,2), 9), ">", adjust(cvt, 8), ' =>  ' , trace > cvt)

### df1 cointegration

In [18]:
cointegration_test(df1)

Name   ::  Test Stat > C(95%)    =>   Signif  
 ----------------------------------------
WIND   ::  2920.17   > 143.6691  =>   True
IND    ::  2131.87   > 111.7797  =>   True
RAIN   ::  1384.65   > 83.9383   =>   True
IND.1  ::  749.26    > 60.0627   =>   True
T.MAX  ::  395.25    > 40.1749   =>   True
IND.2  ::  176.34    > 24.2761   =>   True
T.MIN  ::  52.28     > 12.3212   =>   True
T.MIN.G ::  5.81      > 4.1296    =>   True


### df2 cointegration

In [20]:
cointegration_test(df2)

Name   ::  Test Stat > C(95%)    =>   Signif  
 ----------------------------------------
WIND   ::  2974.58   > 143.6691  =>   True
IND    ::  2172.65   > 111.7797  =>   True
RAIN   ::  1427.22   > 83.9383   =>   True
IND.1  ::  751.79    > 60.0627   =>   True
T.MAX  ::  399.63    > 40.1749   =>   True
IND.2  ::  182.16    > 24.2761   =>   True
T.MIN  ::  52.49     > 12.3212   =>   True
T.MIN.G ::  6.07      > 4.1296    =>   True
