In [2]:
# Here, we provide an example of IV-2SLS analysis of log(production) on tavg for Alberta.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize

from collections.abc import Mapping
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS, IVGMMCUE

import os
import multiprocessing as mp

In [11]:
# prepare cPCs
covar_df = pd.read_csv("./data/user_data/_covariates/ppi_and_usd_imputed.csv")

# - min max scale each column of covar_df
from sklearn.preprocessing import MinMaxScaler
covar_sub = covar_df.drop(columns = ["date", "year", "month"])
covar_scaled = pd.DataFrame(MinMaxScaler().fit_transform(covar_sub), columns = covar_sub.columns)

# - apply PCA to covar_scaled
n_pca = 5
pca = PCA(n_components = n_pca)
covar_pca = pca.fit_transform(covar_scaled)
covar_pca = pd.DataFrame(covar_pca, columns = ["PC"+str(i) for i in range(1, n_pca+1)])
ppi_covars = covar_pca.columns.tolist()

# - add date column to covar_pca
covar_df = pd.concat([covar_df.loc[:, "date"], covar_pca], axis = 1)

print(np.round(np.sum(pca.explained_variance_ratio_), 2))
display(covar_df.head())

0.98


Unnamed: 0,date,PC1,PC2,PC3,PC4,PC5
0,1997-01-01,-1.437976,-0.032755,-0.226921,0.030429,0.183384
1,1997-02-01,-1.443793,-0.04861,-0.244162,0.050216,0.18565
2,1997-03-01,-1.439155,-0.083351,-0.263989,0.071648,0.184915
3,1997-04-01,-1.409855,-0.157256,-0.262722,0.102872,0.189721
4,1997-05-01,-1.395066,-0.13136,-0.259931,0.09258,0.19242


In [None]:
prov_short = ["AB", "BC", "MB", "NB", "NL", "NS", "ON", "PE", "QC", "SK"]
i_prov = 0
treatment = "tavg"

prod_filename = "./data/user_data/01_iv_analysis/" + prov_short[i_prov] + "/prod_temp.csv"

prod_data = pd.read_csv(prod_filename)
prod_data = prod_data.rename(columns = {"Date":"date"})
prod_data["tdiff"] = np.abs(prod_data.tmax - prod_data.tmin)
prod_data["log_population"] = np.log(prod_data.Population + 1)

# - left join the cPCs to the prod_data by date.
full_df = pd.merge(prod_data, covar_df, on='date', how='left')

# - apply lags to ppi_covars
lags = 3
for lag in range(1, lags+1):
    for covar in ppi_covars:
        full_df[covar + "_lag_" + str(lag)] = full_df[covar].shift(lag)

full_df = full_df.dropna()

prods = prod_data.columns[2:16]

result = pd.DataFrame(columns = ["industry", "param", "pval(param)", "pval(endog)", "instr"])


# - Perform IV-2SLS on each industry
for i in range(len(prods)):
    
    example_prod = prods[i]
    temperatures = ["tavg", "tmin", "tmax"]
    instruments = ["lat", "long"]

    eff_modifiers = [s + "_lag_"+str(lags) for s in ppi_covars]  # lagged covariates.

    example_cols = [example_prod] + eff_modifiers + temperatures + instruments + ["date", "year", "month", "log_population"]
    example_df = full_df.loc[:, example_cols].reset_index(drop= True)
    
    # - create log(production) column. 
    example_df = example_df.rename(columns = {example_prod:"production"})
    example_df.loc[:, "log_production"] = np.log(example_df.production + 1)

    # - create columns for the seasons of the year
    example_df.loc[:, "spring"] = (example_df.month.isin([3,4,5])).astype(int)
    example_df.loc[:, "summer"] = (example_df.month.isin([6,7,8])).astype(int)
    example_df.loc[:, "fall"] = (example_df.month.isin([9,10,11])).astype(int)
    example_df.loc[:, "winter"] = (example_df.month.isin([12,1,2])).astype(int)


    import statsmodels.formula.api as smf
    from linearmodels.iv import IV2SLS, IVGMMCUE

    instruments = ["lat", "long"]
    covariates = eff_modifiers + ["log_population", "year", "month", "spring", "summer", "fall"]

    # - Perform IV analyses
    iv_model = IV2SLS(
        dependent = example_df["log_production"],
        exog = example_df.loc[:, covariates],
        endog = example_df.loc[:, treatment],
        instruments = example_df.loc[:, ["lat", "long"]],
    ).fit()

    # - Include Wu-Hausman test. H0: all endogenous variables are exogenous.
    result.loc[i] = [example_prod, 
        np.round(iv_model.params[treatment], 3), 
        np.round(iv_model.pvalues[treatment], 3),
        np.round(iv_model.wu_hausman().pval, 3),  # endog test
        "lat_long"]
    
result.loc[:, "industry"] = result.loc[:, "industry"].str.replace("production_in_division_", "")

In [20]:
# if pval(param) or pval(endog) > 0.05, then set new_param = 0.
bonferroni_pval = 0.05/14

result["is_sig"] = np.where((result["pval(param)"] < bonferroni_pval) & (result["pval(endog)"] < bonferroni_pval), True, False)
result["new_param"] = np.where(result["is_sig"] == True, result["param"], 0)
result.loc[:, ["industry", "new_param", "pval(param)", "pval(endog)"]]


Unnamed: 0,industry,new_param,pval(param),pval(endog)
0,X22.Utilities,-0.048,0.0,0.0
1,X23.Construction,0.0,0.004,0.0
2,X31.33.Manufacturing,0.079,0.0,0.0
3,X48.49.Transportation.and.warehousing,0.0,0.924,0.0
4,X61.Educational.services,-0.035,0.0,0.0
5,X62.Health.care.and.social.assistance,0.028,0.0,0.0
6,X72.Accommodation.and.food.services,0.017,0.0,0.0
7,X81.Other.services..except.public.administration.,0.0,0.958,0.0
8,X91.Public.administration,-0.084,0.0,0.0
9,X11.Agriculture.forestry.fishing.hunting.21.Mi...,0.036,0.0,0.0
