# Pooled Cross Section Regression

For the Pooled Cross Section Regression below, I want to see if the act of dropping interest rates during recession would help make the recession less extreme. I would try to run the model on both single variable and all given variables and conclude the effect. The model would requires me to include new dummy variables for recession period and treatment effect - dropping interest rates during recession.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import scipy.stats as stat
import sklearn as sk3
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
import statsmodels.formula.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
df = pd.read_csv("../Data/modelData.csv")
# Drop NaN values
df = df.dropna()
df.head(10)

Unnamed: 0,year,month,CPI,CPI_Inflation,avg_HPI,long_term_interest,federal_funds_rates,budget_on_education,gdp,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent
173,1987,1,111.2,0.63,60.60375,7.08,6.43,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
175,1987,2,111.6,0.36,61.016875,7.25,6.1,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
176,1987,3,112.1,0.45,61.31125,7.25,6.13,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
177,1987,4,112.7,0.54,61.74625,8.02,6.37,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
179,1987,5,113.1,0.35,62.3625,8.61,6.85,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
181,1987,6,113.5,0.35,62.9125,8.4,6.73,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
182,1987,7,113.8,0.26,63.451875,8.45,6.58,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
184,1987,8,114.4,0.53,63.90125,8.76,6.73,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
186,1987,9,115.0,0.52,64.386875,9.42,7.22,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517
190,1987,10,115.3,0.26,64.75,9.52,7.29,19475,4870217,182753,61.5,6.2,22356,42434,66239,100109.0,167517


In [3]:
df.describe()

Unnamed: 0,year,month,CPI,CPI_Inflation,avg_HPI,long_term_interest,federal_funds_rates,budget_on_education,gdp,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent
count,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0
mean,1998.5,6.5,167.047392,0.238472,109.006842,5.840278,4.359132,46896.0,9707590.0,209063.875,62.479167,5.8125,23462.791667,44233.708333,69898.833333,109575.75,192885.708333
std,6.934236,3.458061,31.172453,0.322599,40.099796,1.758579,2.441827,27294.070084,3279140.0,17355.912158,1.355432,1.381202,1076.134429,1767.243965,3251.97558,6875.556281,15987.531885
min,1987.0,1.0,111.2,-1.92,60.60375,2.42,0.11,19475.0,4870217.0,182753.0,58.5,4.0,21639.0,41342.0,64985.0,99622.0,167517.0
25%,1992.75,3.75,142.45,0.0875,74.126548,4.4425,2.4375,28353.75,6793863.0,194329.75,62.075,4.85,22687.75,42917.75,66763.25,102373.75,175536.5
50%,1998.5,6.5,164.15,0.24,91.831905,5.72,4.885,33581.0,9374896.0,206486.5,62.7,5.55,23476.0,44331.5,70528.0,112876.0,201309.5
75%,2004.25,9.25,190.925,0.4125,136.224022,7.185,5.8,63600.5,12479630.0,224038.25,63.125,6.125,24171.25,45663.0,72778.75,115515.5,206440.25
max,2010.0,12.0,219.964,1.22,193.566957,9.52,9.85,131891.0,14964370.0,237830.0,64.4,9.6,25580.0,47110.0,74475.0,118516.0,212081.0


In [4]:
df.iloc[0]['year']

1987.0

In [5]:
# The variables columns will be saved in a list
# Add a column called lgdp which is the log of all gdps and use that variable instead
# Add a column called lCPI which is the log of all CPI and use that variable instead
# Add a column called lfederal_funds_rates which is the log of all federal_funds_rate and use that variable instead
# Add a dummy variable that turns on if the timeline is during the Great Recession Dec 2007 - June 2009
# Add a dummy variable that turns on if the interest rate - federal_funds_rate decreases from previous time point.

import math
lgdp = []
lCPI = []
lfederal = []
recession = []
rate_drop = []
count = []

for i in df.gdp:
    lgdp.append(math.log(i))
for i in df.CPI:
    lCPI.append(math.log(i))
for i in df.federal_funds_rates:
    lfederal.append(math.log(i))

for i in range(288):
    if df.iloc[i]['year'] == 2007 and df.iloc[i]['month'] ==12:
        count.append(i)
        recession.append(1)
    elif 2007 < df.iloc[i]['year'] < 2009:
        recession.append(1)
    elif df.iloc[i]['year'] == 2009 and df.iloc[i]['month'] < 7:
        recession.append(1)
        count.append(i)
    else:
        recession.append(0)
        
for i in range(288):
    if i == 0:
        rate_drop.append(0)
        continue
    elif df.iloc[i]['federal_funds_rates'] < df.iloc[i-1]['federal_funds_rates']:
        rate_drop.append(1)
    else:
        rate_drop.append(0)
    
df["lgdp"] = lgdp
df["lCPI"] = lCPI
df["lfederal_funds_rates"] = lfederal
df["recession"] = recession
df["treatment"] = rate_drop

use_cols = df.columns.tolist()
use_cols.remove("lgdp")
use_cols.remove("gdp")
use_cols.remove("CPI")
use_cols.remove("federal_funds_rates")

# remove year and month since we already have time dummy variable
use_cols.remove("year")
use_cols.remove("month")
from sklearn import preprocessing

x = df[use_cols].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_model = pd.DataFrame(x_scaled)
df_model.columns = use_cols
df_model["lgdp"] = df.lgdp.tolist()
print(df_model.head())

print(count)

   CPI_Inflation   avg_HPI  long_term_interest  budget_on_education  \
0       0.812102  0.000000            0.656338                  0.0   
1       0.726115  0.003107            0.680282                  0.0   
2       0.754777  0.005321            0.680282                  0.0   
3       0.783439  0.008593            0.788732                  0.0   
4       0.722930  0.013227            0.871831                  0.0   

   population  employed_percent  unemployed_percent    lowest   second  \
0         0.0          0.508475            0.392857  0.181934  0.18932   
1         0.0          0.508475            0.392857  0.181934  0.18932   
2         0.0          0.508475            0.392857  0.181934  0.18932   
3         0.0          0.508475            0.392857  0.181934  0.18932   
4         0.0          0.508475            0.392857  0.181934  0.18932   

      third    fourth  top_5_percent      lCPI  lfederal_funds_rates  \
0  0.132139  0.025775            0.0  0.000000          

In [6]:
df_model.describe()

Unnamed: 0,CPI_Inflation,avg_HPI,long_term_interest,budget_on_education,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent,lCPI,lfederal_funds_rates,recession,treatment,lgdp
count,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0
mean,0.687412,0.364034,0.481729,0.243924,0.477711,0.674435,0.323661,0.462774,0.501336,0.517791,0.526821,0.569265,0.570392,0.744092,0.065972,0.444444,16.028522
std,0.102739,0.301586,0.247687,0.242795,0.315121,0.229734,0.246643,0.273061,0.306388,0.342674,0.363902,0.358754,0.280024,0.239005,0.248666,0.497769,0.352054
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.398649
25%,0.639331,0.101703,0.284859,0.078981,0.210192,0.605932,0.151786,0.266113,0.273188,0.187381,0.145641,0.179955,0.363065,0.689145,0.0,0.0,15.731292
50%,0.687898,0.234863,0.464789,0.12548,0.430915,0.711864,0.276786,0.466125,0.518291,0.584089,0.701493,0.758291,0.570929,0.843959,0.0,0.0,16.053081
75%,0.742834,0.568731,0.671127,0.39252,0.749591,0.783898,0.379464,0.64254,0.749133,0.821259,0.841193,0.873424,0.792441,0.882171,0.0,1.0,16.339213
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16.521183


In [7]:
# I would have below:
# df_before is all the date before the Great Recession
# df_during is all the date during the Great Recession

In [8]:
df_full = df_model.iloc[0:270]
df_before = df_model.iloc[0:252]
df_before.tail()

Unnamed: 0,CPI_Inflation,avg_HPI,long_term_interest,budget_on_education,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent,lCPI,lfederal_funds_rates,recession,treatment,lgdp
247,0.55414,0.938581,0.316901,0.434013,0.891733,0.762712,0.107143,0.611266,0.866505,0.895153,1.0,0.948209,0.917428,0.850038,0.0,1.0,16.488116
248,0.700637,0.925815,0.295775,0.434013,0.891733,0.762712,0.107143,0.611266,0.866505,0.895153,1.0,0.948209,0.921463,0.846464,0.0,1.0,16.488116
249,0.678344,0.905826,0.297183,0.434013,0.891733,0.762712,0.107143,0.611266,0.866505,0.895153,1.0,0.948209,0.924596,0.838206,0.0,1.0,16.488116
250,0.799363,0.877737,0.243662,0.434013,0.891733,0.762712,0.107143,0.611266,0.866505,0.895153,1.0,0.948209,0.933277,0.825214,0.0,1.0,16.488116
251,0.589172,0.850881,0.23662,0.434013,0.891733,0.762712,0.107143,0.611266,0.866505,0.895153,1.0,0.948209,0.932293,0.812468,1.0,1.0,16.488116


In [9]:
df_before.describe()

Unnamed: 0,CPI_Inflation,avg_HPI,long_term_interest,budget_on_education,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent,lCPI,lfederal_funds_rates,recession,treatment,lgdp
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,0.692814,0.328825,0.531249,0.193299,0.408347,0.734463,0.261905,0.472324,0.511434,0.507923,0.494276,0.530485,0.512878,0.814231,0.003968,0.43254,15.960699
std,0.086407,0.304979,0.223275,0.187712,0.273442,0.144897,0.159113,0.287071,0.322834,0.36338,0.377666,0.367401,0.251086,0.127016,0.062994,0.496414,0.323634
min,0.356688,0.0,0.128169,0.0,0.0,0.508475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.486584,0.0,0.0,15.398649
25%,0.649682,0.096952,0.327465,0.070141,0.182508,0.644068,0.125,0.279117,0.279126,0.132139,0.07685,0.174401,0.331864,0.752793,0.0,0.0,15.693341
50%,0.687898,0.176234,0.500704,0.113596,0.370027,0.728814,0.267857,0.479066,0.566401,0.557429,0.508627,0.579302,0.53705,0.859793,0.0,0.0,15.968262
75%,0.742834,0.504313,0.705634,0.323468,0.632151,0.79661,0.357143,0.673179,0.778779,0.829715,0.86112,0.892559,0.713573,0.889156,0.0,1.0,16.21136
max,1.0,1.0,1.0,0.716482,0.891733,1.0,0.625,1.0,1.0,1.0,1.0,1.0,0.933277,1.0,1.0,1.0,16.488116


In [10]:
df_during = df_model.iloc[252:270]
df_during.head()

Unnamed: 0,CPI_Inflation,avg_HPI,long_term_interest,budget_on_education,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent,lCPI,lfederal_funds_rates,recession,treatment,lgdp
252,0.770701,0.81928,0.185915,0.408518,0.926612,0.627119,0.321429,0.507485,0.549584,0.69589,0.782471,0.850933,0.939562,0.796142,1.0,1.0,16.504621
253,0.703822,0.787277,0.185915,0.408518,0.926612,0.627119,0.321429,0.507485,0.549584,0.69589,0.782471,0.850933,0.943813,0.734012,1.0,1.0,16.504621
254,0.888535,0.761506,0.153521,0.408518,0.926612,0.627119,0.321429,0.507485,0.549584,0.69589,0.782471,0.850933,0.956466,0.704517,1.0,1.0,16.504621
255,0.805732,0.747854,0.177465,0.408518,0.926612,0.627119,0.321429,0.507485,0.549584,0.69589,0.782471,0.850933,0.96533,0.674443,1.0,1.0,16.504621
256,0.878981,0.738999,0.205634,0.408518,0.926612,0.627119,0.321429,0.507485,0.549584,0.69589,0.782471,0.850933,0.977623,0.643056,1.0,1.0,16.504621


In [11]:
df_during.describe()

Unnamed: 0,CPI_Inflation,avg_HPI,long_term_interest,budget_on_education,population,employed_percent,unemployed_percent,lowest,second,third,fourth,top_5_percent,lCPI,lfederal_funds_rates,recession,treatment,lgdp
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,0.659766,0.660326,0.145462,0.605679,0.938795,0.463277,0.529762,0.489385,0.528722,0.66751,0.784658,0.856304,0.963536,0.42315,1.0,0.722222,16.497761
std,0.248974,0.098813,0.072952,0.286911,0.017729,0.238425,0.30317,0.02634,0.030359,0.0413,0.003183,0.007815,0.021099,0.282011,0.0,0.460889,0.009984
min,0.0,0.525903,0.0,0.408518,0.926612,0.135593,0.321429,0.453184,0.486997,0.610748,0.782471,0.850933,0.933633,0.069004,1.0,0.0,16.484039
25%,0.597134,0.557114,0.084507,0.408518,0.926612,0.135593,0.321429,0.453184,0.486997,0.610748,0.782471,0.850933,0.947672,0.118141,1.0,0.25,16.484039
50%,0.727707,0.677897,0.178169,0.408518,0.926612,0.627119,0.321429,0.507485,0.549584,0.69589,0.782471,0.850933,0.957591,0.553693,1.0,1.0,16.504621
75%,0.799363,0.737611,0.19331,1.0,0.963161,0.627119,0.946429,0.507485,0.549584,0.69589,0.789034,0.867045,0.977524,0.646124,1.0,1.0,16.504621
max,0.933121,0.81928,0.23662,1.0,0.963161,0.627119,0.946429,0.507485,0.549584,0.69589,0.789034,0.867045,1.0,0.796142,1.0,1.0,16.504621


# Pooled Cross Section Model with just Time and treatment variable

In [12]:
# OLS Regression for treatment variable before the recession

var = "treatment"
total = "lgdp ~ " + var
residual = "resid ~ " + var
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']

result = sm.ols(formula= total, data=df_before).fit()
df_before['yhat'] = result.fittedvalues
df_before['resid'] = result.resid

# Check for Breusch Pagan test for Hetereoskedasticity
test = sms.het_breuschpagan(result.resid, result.model.exog)

print(result.params)
print(result.summary())
print("")

# Check for White test for Hetereoskedasticity
result = sm.ols(formula="resid**2 ~ yhat + yhat**2", data=df_before).fit()
white_test = sms.het_white(result.resid,  result.model.exog)

print("Breusch Pagan test for Hetereoskedasticity")
print(lzip(name, test))
print("White test for Hetereoskedasticity")
print(lzip(name, white_test))

Intercept    15.998821
treatment    -0.088136
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                   lgdp   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                     4.654
Date:                Tue, 19 May 2020   Prob (F-statistic):             0.0319
Time:                        03:30:22   Log-Likelihood:                -70.456
No. Observations:                 252   AIC:                             144.9
Df Residuals:                     250   BIC:                             152.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

In [13]:
# OLS Regression for treatment variable during the recession

var = "treatment"
total = "lgdp ~ " + var
residual = "resid ~ " + var

result = sm.ols(formula= total, data=df_during).fit()
df_during['yhat'] = result.fittedvalues
df_during['resid'] = result.resid

# Check for Breusch Pagan test for Hetereoskedasticity
test = sms.het_breuschpagan(result.resid, result.model.exog)

print(result.params)
print(result.summary())
print("")

# Check for White test for Hetereoskedasticity
result = sm.ols(formula="resid**2 ~ yhat + yhat**2", data=df_during).fit()
white_test = sms.het_white(result.resid,  result.model.exog)

print("Breusch Pagan test for Hetereoskedasticity")
print(lzip(name, test))
print("White test for Hetereoskedasticity")
print(lzip(name, white_test))

Intercept    16.492272
treatment     0.007600
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                   lgdp   R-squared:                       0.123
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     2.246
Date:                Tue, 19 May 2020   Prob (F-statistic):              0.153
Time:                        03:30:22   Log-Likelihood:                 59.078
No. Observations:                  18   AIC:                            -114.2
Df Residuals:                      16   BIC:                            -112.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

In [14]:
# There is no hetereoskedasticity error and the t statistics are significant
# We see some major differences here for the coefficient of treatment - interest rates drop
# Before the recession, the coefficient is negative means that when interest rates drop , GDP increases
# However, there seems to be an opposite effect during the recession since the coefficient is positive (confirms initial expectations)
# Since I put treatment as a sign of interest rates drop, the positive switch clearly indicates that
# the treatment during a recession really does save the GDP from falling (make recession less serious)

In [15]:
# The effect of having treatment during a recession is indicated by 0.0076 - (-0.088) = 0.0956 
# (treatment can help increases GDP during recession by 9%) ? Need estimate

# Regression for treatment effect estimate DID estimator

In [16]:
# New regression model to estimate treatment effect

var = "treatment + recession + treatment*recession"
total = "lgdp ~ " + var
residual = "resid ~ " + var

result = sm.ols(formula= total, data=df_full).fit()
df_full['yhat'] = result.fittedvalues
df_full['resid'] = result.resid

# Check for Breusch Pagan test for Hetereoskedasticity
test = sms.het_breuschpagan(result.resid, result.model.exog)

print(result.params)
print(result.summary())
print("")

# Check for White test for Hetereoskedasticity
result = sm.ols(formula="resid**2 ~ yhat + yhat**2", data=df_full).fit()
white_test = sms.het_white(result.resid,  result.model.exog)


print("Breusch Pagan test for Hetereoskedasticity")
print(lzip(name, test))
print("White test for Hetereoskedasticity")
print(lzip(name, white_test))

Intercept              15.998821
treatment              -0.093483
recession               0.493451
treatment:recession     0.100243
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                   lgdp   R-squared:                       0.182
Model:                            OLS   Adj. R-squared:                  0.173
Method:                 Least Squares   F-statistic:                     19.71
Date:                Tue, 19 May 2020   Prob (F-statistic):           1.44e-11
Time:                        03:30:22   Log-Likelihood:                -64.411
No. Observations:                 270   AIC:                             136.8
Df Residuals:                     266   BIC:                             151.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|

In [17]:
# There is clearly hetereoskedasticity error even though the t statistic is not significant (0.604)
# I believe this is the issue of limited variables and limited observations

In [18]:
# However, I can still believe that a drop of interest rate does boost GDP during recession and I would test for all variables
# The predicted positive effect here is 10%

# Pooled Cross Section including all variables

In [19]:
# New regression model to estimate treatment effect with all variables

var = "treatment*recession + "
for col in use_cols:
    var += col
    if col == use_cols[-1]:
        break
    var += " + "
total = "lgdp ~ " + var

residual = "resid ~ " + var

result = sm.ols(formula= total, data=df_full).fit()
df_full['yhat'] = result.fittedvalues
df_full['resid'] = result.resid

print(result.params)
print(result.summary())
print("")

# Check for Breusch Pagan test for Hetereoskedasticity
test = sms.het_breuschpagan(result.resid, result.model.exog)

# Check for White test for Hetereoskedasticity
result = sm.ols(formula="resid**2 ~ yhat + yhat**2", data=df_full).fit()
white_test = sms.het_white(result.resid,  result.model.exog)

print("Breusch Pagan test for Hetereoskedasticity")
print(lzip(name, test))
print("White test for Hetereoskedasticity")
print(lzip(name, white_test))

Intercept               15.289598
treatment               -0.005805
recession               -0.037094
treatment:recession      0.014012
CPI_Inflation            0.011130
avg_HPI                 -0.030368
long_term_interest      -0.030561
budget_on_education      0.043396
population               0.804642
employed_percent         0.226653
unemployed_percent       0.046676
lowest                  -0.047013
second                   0.004233
third                    0.019967
fourth                  -0.083075
top_5_percent            0.100627
lCPI                     0.347052
lfederal_funds_rates    -0.004809
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                   lgdp   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 2.611e+04
Date:                Tue, 19 May 2020   Prob (F-statistic

In [20]:
# There is hetereoskedasticity error because the p-values are small. I may not reject the null hypothesis 
# under 1% significant level with White Test
# Now the t statistic for treatment*recession is much more significant (2.814) and the coefficient is positive
# This means that my theory of having interest rates drop during recession to help improve GDP is confirmed.
# The effect on GDP in this case is quite small even tho significant (1.4%)