In [1]:
"""
The following code fits an OLS regression using the statsmodels package on the state economy data generated earlier

This will allow us to see the impact of minimum wage on the Gini index relative to other factors and build a model of how
different minimum wages affect income inequality
"""

'\nThe following code fits an OLS regression using the statsmodels package on the state economy data generated earlier\n\nThis will allow us to see the impact of minimum wage on the Gini index relative to other factors and build a model of how\ndifferent minimum wages affect income inequality\n'

In [2]:
import pandas as pd

In [26]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score

In [4]:
# Read the state economy CSV; the path is relative to the notebook's working directory.
state_econ = pd.read_csv(filepath_or_buffer="../Data/state_econ.csv")

In [5]:
# Show the frame as loaded, before State/Year are moved into the index.
state_econ

Unnamed: 0,State,Year,gini,min_wage,unemployment,GDP (in millions USD),Population,Residents with Bachelor's Degree or Higher,GDP per Capita,% with College Degree,Union Membership %
0,Alabama,2010,0.4720,7.25,10.375000,175470.1,4785298,439863,36668.583649,9.191967,10.1
1,Alaska,2010,0.4220,7.75,8.133333,52947.7,713985,83346,74158.000518,11.673354,22.9
2,Arizona,2010,0.4550,7.25,10.300000,248125.3,6413737,691889,38686.541091,10.787611,6.4
3,Arkansas,2010,0.4580,6.25,7.858333,100970.8,2921606,252888,34560.033078,8.655787,4.0
4,California,2010,0.4710,8.00,12.458333,1973511.9,37349363,4610875,52839.238517,12.345257,17.5
...,...,...,...,...,...,...,...,...,...,...,...
495,Virginia,2019,0.4690,7.25,2.758333,556905.2,8535519,1312800,65245.616582,15.380436,11.2
496,Washington,2019,0.4577,13.50,4.233333,612996.5,7614893,1204728,80499.686601,15.820682,4.0
497,West Virginia,2019,0.4644,8.75,4.933333,78863.9,1792147,161686,44005.262961,9.021916,10.2
498,Wisconsin,2019,0.4391,7.25,3.166667,349416.5,5822434,829878,60012.101468,14.253111,8.1


In [6]:
# Index rows by (State, Year). The guard keeps this cell safe to re-run:
# once the two columns have been moved into the index, calling set_index
# again would raise KeyError because "State" and "Year" are no longer columns.
if list(state_econ.index.names) != ["State", "Year"]:
    state_econ = state_econ.set_index(["State", "Year"])

In [7]:
# Show the frame again to confirm that State and Year now form the row index.
state_econ

Unnamed: 0_level_0,Unnamed: 1_level_0,gini,min_wage,unemployment,GDP (in millions USD),Population,Residents with Bachelor's Degree or Higher,GDP per Capita,% with College Degree,Union Membership %
State,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,2010,0.4720,7.25,10.375000,175470.1,4785298,439863,36668.583649,9.191967,10.1
Alaska,2010,0.4220,7.75,8.133333,52947.7,713985,83346,74158.000518,11.673354,22.9
Arizona,2010,0.4550,7.25,10.300000,248125.3,6413737,691889,38686.541091,10.787611,6.4
Arkansas,2010,0.4580,6.25,7.858333,100970.8,2921606,252888,34560.033078,8.655787,4.0
California,2010,0.4710,8.00,12.458333,1973511.9,37349363,4610875,52839.238517,12.345257,17.5
...,...,...,...,...,...,...,...,...,...,...
Virginia,2019,0.4690,7.25,2.758333,556905.2,8535519,1312800,65245.616582,15.380436,11.2
Washington,2019,0.4577,13.50,4.233333,612996.5,7614893,1204728,80499.686601,15.820682,4.0
West Virginia,2019,0.4644,8.75,4.933333,78863.9,1792147,161686,44005.262961,9.021916,10.2
Wisconsin,2019,0.4391,7.25,3.166667,349416.5,5822434,829878,60012.101468,14.253111,8.1


In [8]:
# Pairwise Pearson correlations between all columns; used to spot predictors
# that are nearly collinear before fitting the regression.
corr_coef = state_econ.corr(method="pearson")
corr_coef

Unnamed: 0,gini,min_wage,unemployment,GDP (in millions USD),Population,Residents with Bachelor's Degree or Higher,GDP per Capita,% with College Degree,Union Membership %
gini,1.0,0.127601,0.120652,0.504289,0.506531,0.50115,0.030443,-0.065912,0.020578
min_wage,0.127601,1.0,-0.176316,0.209803,0.135136,0.190376,0.381032,0.416558,-0.069825
unemployment,0.120652,-0.176316,1.0,0.090806,0.180325,0.122991,-0.401189,-0.43164,0.178297
GDP (in millions USD),0.504289,0.209803,0.090806,1.0,0.973709,0.987217,0.303998,0.171484,0.031207
Population,0.506531,0.135136,0.180325,0.973709,1.0,0.989931,0.164326,0.081558,0.03904
Residents with Bachelor's Degree or Higher,0.50115,0.190376,0.122991,0.987217,0.989931,1.0,0.247359,0.186776,0.045451
GDP per Capita,0.030443,0.381032,-0.401189,0.303998,0.164326,0.247359,1.0,0.624083,-0.038984
% with College Degree,-0.065912,0.416558,-0.43164,0.171484,0.081558,0.186776,0.624083,1.0,-0.083877
Union Membership %,0.020578,-0.069825,0.178297,0.031207,0.03904,0.045451,-0.038984,-0.083877,1.0


In [9]:
# Remove features that are highly collinear with other features and with each other.
# errors="ignore" makes the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of raising KeyError.
state_econ = state_econ.drop(columns=["Population", "GDP (in millions USD)"], errors="ignore")

In [10]:
# Hold out 20% of the rows for evaluation. The predictors are obtained by
# dropping the target by name rather than slicing columns[1:], so the split
# stays correct even if "gini" is not the first column of the frame.
# NOTE(review): rows are (State, Year) observations, so a random row split puts
# the same state in both train and test; confirm that is acceptable here.
econ_train, econ_test, gini_train, gini_test = train_test_split(
    state_econ.drop(columns="gini"),
    state_econ["gini"],
    test_size=0.2,
    random_state=69,
)

In [11]:
# Inspect the training predictors produced by the split (target column excluded).
econ_train

Unnamed: 0_level_0,Unnamed: 1_level_0,min_wage,unemployment,Residents with Bachelor's Degree or Higher,GDP per Capita,% with College Degree,Union Membership %
State,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Colorado,2015,8.23,3.741667,909032,58276.933475,16.659391,8.4
Nevada,2010,6.55,13.733333,256589,45718.546114,9.486986,9.3
Kansas,2019,7.25,3.275000,414851,60581.557635,14.239831,6.3
Oklahoma,2019,2.00,3.133333,447888,51058.271592,11.318961,11.9
Hawaii,2010,7.25,6.825000,184364,50058.043987,13.520179,4.0
...,...,...,...,...,...,...,...
South Dakota,2011,7.25,4.600000,97956,50589.868484,11.886681,3.4
Delaware,2012,7.25,7.066667,111921,67459.644180,12.203901,10.4
Delaware,2019,9.25,3.591667,134287,79159.221331,13.790508,8.7
Arkansas,2014,6.25,5.900000,272411,39156.220956,9.183315,4.7


In [12]:
# Full specification: regress gini on every remaining predictor.
# add_constant supplies the intercept column for the design matrix.
model = sm.OLS(endog=gini_train, exog=sm.add_constant(econ_train))

In [13]:
# Fit the full model on the training split.
results = model.fit()

In [14]:
# Coefficient table and fit diagnostics for the full model.
results.summary()

0,1,2,3
Dep. Variable:,gini,R-squared:,0.29
Model:,OLS,Adj. R-squared:,0.279
Method:,Least Squares,F-statistic:,26.72
Date:,"Tue, 28 Nov 2023",Prob (F-statistic):,1.07e-26
Time:,17:25:58,Log-Likelihood:,1070.5
No. Observations:,400,AIC:,-2127.0
Df Residuals:,393,BIC:,-2099.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4702,0.008,58.622,0.000,0.454,0.486
min_wage,0.0015,0.001,2.395,0.017,0.000,0.003
unemployment,-0.0003,0.000,-0.610,0.542,-0.001,0.001
Residents with Bachelor's Degree or Higher,1.126e-08,9.66e-10,11.655,0.000,9.36e-09,1.32e-08
GDP per Capita,-7.178e-08,1.02e-07,-0.706,0.481,-2.72e-07,1.28e-07
% with College Degree,-0.0018,0.001,-3.334,0.001,-0.003,-0.001
Union Membership %,-0.0001,0.000,-0.630,0.529,-0.000,0.000

0,1,2,3
Omnibus:,0.144,Durbin-Watson:,1.943
Prob(Omnibus):,0.931,Jarque-Bera (JB):,0.15
Skew:,0.045,Prob(JB):,0.928
Kurtosis:,2.971,Cond. No.,11900000.0


In [15]:
"""
The following features are being dropped for not meeting the threshold for statistical significance or having a coefficient
that is essentially zero (suggesting the model regards that variable as irrelevant)

Unfortunately, this means our regression only uses 2 of the initial variables to predict gini coefficient
"""

# NOTE(review): "Residents with Bachelor's Degree or Higher" is statistically
# significant in the full model (t = 11.655, p = 0.000). Its coefficient is tiny
# only because the variable is a raw head count; coefficient magnitude depends
# on units and is not evidence of irrelevance. Removing it coincides with
# R-squared falling from 0.29 to 0.028, so consider rescaling and keeping it.

# Keep the two retained predictors by name. Selecting (instead of dropping the
# other four columns from each frame) states the list once and makes the cell
# idempotent: a second drop() call would raise KeyError on the missing labels.
selected_features = ["min_wage", "% with College Degree"]
econ_train = econ_train[selected_features]
econ_test = econ_test[selected_features]

In [16]:
# Confirm that only min_wage and % with College Degree remain as predictors.
econ_train

Unnamed: 0_level_0,Unnamed: 1_level_0,min_wage,% with College Degree
State,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
Colorado,2015,8.23,16.659391
Nevada,2010,6.55,9.486986
Kansas,2019,7.25,14.239831
Oklahoma,2019,2.00,11.318961
Hawaii,2010,7.25,13.520179
...,...,...,...
South Dakota,2011,7.25,11.886681
Delaware,2012,7.25,12.203901
Delaware,2019,9.25,13.790508
Arkansas,2014,6.25,9.183315


In [17]:
# Reduced specification: regress gini on the two retained predictors plus an intercept.
gini_model = sm.OLS(endog=gini_train, exog=sm.add_constant(econ_train))

In [18]:
# Fit the reduced model. This rebinds `results`, so the full-model fit from the
# earlier cell is no longer reachable under that name.
results = gini_model.fit()

In [19]:
# Coefficient table and fit diagnostics for the reduced two-predictor model.
results.summary()

0,1,2,3
Dep. Variable:,gini,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.023
Method:,Least Squares,F-statistic:,5.737
Date:,"Tue, 28 Nov 2023",Prob (F-statistic):,0.0035
Time:,17:25:58,Log-Likelihood:,1007.8
No. Observations:,400,AIC:,-2010.0
Df Residuals:,397,BIC:,-1998.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4608,0.007,69.956,0.000,0.448,0.474
min_wage,0.0023,0.001,3.089,0.002,0.001,0.004
% with College Degree,-0.0013,0.001,-2.558,0.011,-0.002,-0.000

0,1,2,3
Omnibus:,0.903,Durbin-Watson:,2.013
Prob(Omnibus):,0.637,Jarque-Bera (JB):,0.957
Skew:,-0.113,Prob(JB):,0.62
Kurtosis:,2.918,Cond. No.,101.0


In [21]:
# Predict gini for the held-out rows. has_constant="add" forces the intercept
# column to be added even if a test-set column happens to be constant; the
# default ("skip") would omit it in that case, and the design matrix would no
# longer line up with the fitted parameters.
preds = results.predict(sm.add_constant(econ_test, has_constant="add"))

In [23]:
# Mean squared error of the reduced model on the held-out test rows.
mse = mean_squared_error(y_true=gini_test, y_pred=preds)

In [24]:
# Display the test-set mean squared error.
mse

0.0003302094753296045

In [None]:
# A low mean squared error might suggest a strong fit, but the Gini coefficient varies very little across observations, so the MSE is small on an absolute scale regardless of how well the model fits

In [27]:
# Share of test-set variance in gini that the reduced model accounts for.
r2 = r2_score(y_true=gini_test, y_pred=preds)

In [28]:
# Display the test-set R-squared.
r2

0.05625712131498162

In [None]:
"""
Together, minimum wage and college degree percentage explain only about 5.6 percent of the variation
in the Gini coefficient on the held-out test set. This suggests that our model is not very predictive and that a more
predictive model would likely need many more variables
"""