In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold, train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols

  from pandas.core import datetools


In [2]:
# Load the cardiovascular/poverty dataset that was assembled as the project 1
# dataset, then relabel its six columns (by position) with space-free names so
# they can be referenced directly inside the regression formulas below.
data = pd.read_excel('master_dataset.xlsx')
data.columns = [
    'STATE',
    'Total_Population',
    'Number_of_Poor',
    'Poverty_Rate',
    'Year',
    'Mortality_Rate',
]

# Machine Learning Attempt 1 - Linear Regression

In [3]:
# Baseline model: regress Mortality_Rate on Poverty_Rate alone and print the
# fitted OLS summary.
linreg1 = ols(
    formula='Mortality_Rate ~ Poverty_Rate',
    data=data,
).fit()
print(linreg1.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     26.10
Date:                Tue, 20 Feb 2018   Prob (F-statistic):           5.30e-07
Time:                        14:00:44   Log-Likelihood:                -450.64
No. Observations:                 357   AIC:                             905.3
Df Residuals:                     355   BIC:                             913.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        2.9430      0.157     18.804   

With an R-squared of only 0.068, this model clearly does a poor job of explaining the variance in the data. Could that be because the data is longitudinal in nature? Let's add the year variable and see what happens.

In [4]:
# Second model: keep Poverty_Rate and add Year as an additional predictor.
linreg2 = ols(
    formula='Mortality_Rate ~ Poverty_Rate + Year',
    data=data,
).fit()
print(linreg2.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.777
Model:                            OLS   Adj. R-squared:                  0.776
Method:                 Least Squares   F-statistic:                     617.8
Date:                Tue, 20 Feb 2018   Prob (F-statistic):          3.52e-116
Time:                        14:00:44   Log-Likelihood:                -195.21
No. Observations:                 357   AIC:                             396.4
Df Residuals:                     354   BIC:                             408.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      152.1105      4.445     34.224   

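We see that there is a warning about strong multicollinearity. But we know from our correlation matrix that the poverty rate and year have a Pearson coefficient of only -0.05, so these two predictors are not collinear with each other. A more likely cause is scale: if Year holds raw calendar years, its values are in the thousands and vary only slightly, which makes the Year column nearly collinear with the intercept and inflates the condition number behind the warning. Centering Year (subtracting its mean) would be one way to check this.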

Let's toss the kitchen sink at it and see what we get.

In [5]:
# "Kitchen sink" model: every available column (STATE, Total_Population,
# Number_of_Poor, Poverty_Rate and Year) is used as a predictor.
linreg3 = ols(
    formula='Mortality_Rate ~ STATE + Total_Population + Number_of_Poor + Poverty_Rate + Year',
    data=data,
).fit()
print(linreg3.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.973
Method:                 Least Squares   F-statistic:                     234.8
Date:                Tue, 20 Feb 2018   Prob (F-statistic):          1.30e-216
Time:                        14:00:44   Log-Likelihood:                 207.99
No. Observations:                 357   AIC:                            -306.0
Df Residuals:                     302   BIC:                            -92.71
Df Model:                          54                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

We're still dealing with a multicollinearity warning. We know that total population and number of poor are strongly correlated, so we will remove one of them. We will drop the Number of Poor variable, since it has a higher correlation with the poverty rate than the total population variable does.

In [6]:
# Same predictors as the full model except Number_of_Poor, which is left out.
linreg4 = ols(
    formula='Mortality_Rate ~ STATE + Poverty_Rate + Total_Population + Year',
    data=data,
).fit()
print(linreg4.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.973
Method:                 Least Squares   F-statistic:                     240.0
Date:                Tue, 20 Feb 2018   Prob (F-statistic):          8.46e-218
Time:                        14:00:44   Log-Likelihood:                 207.97
No. Observations:                 357   AIC:                            -307.9
Df Residuals:                     303   BIC:                            -98.55
Df Model:                          53                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

Once again we get a multicollinearity warning. We suspect it is coming from the year variable, so we remove it.

In [7]:
# Model without Year: only STATE, Total_Population and Poverty_Rate remain.
linreg5 = ols(
    formula='Mortality_Rate ~ STATE + Total_Population + Poverty_Rate',
    data=data,
).fit()
print(linreg5.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                     5.005
Date:                Tue, 20 Feb 2018   Prob (F-statistic):           1.25e-19
Time:                        14:00:44   Log-Likelihood:                -352.90
No. Observations:                 357   AIC:                             811.8
Df Residuals:                     304   BIC:                             1017.
Df Model:                          52                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

Another multicollinearity warning, and we see that the total population variable is essentially useless. So we remove it.

In [8]:
# Model with Total_Population removed as well: STATE and Poverty_Rate only.
linreg6 = ols(
    formula='Mortality_Rate ~ STATE + Poverty_Rate',
    data=data,
).fit()
print(linreg6.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.249
Model:                            OLS   Adj. R-squared:                  0.123
Method:                 Least Squares   F-statistic:                     1.983
Date:                Tue, 20 Feb 2018   Prob (F-statistic):           0.000226
Time:                        14:00:44   Log-Likelihood:                -412.18
No. Observations:                 357   AIC:                             928.4
Df Residuals:                     305   BIC:                             1130.
Df Model:                          51                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [9]:
# Single-predictor model using Year alone, for comparison with the
# Poverty_Rate-only baseline.
linreg7 = ols(
    formula='Mortality_Rate ~ Year',
    data=data,
).fit()
print(linreg7.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.731
Model:                            OLS   Adj. R-squared:                  0.731
Method:                 Least Squares   F-statistic:                     966.8
Date:                Tue, 20 Feb 2018   Prob (F-statistic):          2.25e-103
Time:                        14:00:44   Log-Likelihood:                -228.64
No. Observations:                 357   AIC:                             461.3
Df Residuals:                     355   BIC:                             469.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    154.8594      4.861     31.856      0.0

In [10]:
# Model with STATE, Poverty_Rate and Year (both population-count columns are
# excluded).
linreg8 = ols(
    formula='Mortality_Rate ~ STATE + Poverty_Rate + Year',
    data=data,
).fit()
print(linreg8.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.973
Method:                 Least Squares   F-statistic:                     245.2
Date:                Tue, 20 Feb 2018   Prob (F-statistic):          5.93e-219
Time:                        14:00:44   Log-Likelihood:                 207.86
No. Observations:                 357   AIC:                            -309.7
Df Residuals:                     304   BIC:                            -104.2
Df Model:                          52                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [12]:
# Model with STATE and Year only, i.e. with Poverty_Rate left out entirely.
linreg9 = ols(
    formula='Mortality_Rate ~ STATE + Year',
    data=data,
).fit()
print(linreg9.summary())

                            OLS Regression Results                            
Dep. Variable:         Mortality_Rate   R-squared:                       0.976
Model:                            OLS   Adj. R-squared:                  0.972
Method:                 Least Squares   F-statistic:                     241.3
Date:                Tue, 20 Feb 2018   Prob (F-statistic):          1.19e-217
Time:                        14:42:21   Log-Likelihood:                 201.07
No. Observations:                 357   AIC:                            -298.1
Df Residuals:                     305   BIC:                            -96.50
Df Model:                          51                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 