### PM2.5 and Asthma OLS Regression

In [2]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm
 

In [3]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
#retrieving the merged csv
link = 'https://drive.google.com/file/d/1UWOki05xDRVQGxkD_hh6aXYbaQvxOSRu/view?usp=share_link'

In [5]:
id = link.split('/')[5]

In [6]:
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Asthma and PM2.5.csv')

In [7]:
#read csv 
df = pd.read_csv('Asthma and PM2.5.csv')
df.head()

Unnamed: 0,DataValue,StratificationID1,statefips,ds_pm_pred,ds_pm_stdd
0,10.11,GENF,5,10.6905,4.1248
1,10.11,GENF,5,13.2459,5.112
2,10.11,GENF,5,11.1476,4.3234
3,10.11,GENF,5,11.1428,4.4293
4,10.11,GENF,5,11.0554,4.4333


In [8]:
#collected from U.S. Census Bureau's Small Area Income and Poverty Estimates (SAIPE)
avg_poverty_rate = {
    1: 17.625,
    2: 10.975,
    4: 19.15,
    5: 19.45,
    6: 16.025,
    8: 13.925,
    9: 10.6,
    10: 10.075,
    11: 18.725,
    12: 16.525,
    13: 18.425,
    15: 9.725,
    16: 15.2,
    17: 14.6,
    18: 15.225,
    19: 12.7,
    20: 14.5,
    21: 17.275,
    22: 19.025,
    23: 14.075,
    24: 10.625,
    25: 11.725,
    26: 15.725,
    27: 13.55,
    28: 19.075,
    29: 15.425,
    30: 15.525,
    31: 12.425,
    32: 15.225,
    33: 9.975,
    34: 11.975,
    35: 18.55,
    36: 15.65,
    37: 17.625,
    38: 12.3,
    39: 15.375,
    40: 16.35,
    41: 15.175,
    42: 13.35,
    44: 15.85,
    45: 16.25,
    46: 14.65,
    47: 15.025,
    48: 15.7,
    49: 11.85,
    50: 12.925,
    51: 11.175,
    53: 13.725,
    54: 17.2,
    55: 12.7,
    56: 11.775
}


In [9]:
# collected from U.S. Census Bureau's American Community Survey (ACS) 5-Year Estimate
state_unemp_dict = {
    1: 7.2,   # Alabama
    2: 7.5,   # Alaska
    4: 8.3,   # Arizona
    5: 7.0,   # Arkansas
    6: 10.0,  # California
    8: 8.2,   # Colorado
    9: 8.1,   # Connecticut
    10: 7.4,   # Delaware
    11: 7.4,   # District of Columbia
    12: 8.7,   # Florida
    13: 9.4,   # Georgia
    15: 6.2,   # Hawaii
    16: 7.2,   # Idaho
    17: 9.2,   # Illinois
    18: 8.0,   # Indiana
    19: 5.1,   # Iowa
    20: 6.5,   # Kansas
    21: 9.0,   # Kentucky
    22: 7.0,   # Louisiana
    23: 7.3,   # Maine
    24: 7.5,   # Maryland
    25: 7.0,   # Massachusetts
    26: 9.0,   # Michigan
    27: 6.6,   # Minnesota
    28: 9.3,   # Mississippi
    29: 6.4,   # Missouri
    30: 5.5,   # Montana
    31: 4.4,   # Nebraska
    32: 11.1,  # Nevada
    33: 5.5,   # New Hampshire
    34: 9.0,   # New Jersey
    35: 7.4,   # New Mexico
    36: 8.1,   # New York
    37: 9.1,   # North Carolina
    38: 3.8,   # North Dakota
    39: 7.1,   # Ohio
    40: 6.2,   # Oklahoma
    41: 8.0,   # Oregon
    42: 8.0,   # Pennsylvania
    44: 9.0,   # Rhode Island
    45: 8.3,   # South Carolina
    46: 4.3,   # South Dakota
    47: 8.0,   # Tennessee
    48: 7.1,   # Texas
    49: 4.2,   # Utah
    50: 5.0,   # Vermont
    51: 5.4,   # Virginia
    53: 8.5,   # Washington
    54: 7.4,   # West Virginia
    55: 6.2,   # Wisconsin
    56: 5.0,   # Wyoming
}


In [10]:
#clean up nulls in df
df = df[pd.to_numeric(df['ds_pm_pred'], errors='coerce').notnull()]
df.head()

Unnamed: 0,DataValue,StratificationID1,statefips,ds_pm_pred,ds_pm_stdd
0,10.11,GENF,5,10.6905,4.1248
1,10.11,GENF,5,13.2459,5.112
2,10.11,GENF,5,11.1476,4.3234
3,10.11,GENF,5,11.1428,4.4293
4,10.11,GENF,5,11.0554,4.4333


In [11]:
#add poverty rate to df
df['poverty_rate'] = df['statefips'].map(state_unemp_dict)
df.head()

Unnamed: 0,DataValue,StratificationID1,statefips,ds_pm_pred,ds_pm_stdd,poverty_rate
0,10.11,GENF,5,10.6905,4.1248,7.0
1,10.11,GENF,5,13.2459,5.112,7.0
2,10.11,GENF,5,11.1476,4.3234,7.0
3,10.11,GENF,5,11.1428,4.4293,7.0
4,10.11,GENF,5,11.0554,4.4333,7.0


In [12]:
#add unemployment rate to df
df['unemployment_rate'] = df['statefips'].map(avg_poverty_rate)
df.head()

Unnamed: 0,DataValue,StratificationID1,statefips,ds_pm_pred,ds_pm_stdd,poverty_rate,unemployment_rate
0,10.11,GENF,5,10.6905,4.1248,7.0,19.45
1,10.11,GENF,5,13.2459,5.112,7.0,19.45
2,10.11,GENF,5,11.1476,4.3234,7.0,19.45
3,10.11,GENF,5,11.1428,4.4293,7.0,19.45
4,10.11,GENF,5,11.0554,4.4333,7.0,19.45


In [13]:
#run OLS 
y = df['DataValue']
X = sm.add_constant(df[['ds_pm_pred', 'unemployment_rate', 'poverty_rate']])
sm.OLS(y, X).fit()


<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f9887a23a30>