In [0]:
import requests
import pyspark.sql.functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.impute import SimpleImputer
from pyspark.sql.functions import *
from pyspark.sql.types import StringType,DecimalType
from pyspark.sql.functions import input_file_name, substring
from pyspark.sql.functions import isnan, when, count, col
from sodapy import Socrata



In [0]:
###### Mount Point 1 through OAuth security.
# Mounts the ADLS Gen2 container at `mount_point` using service-principal
# (client-credentials) OAuth so later cells can read it through /mnt paths.
storageAccount = "gen10datafund2207"
storageContainer = "healthcare-capstone-group3"
# SECURITY: the client secret used to be hard-coded in this cell. Anything that
# was committed that way must be treated as leaked and rotated in Azure AD.
# It is now read from a Databricks secret scope at run time.
# TODO confirm: the scope and key names below are placeholders for your workspace.
clientSecret = dbutils.secrets.get(scope = "healthcare-capstone", key = "sp-client-secret")
clientid = "2ca50102-5717-4373-b796-39d06568588d"
mount_point = "/mnt/healthcare/modeldataIn" # the mount point will be unique to you

configs = {"fs.azure.account.auth.type": "OAuth",
       "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
       "fs.azure.account.oauth2.client.id": clientid,
       "fs.azure.account.oauth2.client.secret": clientSecret,
       "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
       "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

# Re-running the cell must not fail on an existing mount, so unmount first, but
# only when the mount point is actually present. Any other unmount failure is
# allowed to surface instead of being silently swallowed.
mountedPoints = [mountInfo.mountPoint for mountInfo in dbutils.fs.mounts()]
if mount_point in mountedPoints:
    dbutils.fs.unmount(mount_point)

dbutils.fs.mount(
source = "abfss://"+storageContainer+"@"+storageAccount+".dfs.core.windows.net/",
mount_point = mount_point,
extra_configs = configs)

/mnt/healthcare/modeldataIn has been unmounted.
Out[2]: True

In [0]:
# Show the files found in the ModelData folder of the mounted container.
modelDataFiles = dbutils.fs.ls("/mnt/healthcare/modeldataIn/ModelData")
display(modelDataFiles)

path,name,size,modificationTime
dbfs:/mnt/healthcare/modeldataIn/ModelData/Demographics Test Data 2020.csv,Demographics Test Data 2020.csv,3303579,1664465328000
dbfs:/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2015.csv,Health Insurance Characteristics 2015.csv,1484769,1664223416000
dbfs:/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2016.csv,Health Insurance Characteristics 2016.csv,1484129,1664223416000
dbfs:/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2017.csv,Health Insurance Characteristics 2017.csv,1473853,1664223416000
dbfs:/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2018.csv,Health Insurance Characteristics 2018.csv,1476522,1664223416000
dbfs:/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics.csv,Health Insurance Characteristics.csv,1480572,1664374667000


**Cleaning Health15 Dataset**

In [0]:
# Clean the 2015 Health Insurance Characteristics extract: transpose it so each
# original column header becomes a row, split that header into County / State /
# PopCategory, convert the counts to integers and rename the columns.
health15 = spark.read.options(header = 'True').csv("/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2015.csv").toPandas()

# Dropping every row that contains a null value
health15 = health15.dropna()

# The stripped row labels become the column names once the table is transposed
rowLabels = [label.strip() for label in health15['Label (Grouping)']]

# Transposing so the states become the rows and the categories become the columns
health15 = health15.drop(columns = 'Label (Grouping)').transpose()
health15.columns = rowLabels

# Each original header is split on ',' (first piece -> County) and the second
# piece is split on '!!' (first piece -> State, second piece -> PopCategory).
placeParts = [header.split(',') for header in health15.index]
stateParts = [parts[1].split('!!') for parts in placeParts]
health15['County'] = [parts[0] for parts in placeParts]
health15['State'] = [parts[0] for parts in stateParts]
health15['PopCategory'] = [parts[1] for parts in stateParts]
health15 = health15.reset_index(drop = True)

# Cells holding exactly 'N' are turned into 0 so they are imputed further down
health15 = health15.replace('N', 0)

dropColumns = ['18 to 64 years', '65 years and older', \
    'Two or more races', 'White alone, not Hispanic or Latino', \
    'In family households', 'In married couple families', 'In other families', 'Male householder, no wife present', \
    'Female householder, no husband present', 'In non-family households and other living arrangements', \
    'With a disability', 'No disability', 'Civilian noninstitutionalized population 25 years and over', \
    'Civilian noninstitutionalized population 18 years and over', 'In labor force', 'Employed', \
    'Unemployed', 'Not in labor force', 'Civilian noninstitutionalized population 18 to 64 years', \
    'Worked full-time, year round in the past 12 months', 'Worked less than full-time, year round in the past 12 months', \
    'Did not work', 'Total household population', 'Civilian noninstitutionalized population for whom poverty status is determined', \
    'Below 138 percent of the poverty threshold', '138 to 199 percent of the poverty threshold', \
    '200 to 399 percent of the poverty threshold', 'At or above 400 percent of the poverty threshold']
health15 = health15.drop(columns = dropColumns)
health15 = health15.rename(columns = {'Civilian noninstitutionalized population':'Total Population', 'PopCategory':'Insurance Category'})

sortColumns = ['State', 'County', 'Insurance Category', 'Total Population', 'Under 6 years', '6 to 17 years', 'Under 18 years', '18 to 24 years', '19 to 25 years', '25 to 34 years', '35 to 44 years', '45 to 54 years', '55 to 64 years', \
    '65 to 74 years', '75 years and older', 'Male', 'Female', 'White alone', 'Black or African American alone', 'American Indian and Alaska Native alone', 'Asian alone', \
    'Native Hawaiian and Other Pacific Islander alone', 'Some other race alone', 'Hispanic or Latino (of any race)', 'Native born', 'Foreign born', 'Naturalized', \
    'Not a citizen', 'Less than high school graduate', 'High school graduate (includes equivalency)', "Some college or associate's degree", "Bachelor's degree or higher", \
    'Under $25,000', '$25,000 to $49,999', '$50,000 to $74,999', '$75,000 to $99,999', '$100,000 and over']
# .copy() makes the selection an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
health15 = health15[sortColumns].copy()

# Everything after the three identifier columns is a count stored as text with
# thousands separators. The 0 placeholders are not strings, so .str turns them
# into NaN; the fillna(0) below restores them.
columns = list(health15.columns)[3:]
for name in columns:
    health15[name] = pd.to_numeric(health15[name].str.replace(',', '', regex = False))

# Imputing the 0 values for the mean of the population
# NOTE(review): genuine zero counts and 'N' placeholders are both replaced by
# the column mean here — confirm that is intended.
health15 = health15.fillna(0)
imp = SimpleImputer(strategy = 'mean', missing_values = 0)

health15[columns] = imp.fit_transform(health15[columns])
health15[columns] = health15[columns].astype(int)

health15['State'] = health15['State'].str.lstrip().str.replace(' ', '_', regex = False).replace('District_of_Columbia', 'DC')

# Re-binning the 2015 age brackets to the bracket names used by the 2017+ cells.
# NOTE(review): (a - b) + b reduces to 'Under 18 years', so '6 to 18 years'
# still contains the 'Under 6 years' population. The arithmetic is left
# untouched so existing results do not shift — confirm the intended formula.
health15['6 to 18 years'] = (health15['Under 18 years'] - health15['6 to 17 years']) + health15['6 to 17 years']
health15 = health15.drop(columns = ['6 to 17 years', 'Under 18 years'])

# NOTE(review): this combines brackets that do not nest, so it does not isolate
# ages 26 to 34. Left numerically unchanged — confirm the intended formula.
health15['26 to 34 years'] = health15['25 to 34 years'] - (health15['19 to 25 years'] - (health15['Under 6 years'] + health15['6 to 18 years'] + health15['18 to 24 years']))
health15 = health15.drop(columns = ['25 to 34 years', '18 to 24 years'])

# Final column order (the keys) and final column names (the values). Pairing
# them explicitly avoids relying on two long lists staying in the same order.
finalNames = {
    'State': 'State', 'County': 'County', 'Insurance Category': 'Insurance_Category',
    'Total Population': 'Total_Population', 'Under 6 years': 'Under_6Y', '6 to 18 years': '_6_to_18Y',
    '19 to 25 years': '_19_to_25Y', '26 to 34 years': '_26_to_34Y', '35 to 44 years': '_35_to_44Y',
    '45 to 54 years': '_45_to_54Y', '55 to 64 years': '_55_to_64Y', '65 to 74 years': '_65_to_74Y',
    '75 years and older': '_75_and_Older', 'Male': 'Male', 'Female': 'Female', 'White alone': 'White',
    'Black or African American alone': 'African_American',
    'American Indian and Alaska Native alone': 'American_Indian', 'Asian alone': 'Asian',
    'Native Hawaiian and Other Pacific Islander alone': 'Pacific_Islander',
    'Some other race alone': 'Some_Other_Race', 'Hispanic or Latino (of any race)': 'Hispanic',
    'Native born': 'Native_Born', 'Foreign born': 'Foreign_Born', 'Naturalized': 'Naturalized',
    'Not a citizen': 'Not_A_Citizen', 'Less than high school graduate': 'Less_Than_High_School',
    'High school graduate (includes equivalency)': 'High_School_or_Equivalent',
    "Some college or associate's degree": 'Some_College', "Bachelor's degree or higher": 'Bachelors_or_Higher',
    'Under $25,000': 'Under_25000S', '$25,000 to $49,999': '_25000_to_49999S',
    '$50,000 to $74,999': '_50000_to_74999S', '$75,000 to $99,999': '_75000_to_99999S',
    '$100,000 and over': 'Over_100000S'}
health15 = health15[list(finalNames)].rename(columns = finalNames)

**Cleaning Health16 Dataset**

In [0]:
# Clean the 2016 Health Insurance Characteristics extract: transpose it so each
# original column header becomes a row, split that header into County / State /
# PopCategory, convert the counts to integers and rename the columns.
health16 = spark.read.options(header = 'True').csv("/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2016.csv").toPandas()

# Dropping every row that contains a null value
health16 = health16.dropna()

# The stripped row labels become the column names once the table is transposed
rowLabels = [label.strip() for label in health16['Label (Grouping)']]

# Transposing so the states become the rows and the categories become the columns
health16 = health16.drop(columns = 'Label (Grouping)').transpose()
health16.columns = rowLabels

# Each original header is split on ',' (first piece -> County) and the second
# piece is split on '!!' (first piece -> State, second piece -> PopCategory).
placeParts = [header.split(',') for header in health16.index]
stateParts = [parts[1].split('!!') for parts in placeParts]
health16['County'] = [parts[0] for parts in placeParts]
health16['State'] = [parts[0] for parts in stateParts]
health16['PopCategory'] = [parts[1] for parts in stateParts]
health16 = health16.reset_index(drop = True)

# Cells holding exactly 'N' are turned into 0 so they are imputed further down
health16 = health16.replace('N', 0)

dropColumns = ['18 to 64 years', '65 years and older', \
    'Two or more races', 'White alone, not Hispanic or Latino', \
    'In family households', 'In married couple families', 'In other families', 'Male householder, no wife present', \
    'Female householder, no husband present', 'In non-family households and other living arrangements', \
    'With a disability', 'No disability', 'Civilian noninstitutionalized population 25 years and over', \
    'Civilian noninstitutionalized population 18 years and over', 'In labor force', 'Employed', \
    'Unemployed', 'Not in labor force', 'Civilian noninstitutionalized population 18 to 64 years', \
    'Worked full-time, year round in the past 12 months', 'Worked less than full-time, year round in the past 12 months', \
    'Did not work', 'Total household population', 'Civilian noninstitutionalized population for whom poverty status is determined', \
    'Below 138 percent of the poverty threshold', '138 to 199 percent of the poverty threshold', \
    '200 to 399 percent of the poverty threshold', 'At or above 400 percent of the poverty threshold']
health16 = health16.drop(columns = dropColumns)
health16 = health16.rename(columns = {'Civilian noninstitutionalized population':'Total Population', 'PopCategory':'Insurance Category'})

sortColumns = ['State', 'County', 'Insurance Category', 'Total Population', 'Under 6 years', '6 to 17 years', 'Under 18 years', '18 to 24 years', '19 to 25 years', '25 to 34 years', '35 to 44 years', '45 to 54 years', '55 to 64 years', \
    '65 to 74 years', '75 years and older', 'Male', 'Female', 'White alone', 'Black or African American alone', 'American Indian and Alaska Native alone', 'Asian alone', \
    'Native Hawaiian and Other Pacific Islander alone', 'Some other race alone', 'Hispanic or Latino (of any race)', 'Native born', 'Foreign born', 'Naturalized', \
    'Not a citizen', 'Less than high school graduate', 'High school graduate (includes equivalency)', "Some college or associate's degree", "Bachelor's degree or higher", \
    'Under $25,000', '$25,000 to $49,999', '$50,000 to $74,999', '$75,000 to $99,999', '$100,000 and over']
# .copy() makes the selection an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
health16 = health16[sortColumns].copy()

# Everything after the three identifier columns is a count stored as text with
# thousands separators. The 0 placeholders are not strings, so .str turns them
# into NaN; the fillna(0) below restores them.
columns = list(health16.columns)[3:]
for name in columns:
    health16[name] = pd.to_numeric(health16[name].str.replace(',', '', regex = False))

# Imputing the 0 values for the mean of the population
# NOTE(review): genuine zero counts and 'N' placeholders are both replaced by
# the column mean here — confirm that is intended.
health16 = health16.fillna(0)
imp = SimpleImputer(strategy = 'mean', missing_values = 0)

health16[columns] = imp.fit_transform(health16[columns])
health16[columns] = health16[columns].astype(int)

health16['State'] = health16['State'].str.lstrip().str.replace(' ', '_', regex = False).replace('District_of_Columbia', 'DC')

# Re-binning the 2016 age brackets to the bracket names used by the 2017+ cells.
# NOTE(review): (a - b) + b reduces to 'Under 18 years', so '6 to 18 years'
# still contains the 'Under 6 years' population. The arithmetic is left
# untouched so existing results do not shift — confirm the intended formula.
health16['6 to 18 years'] = (health16['Under 18 years'] - health16['6 to 17 years']) + health16['6 to 17 years']
health16 = health16.drop(columns = ['6 to 17 years', 'Under 18 years'])

# NOTE(review): this combines brackets that do not nest, so it does not isolate
# ages 26 to 34. Left numerically unchanged — confirm the intended formula.
health16['26 to 34 years'] = health16['25 to 34 years'] - (health16['19 to 25 years'] - (health16['Under 6 years'] + health16['6 to 18 years'] + health16['18 to 24 years']))
health16 = health16.drop(columns = ['25 to 34 years', '18 to 24 years'])

# Final column order (the keys) and final column names (the values). Pairing
# them explicitly avoids relying on two long lists staying in the same order.
finalNames = {
    'State': 'State', 'County': 'County', 'Insurance Category': 'Insurance_Category',
    'Total Population': 'Total_Population', 'Under 6 years': 'Under_6Y', '6 to 18 years': '_6_to_18Y',
    '19 to 25 years': '_19_to_25Y', '26 to 34 years': '_26_to_34Y', '35 to 44 years': '_35_to_44Y',
    '45 to 54 years': '_45_to_54Y', '55 to 64 years': '_55_to_64Y', '65 to 74 years': '_65_to_74Y',
    '75 years and older': '_75_and_Older', 'Male': 'Male', 'Female': 'Female', 'White alone': 'White',
    'Black or African American alone': 'African_American',
    'American Indian and Alaska Native alone': 'American_Indian', 'Asian alone': 'Asian',
    'Native Hawaiian and Other Pacific Islander alone': 'Pacific_Islander',
    'Some other race alone': 'Some_Other_Race', 'Hispanic or Latino (of any race)': 'Hispanic',
    'Native born': 'Native_Born', 'Foreign born': 'Foreign_Born', 'Naturalized': 'Naturalized',
    'Not a citizen': 'Not_A_Citizen', 'Less than high school graduate': 'Less_Than_High_School',
    'High school graduate (includes equivalency)': 'High_School_or_Equivalent',
    "Some college or associate's degree": 'Some_College', "Bachelor's degree or higher": 'Bachelors_or_Higher',
    'Under $25,000': 'Under_25000S', '$25,000 to $49,999': '_25000_to_49999S',
    '$50,000 to $74,999': '_50000_to_74999S', '$75,000 to $99,999': '_75000_to_99999S',
    '$100,000 and over': 'Over_100000S'}
health16 = health16[list(finalNames)].rename(columns = finalNames)

**Cleaning Health17 Dataset**

In [0]:
# Clean the 2017 Health Insurance Characteristics extract: transpose it so each
# original column header becomes a row, split that header into County / State /
# PopCategory, convert the counts to integers and rename the columns.
health17 = spark.read.options(header = 'True').csv("/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2017.csv").toPandas()

# Dropping every row that contains a null value
health17 = health17.dropna()

# The stripped row labels become the column names once the table is transposed
rowLabels = [label.strip() for label in health17['Label (Grouping)']]

# Transposing so the states become the rows and the categories become the columns
health17 = health17.drop(columns = 'Label (Grouping)').transpose()
health17.columns = rowLabels

# Each original header is split on ',' (first piece -> County) and the second
# piece is split on '!!' (first piece -> State, second piece -> PopCategory).
placeParts = [header.split(',') for header in health17.index]
stateParts = [parts[1].split('!!') for parts in placeParts]
health17['County'] = [parts[0] for parts in placeParts]
health17['State'] = [parts[0] for parts in stateParts]
health17['PopCategory'] = [parts[1] for parts in stateParts]
health17 = health17.reset_index(drop = True)

# Cells holding exactly 'N' are turned into 0 so they are imputed further down
health17 = health17.replace('N', 0)

# 'Civilian noninstitutionalized population 19 to 64 years' was listed twice in
# the original list; one entry is enough to drop the label.
dropColumns = ['Under 19 years', '19 to 64 years', '65 years and older', 'Two or more races', 'White alone, not Hispanic or Latino', 'In family households', \
    'In married couple families', 'In other families', 'Male householder, no wife present', 'Female householder, no husband present', \
    'In non-family households and other living arrangements', 'With a disability', 'No disability', 'Civilian noninstitutionalized population 26 years and over', \
    'Civilian noninstitutionalized population 19 to 64 years', 'In labor force', 'Employed', 'Unemployed', 'Not in labor force', \
    'Worked full-time, year round in the past 12 months', \
    'Worked less than full-time, year round in the past 12 months', 'Did not work', 'Total household population', \
    'Civilian noninstitutionalized population for whom poverty status is determined', 'Below 138 percent of the poverty threshold', \
    '138 to 399 percent of the poverty threshold', 'At or above 400 percent of the poverty threshold', 'Below 100 percent of the poverty threshold']

health17 = health17.drop(columns = dropColumns)
health17 = health17.rename(columns = {'Civilian noninstitutionalized population':'Total Population', 'PopCategory':'Insurance Category'})

# Column order (the keys) and final column names (the values). Pairing them
# explicitly avoids relying on two long lists staying in the same order.
finalNames = {
    'State': 'State', 'County': 'County', 'Insurance Category': 'Insurance_Category',
    'Total Population': 'Total_Population', 'Under 6 years': 'Under_6Y', '6 to 18 years': '_6_to_18Y',
    '19 to 25 years': '_19_to_25Y', '26 to 34 years': '_26_to_34Y', '35 to 44 years': '_35_to_44Y',
    '45 to 54 years': '_45_to_54Y', '55 to 64 years': '_55_to_64Y', '65 to 74 years': '_65_to_74Y',
    '75 years and older': '_75_and_Older', 'Male': 'Male', 'Female': 'Female', 'White alone': 'White',
    'Black or African American alone': 'African_American',
    'American Indian and Alaska Native alone': 'American_Indian', 'Asian alone': 'Asian',
    'Native Hawaiian and Other Pacific Islander alone': 'Pacific_Islander',
    'Some other race alone': 'Some_Other_Race', 'Hispanic or Latino (of any race)': 'Hispanic',
    'Native born': 'Native_Born', 'Foreign born': 'Foreign_Born', 'Naturalized': 'Naturalized',
    'Not a citizen': 'Not_A_Citizen', 'Less than high school graduate': 'Less_Than_High_School',
    'High school graduate (includes equivalency)': 'High_School_or_Equivalent',
    "Some college or associate's degree": 'Some_College', "Bachelor's degree or higher": 'Bachelors_or_Higher',
    'Under $25,000': 'Under_25000S', '$25,000 to $49,999': '_25000_to_49999S',
    '$50,000 to $74,999': '_50000_to_74999S', '$75,000 to $99,999': '_75000_to_99999S',
    '$100,000 and over': 'Over_100000S'}
sortColumns = list(finalNames)

# .copy() makes the selection an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
health17 = health17[sortColumns].copy()

# Everything after the three identifier columns is a count stored as text with
# thousands separators. The 0 placeholders are not strings, so .str turns them
# into NaN; the fillna(0) below restores them.
columns = list(health17.columns)[3:]
for name in columns:
    health17[name] = pd.to_numeric(health17[name].str.replace(',', '', regex = False))

# Imputing the 0 values for the mean of the population
# NOTE(review): genuine zero counts and 'N' placeholders are both replaced by
# the column mean here — confirm that is intended.
health17 = health17.fillna(0)
imp = SimpleImputer(strategy = 'mean', missing_values = 0)

health17[columns] = imp.fit_transform(health17[columns])
health17[columns] = health17[columns].astype(int)

health17 = health17.rename(columns = finalNames)

health17['State'] = health17['State'].str.lstrip().str.replace(' ', '_', regex = False).replace('District_of_Columbia', 'DC')

**Cleaning Health18 Dataset**

In [0]:
# Clean the 2018 Health Insurance Characteristics extract: transpose it so each
# original column header becomes a row, split that header into County / State /
# PopCategory, convert the counts to integers and rename the columns.
health18 = spark.read.options(header = 'True').csv("/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics 2018.csv").toPandas()

# Dropping every row that contains a null value
health18 = health18.dropna()

# The stripped row labels become the column names once the table is transposed
rowLabels = [label.strip() for label in health18['Label (Grouping)']]

# Transposing so the states become the rows and the categories become the columns
health18 = health18.drop(columns = 'Label (Grouping)').transpose()
health18.columns = rowLabels

# Each original header is split on ',' (first piece -> County) and the second
# piece is split on '!!' (first piece -> State, second piece -> PopCategory).
placeParts = [header.split(',') for header in health18.index]
stateParts = [parts[1].split('!!') for parts in placeParts]
health18['County'] = [parts[0] for parts in placeParts]
health18['State'] = [parts[0] for parts in stateParts]
health18['PopCategory'] = [parts[1] for parts in stateParts]
health18 = health18.reset_index(drop = True)

# Cells holding exactly 'N' are turned into 0 so they are imputed further down
health18 = health18.replace('N', 0)

# 'Civilian noninstitutionalized population 19 to 64 years' was listed twice in
# the original list; one entry is enough to drop the label.
dropColumns = ['Under 19 years', '19 to 64 years', '65 years and older', 'Two or more races', 'White alone, not Hispanic or Latino', 'In family households', \
    'In married couple families', 'In other families', 'Male householder, no wife present', 'Female householder, no husband present', \
    'In non-family households and other living arrangements', 'With a disability', 'No disability', 'Civilian noninstitutionalized population 26 years and over', \
    'Civilian noninstitutionalized population 19 to 64 years', 'In labor force', 'Employed', 'Unemployed', 'Not in labor force', \
    'Worked full-time, year round in the past 12 months', \
    'Worked less than full-time, year round in the past 12 months', 'Did not work', 'Total household population', \
    'Civilian noninstitutionalized population for whom poverty status is determined', 'Below 138 percent of the poverty threshold', \
    '138 to 399 percent of the poverty threshold', 'At or above 400 percent of the poverty threshold', 'Below 100 percent of the poverty threshold']

health18 = health18.drop(columns = dropColumns)
health18 = health18.rename(columns = {'Civilian noninstitutionalized population':'Total Population', 'PopCategory':'Insurance Category'})

# Column order (the keys) and final column names (the values). Pairing them
# explicitly avoids relying on two long lists staying in the same order.
finalNames = {
    'State': 'State', 'County': 'County', 'Insurance Category': 'Insurance_Category',
    'Total Population': 'Total_Population', 'Under 6 years': 'Under_6Y', '6 to 18 years': '_6_to_18Y',
    '19 to 25 years': '_19_to_25Y', '26 to 34 years': '_26_to_34Y', '35 to 44 years': '_35_to_44Y',
    '45 to 54 years': '_45_to_54Y', '55 to 64 years': '_55_to_64Y', '65 to 74 years': '_65_to_74Y',
    '75 years and older': '_75_and_Older', 'Male': 'Male', 'Female': 'Female', 'White alone': 'White',
    'Black or African American alone': 'African_American',
    'American Indian and Alaska Native alone': 'American_Indian', 'Asian alone': 'Asian',
    'Native Hawaiian and Other Pacific Islander alone': 'Pacific_Islander',
    'Some other race alone': 'Some_Other_Race', 'Hispanic or Latino (of any race)': 'Hispanic',
    'Native born': 'Native_Born', 'Foreign born': 'Foreign_Born', 'Naturalized': 'Naturalized',
    'Not a citizen': 'Not_A_Citizen', 'Less than high school graduate': 'Less_Than_High_School',
    'High school graduate (includes equivalency)': 'High_School_or_Equivalent',
    "Some college or associate's degree": 'Some_College', "Bachelor's degree or higher": 'Bachelors_or_Higher',
    'Under $25,000': 'Under_25000S', '$25,000 to $49,999': '_25000_to_49999S',
    '$50,000 to $74,999': '_50000_to_74999S', '$75,000 to $99,999': '_75000_to_99999S',
    '$100,000 and over': 'Over_100000S'}
sortColumns = list(finalNames)

# .copy() makes the selection an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
health18 = health18[sortColumns].copy()

# Everything after the three identifier columns is a count stored as text with
# thousands separators. The 0 placeholders are not strings, so .str turns them
# into NaN; the fillna(0) below restores them.
columns = list(health18.columns)[3:]
for name in columns:
    health18[name] = pd.to_numeric(health18[name].str.replace(',', '', regex = False))

# Imputing the 0 values for the mean of the population
# NOTE(review): genuine zero counts and 'N' placeholders are both replaced by
# the column mean here — confirm that is intended.
health18 = health18.fillna(0)
imp = SimpleImputer(strategy = 'mean', missing_values = 0)

health18[columns] = imp.fit_transform(health18[columns])
health18[columns] = health18[columns].astype(int)

health18 = health18.rename(columns = finalNames)

health18['State'] = health18['State'].str.lstrip().str.replace(' ', '_', regex = False).replace('District_of_Columbia', 'DC')

**Cleaning Health19 Dataset**

In [0]:
# Clean the 2019 Health Insurance Characteristics extract: transpose it so each
# original column header becomes a row, split that header into County / State /
# PopCategory, convert the counts to integers and rename the columns.
health19 = spark.read.options(header = 'True').csv("/mnt/healthcare/modeldataIn/ModelData/Health Insurance Characteristics.csv").toPandas()

# Dropping every row that contains a null value. The original cell evaluated
# health19.isna().sum() before and after this step, but a bare expression in
# the middle of a cell is never displayed, so those two lines were removed.
health19 = health19.dropna()

# The stripped row labels become the column names once the table is transposed
rowLabels = [label.strip() for label in health19['Label (Grouping)']]

# Transposing so the states become the rows and the categories become the columns
health19 = health19.drop(columns = 'Label (Grouping)').transpose()
health19.columns = rowLabels

# Each original header is split on ',' (first piece -> County) and the second
# piece is split on '!!' (first piece -> State, second piece -> PopCategory).
placeParts = [header.split(',') for header in health19.index]
stateParts = [parts[1].split('!!') for parts in placeParts]
health19['County'] = [parts[0] for parts in placeParts]
health19['State'] = [parts[0] for parts in stateParts]
health19['PopCategory'] = [parts[1] for parts in stateParts]
health19 = health19.reset_index(drop = True)

# Cells holding exactly 'N' are turned into 0 so they are imputed further down
health19 = health19.replace('N', 0)

health19 = health19.drop(columns = ['Total household population', 'In non-family households and other living arrangements', \
    'In family households', 'In married couple families', 'In other families', 'Male reference person, no spouse present', \
    'Female reference person, no spouse present', 'With a disability', 'No disability', 'Civilian noninstitutionalized population 19 to 64 years', \
    'In labor force', 'Employed', 'Unemployed', 'Not in labor force', 'Worked full-time, year round in the past 12 months', \
    'Worked less than full-time, year round in the past 12 months', 'Did not work', 'Civilian noninstitutionalized population for whom poverty status is determined', \
    'Civilian noninstitutionalized population 26 years and over', 'Under 19 years', '19 to 64 years', '65 years and older', 'Two or more races', \
    'White alone, not Hispanic or Latino', 'Below 138 percent of the poverty threshold', '138 to 399 percent of the poverty threshold', \
    'At or above 400 percent of the poverty threshold', 'Below 100 percent of the poverty threshold'])

# The last three columns are County / State / PopCategory; everything before
# them is a count stored as text with thousands separators. The 0 placeholders
# are not strings, so .str turns them into NaN; the fillna(0) below restores them.
columns = list(health19.columns)[:-3]
for name in columns:
    health19[name] = pd.to_numeric(health19[name].str.replace(',', '', regex = False))

# Imputing the 0 values for the mean of the population
# NOTE(review): genuine zero counts and 'N' placeholders are both replaced by
# the column mean here — confirm that is intended.
health19 = health19.fillna(0)
imp = SimpleImputer(strategy = 'mean', missing_values = 0)

health19[columns] = imp.fit_transform(health19[columns])
health19[columns] = health19[columns].astype(int)

# Column order (the keys) and final column names (the values). Pairing them
# explicitly avoids relying on two long lists staying in the same order.
finalNames = {
    'State': 'State', 'County': 'County', 'PopCategory': 'Insurance_Category',
    'Civilian noninstitutionalized population': 'Total_Population', 'Under 6 years': 'Under_6Y',
    '6 to 18 years': '_6_to_18Y', '19 to 25 years': '_19_to_25Y', '26 to 34 years': '_26_to_34Y',
    '35 to 44 years': '_35_to_44Y', '45 to 54 years': '_45_to_54Y', '55 to 64 years': '_55_to_64Y',
    '65 to 74 years': '_65_to_74Y', '75 years and older': '_75_and_Older', 'Male': 'Male',
    'Female': 'Female', 'White alone': 'White', 'Black or African American alone': 'African_American',
    'American Indian and Alaska Native alone': 'American_Indian', 'Asian alone': 'Asian',
    'Native Hawaiian and Other Pacific Islander alone': 'Pacific_Islander',
    'Some other race alone': 'Some_Other_Race', 'Hispanic or Latino (of any race)': 'Hispanic',
    'Native born': 'Native_Born', 'Foreign born': 'Foreign_Born', 'Naturalized': 'Naturalized',
    'Not a citizen': 'Not_A_Citizen', 'Less than high school graduate': 'Less_Than_High_School',
    'High school graduate (includes equivalency)': 'High_School_or_Equivalent',
    "Some college or associate's degree": 'Some_College', "Bachelor's degree or higher": 'Bachelors_or_Higher',
    'Under $25,000': 'Under_25000S', '$25,000 to $49,999': '_25000_to_49999S',
    '$50,000 to $74,999': '_50000_to_74999S', '$75,000 to $99,999': '_75000_to_99999S',
    '$100,000 and over': 'Over_100000S'}
newColumns = list(finalNames)

# rename() returns a fresh frame, so the State assignment below does not write
# into a slice of the wider table.
health19 = health19[newColumns].rename(columns = finalNames)

health19['State'] = health19['State'].str.lstrip().str.replace(' ', '_', regex = False).replace('District_of_Columbia', 'DC')

In [0]:
# Add a constant 'Year' column to each cleaned frame, in place.
for frame, surveyYear in ((health15, 2015), (health16, 2016), (health17, 2017), (health18, 2018), (health19, 2019)):
    frame['Year'] = surveyYear

**ANOVA tests**

In [0]:
# ANOVA Test for 2015

# Purely alphabetic column names; each response name is interpolated into the
# model formula below (presumably the reason for the renaming).
anovaNames = {
    'Insurance_Category':'InsuranceCategory', 'Total_Population':'TotalPopulation', 'Under_6Y':'Undersixyears',
    '_6_to_18Y':'sixtoeighteenyears', '_19_to_25Y':'nineteentotwentyfiveyears', '_26_to_34Y':'twentysixtothirtyfouryears',
    '_35_to_44Y':'thirtyfivetofourtyfouryears', '_45_to_54Y':'fourtyfivetofiftyfouryears', '_55_to_64Y':'fiftyfivetosixtyfouryears',
    '_65_to_74Y':'sixtyfivetoseventyfouryears', '_75_and_Older':'seventyfiveandmoreyears',
    'African_American':'AfricanAmerican', 'American_Indian':'AmericanIndian',
    'Pacific_Islander':'PacificIslander', 'Some_Other_Race':'Someotherrace',
    'Native_Born':'Nativeborn', 'Foreign_Born':'Foreignborn', 'Not_A_Citizen':'Notacitizen',
    'Less_Than_High_School':'Lessthanhighschoolgraduate', 'High_School_or_Equivalent':'Highschoolgraduate',
    "Some_College":"Somecollege", "Bachelors_or_Higher":"Bachelors", 'Under_25000S':'Undertwentyfive',
    '_25000_to_49999S':'twentyfivetofourtyninesalary', '_50000_to_74999S':'fiftytoseventyfoursalary',
    '_75000_to_99999S':'seventyfivetoninetyninesalary', 'Over_100000S':'onehundredandoversalary'}
healthAnova = health15.rename(columns = anovaNames).copy()

# Skip the first four columns and the last one; every column in between is
# used as a response variable.
anovaColumns = list(healthAnova.columns)[4:-1]

# One two-way model (insurance category x state, sum-coded) per response
# column; only the p-value column of the type-2 ANOVA table is printed.
for response in anovaColumns:
    data15 = healthAnova[[response, 'InsuranceCategory', 'State']]
    formula = f'{response} ~ C(InsuranceCategory, Sum)*C(State, Sum)'
    health15_lm = ols(formula, data = data15).fit()

    table = sm.stats.anova_lm(health15_lm, typ = 2)
    print(response, table['PR(>F)'])

Undersixyears C(InsuranceCategory, Sum)                  7.693176e-45
C(State, Sum)                              1.981502e-17
C(InsuranceCategory, Sum):C(State, Sum)    6.103525e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
sixtoeighteenyears C(InsuranceCategory, Sum)                  3.065630e-52
C(State, Sum)                              2.928655e-20
C(InsuranceCategory, Sum):C(State, Sum)    7.466754e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
nineteentotwentyfiveyears C(InsuranceCategory, Sum)                  1.837883e-39
C(State, Sum)                              1.957823e-22
C(InsuranceCategory, Sum):C(State, Sum)    9.793527e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
twentysixtothirtyfouryears C(InsuranceCategory, Sum)                  5.806177e-45
C(State, Sum)                              5.464271e-20
C(InsuranceCategory, Sum):C(State, Sum)    

In [0]:
# ANOVA Test for 2016

# Work on a renamed copy of the 2016 frame; the source frame keeps its original column names.
healthAnova = health16.rename(columns = {
    'Insurance_Category':'InsuranceCategory',
    'Total_Population':'TotalPopulation',
    # age brackets
    'Under_6Y':'Undersixyears',
    '_6_to_18Y':'sixtoeighteenyears',
    '_19_to_25Y':'nineteentotwentyfiveyears',
    '_26_to_34Y':'twentysixtothirtyfouryears',
    '_35_to_44Y':'thirtyfivetofourtyfouryears',
    '_45_to_54Y':'fourtyfivetofiftyfouryears',
    '_55_to_64Y':'fiftyfivetosixtyfouryears',
    '_65_to_74Y':'sixtyfivetoseventyfouryears',
    '_75_and_Older':'seventyfiveandmoreyears',
    # race
    'African_American':'AfricanAmerican',
    'American_Indian':'AmericanIndian',
    'Pacific_Islander':'PacificIslander',
    'Some_Other_Race':'Someotherrace',
    # nativity / citizenship
    'Native_Born':'Nativeborn',
    'Foreign_Born':'Foreignborn',
    'Not_A_Citizen':'Notacitizen',
    # education
    'Less_Than_High_School':'Lessthanhighschoolgraduate',
    'High_School_or_Equivalent':'Highschoolgraduate',
    "Some_College":"Somecollege",
    "Bachelors_or_Higher":"Bachelors",
    # income brackets
    'Under_25000S':'Undertwentyfive',
    '_25000_to_49999S':'twentyfivetofourtyninesalary',
    '_50000_to_74999S':'fiftytoseventyfoursalary',
    '_75000_to_99999S':'seventyfivetoninetyninesalary',
    'Over_100000S':'onehundredandoversalary',
}).copy()

# Response columns to test: every column except the first four and the last one.
anovaColumns = list(healthAnova.columns)[4:-1]

# Fit one OLS model per response column (insurance category x state, sum-coded,
# with interaction) and print the Type II ANOVA p-values for each term.
for i in anovaColumns:
    data16 = healthAnova[[i, 'InsuranceCategory', 'State']]
    anovaFormula = f'{i} ~ C(InsuranceCategory, Sum)*C(State, Sum)'
    health16_lm = ols(anovaFormula, data = data16).fit()

    table = sm.stats.anova_lm(health16_lm, typ = 2)
    print(i, table['PR(>F)'])

Undersixyears C(InsuranceCategory, Sum)                  2.355100e-45
C(State, Sum)                              2.289768e-18
C(InsuranceCategory, Sum):C(State, Sum)    6.617319e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
sixtoeighteenyears C(InsuranceCategory, Sum)                  1.280698e-52
C(State, Sum)                              2.555523e-20
C(InsuranceCategory, Sum):C(State, Sum)    7.181955e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
nineteentotwentyfiveyears C(InsuranceCategory, Sum)                  1.266526e-41
C(State, Sum)                              1.215429e-22
C(InsuranceCategory, Sum):C(State, Sum)    9.386632e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
twentysixtothirtyfouryears C(InsuranceCategory, Sum)                  1.006376e-45
C(State, Sum)                              7.125087e-20
C(InsuranceCategory, Sum):C(State, Sum)    

In [0]:
# ANOVA Test for 2017

# Work on a renamed copy of the 2017 frame; the source frame keeps its original column names.
healthAnova = health17.rename(columns = {
    'Insurance_Category':'InsuranceCategory',
    'Total_Population':'TotalPopulation',
    # age brackets
    'Under_6Y':'Undersixyears',
    '_6_to_18Y':'sixtoeighteenyears',
    '_19_to_25Y':'nineteentotwentyfiveyears',
    '_26_to_34Y':'twentysixtothirtyfouryears',
    '_35_to_44Y':'thirtyfivetofourtyfouryears',
    '_45_to_54Y':'fourtyfivetofiftyfouryears',
    '_55_to_64Y':'fiftyfivetosixtyfouryears',
    '_65_to_74Y':'sixtyfivetoseventyfouryears',
    '_75_and_Older':'seventyfiveandmoreyears',
    # race
    'African_American':'AfricanAmerican',
    'American_Indian':'AmericanIndian',
    'Pacific_Islander':'PacificIslander',
    'Some_Other_Race':'Someotherrace',
    # nativity / citizenship
    'Native_Born':'Nativeborn',
    'Foreign_Born':'Foreignborn',
    'Not_A_Citizen':'Notacitizen',
    # education
    'Less_Than_High_School':'Lessthanhighschoolgraduate',
    'High_School_or_Equivalent':'Highschoolgraduate',
    "Some_College":"Somecollege",
    "Bachelors_or_Higher":"Bachelors",
    # income brackets
    'Under_25000S':'Undertwentyfive',
    '_25000_to_49999S':'twentyfivetofourtyninesalary',
    '_50000_to_74999S':'fiftytoseventyfoursalary',
    '_75000_to_99999S':'seventyfivetoninetyninesalary',
    'Over_100000S':'onehundredandoversalary',
}).copy()

# Response columns to test: every column except the first four and the last one.
anovaColumns = list(healthAnova.columns)[4:-1]

# Fit one OLS model per response column (insurance category x state, sum-coded,
# with interaction) and print the Type II ANOVA p-values for each term.
for i in anovaColumns:
    data17 = healthAnova[[i, 'InsuranceCategory', 'State']]
    anovaFormula = f'{i} ~ C(InsuranceCategory, Sum)*C(State, Sum)'
    health17_lm = ols(anovaFormula, data = data17).fit()

    table = sm.stats.anova_lm(health17_lm, typ = 2)
    print(i, table['PR(>F)'])

Undersixyears C(InsuranceCategory, Sum)                  2.950097e-45
C(State, Sum)                              1.831959e-18
C(InsuranceCategory, Sum):C(State, Sum)    6.284932e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
sixtoeighteenyears C(InsuranceCategory, Sum)                  5.903963e-54
C(State, Sum)                              9.120012e-22
C(InsuranceCategory, Sum):C(State, Sum)    6.920769e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
nineteentotwentyfiveyears C(InsuranceCategory, Sum)                  5.006703e-42
C(State, Sum)                              1.627442e-22
C(InsuranceCategory, Sum):C(State, Sum)    9.309926e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
twentysixtothirtyfouryears C(InsuranceCategory, Sum)                  4.311536e-33
C(State, Sum)                              6.505066e-20
C(InsuranceCategory, Sum):C(State, Sum)    

In [0]:
# ANOVA Test for 2018

# Work on a renamed copy of the 2018 frame; the source frame keeps its original column names.
healthAnova = health18.rename(columns = {
    'Insurance_Category':'InsuranceCategory',
    'Total_Population':'TotalPopulation',
    # age brackets
    'Under_6Y':'Undersixyears',
    '_6_to_18Y':'sixtoeighteenyears',
    '_19_to_25Y':'nineteentotwentyfiveyears',
    '_26_to_34Y':'twentysixtothirtyfouryears',
    '_35_to_44Y':'thirtyfivetofourtyfouryears',
    '_45_to_54Y':'fourtyfivetofiftyfouryears',
    '_55_to_64Y':'fiftyfivetosixtyfouryears',
    '_65_to_74Y':'sixtyfivetoseventyfouryears',
    '_75_and_Older':'seventyfiveandmoreyears',
    # race
    'African_American':'AfricanAmerican',
    'American_Indian':'AmericanIndian',
    'Pacific_Islander':'PacificIslander',
    'Some_Other_Race':'Someotherrace',
    # nativity / citizenship
    'Native_Born':'Nativeborn',
    'Foreign_Born':'Foreignborn',
    'Not_A_Citizen':'Notacitizen',
    # education
    'Less_Than_High_School':'Lessthanhighschoolgraduate',
    'High_School_or_Equivalent':'Highschoolgraduate',
    "Some_College":"Somecollege",
    "Bachelors_or_Higher":"Bachelors",
    # income brackets
    'Under_25000S':'Undertwentyfive',
    '_25000_to_49999S':'twentyfivetofourtyninesalary',
    '_50000_to_74999S':'fiftytoseventyfoursalary',
    '_75000_to_99999S':'seventyfivetoninetyninesalary',
    'Over_100000S':'onehundredandoversalary',
}).copy()

# Response columns to test: every column except the first four and the last one.
anovaColumns = list(healthAnova.columns)[4:-1]

# Fit one OLS model per response column (insurance category x state, sum-coded,
# with interaction) and print the Type II ANOVA p-values for each term.
for i in anovaColumns:
    data18 = healthAnova[[i, 'InsuranceCategory', 'State']]
    anovaFormula = f'{i} ~ C(InsuranceCategory, Sum)*C(State, Sum)'
    health18_lm = ols(anovaFormula, data = data18).fit()

    table = sm.stats.anova_lm(health18_lm, typ = 2)
    print(i, table['PR(>F)'])

Undersixyears C(InsuranceCategory, Sum)                  3.771909e-46
C(State, Sum)                              4.920595e-19
C(InsuranceCategory, Sum):C(State, Sum)    6.456554e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
sixtoeighteenyears C(InsuranceCategory, Sum)                  2.262457e-54
C(State, Sum)                              4.457221e-22
C(InsuranceCategory, Sum):C(State, Sum)    6.537141e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
nineteentotwentyfiveyears C(InsuranceCategory, Sum)                  1.441693e-42
C(State, Sum)                              1.743460e-22
C(InsuranceCategory, Sum):C(State, Sum)    9.397849e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
twentysixtothirtyfouryears C(InsuranceCategory, Sum)                  2.523872e-33
C(State, Sum)                              3.298694e-20
C(InsuranceCategory, Sum):C(State, Sum)    

In [0]:
# ANOVA Test for 2019

# Work on a renamed copy of the 2019 frame; the source frame keeps its original column names.
healthAnova = health19.rename(columns = {
    'Insurance_Category':'InsuranceCategory',
    'Total_Population':'TotalPopulation',
    # age brackets
    'Under_6Y':'Undersixyears',
    '_6_to_18Y':'sixtoeighteenyears',
    '_19_to_25Y':'nineteentotwentyfiveyears',
    '_26_to_34Y':'twentysixtothirtyfouryears',
    '_35_to_44Y':'thirtyfivetofourtyfouryears',
    '_45_to_54Y':'fourtyfivetofiftyfouryears',
    '_55_to_64Y':'fiftyfivetosixtyfouryears',
    '_65_to_74Y':'sixtyfivetoseventyfouryears',
    '_75_and_Older':'seventyfiveandmoreyears',
    # race
    'African_American':'AfricanAmerican',
    'American_Indian':'AmericanIndian',
    'Pacific_Islander':'PacificIslander',
    'Some_Other_Race':'Someotherrace',
    # nativity / citizenship
    'Native_Born':'Nativeborn',
    'Foreign_Born':'Foreignborn',
    'Not_A_Citizen':'Notacitizen',
    # education
    'Less_Than_High_School':'Lessthanhighschoolgraduate',
    'High_School_or_Equivalent':'Highschoolgraduate',
    "Some_College":"Somecollege",
    "Bachelors_or_Higher":"Bachelors",
    # income brackets
    'Under_25000S':'Undertwentyfive',
    '_25000_to_49999S':'twentyfivetofourtyninesalary',
    '_50000_to_74999S':'fiftytoseventyfoursalary',
    '_75000_to_99999S':'seventyfivetoninetyninesalary',
    'Over_100000S':'onehundredandoversalary',
}).copy()

# Response columns to test: every column except the first four and the last one.
anovaColumns = list(healthAnova.columns)[4:-1]

# Fit one OLS model per response column (insurance category x state, sum-coded,
# with interaction) and print the Type II ANOVA p-values for each term.
for i in anovaColumns:
    data19 = healthAnova[[i, 'InsuranceCategory', 'State']]
    anovaFormula = f'{i} ~ C(InsuranceCategory, Sum)*C(State, Sum)'
    health19_lm = ols(anovaFormula, data = data19).fit()

    table = sm.stats.anova_lm(health19_lm, typ = 2)
    print(i, table['PR(>F)'])

Undersixyears C(InsuranceCategory, Sum)                  9.978424e-47
C(State, Sum)                              2.082104e-18
C(InsuranceCategory, Sum):C(State, Sum)    6.692977e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
sixtoeighteenyears C(InsuranceCategory, Sum)                  2.674696e-54
C(State, Sum)                              8.826126e-22
C(InsuranceCategory, Sum):C(State, Sum)    7.429791e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
nineteentotwentyfiveyears C(InsuranceCategory, Sum)                  1.013831e-42
C(State, Sum)                              3.777651e-22
C(InsuranceCategory, Sum):C(State, Sum)    9.654195e-01
Residual                                            NaN
Name: PR(>F), dtype: float64
twentysixtothirtyfouryears C(InsuranceCategory, Sum)                  3.696843e-33
C(State, Sum)                              1.041934e-19
C(InsuranceCategory, Sum):C(State, Sum)    

**Joining all datasets together for the model**

In [0]:
# Stack the five yearly frames row-wise into a single modelling frame.
yearlyFrames = [health15, health16, health17, health18, health19]
health_model = pd.concat(yearlyFrames, axis = 0).copy()

# Columns to impute: everything except the first three and the last two columns.
columns = health_model.columns.tolist()[3:-2]

# Missing values are first set to 0, then every 0 in the selected columns is
# replaced by that column's mean.
# NOTE(review): this also overwrites values that were genuinely 0 in the source
# data, and the mean is computed while the 'Total' rows are still present —
# confirm both are intended.
health_model = health_model.fillna(0)
imp = SimpleImputer(missing_values = 0, strategy = 'mean')

imputedValues = imp.fit_transform(health_model[columns])
health_model[columns] = imputedValues
# Cast back to integers (astype(int) truncates the fractional part of the imputed means).
health_model[columns] = health_model[columns].astype(int)

# Keep only the per-category rows; rows labelled 'Total' are dropped.
health_model = health_model[health_model['Insurance_Category'].ne('Total')]

**Cleaning testing data to match training data**

In [0]:
# Load the cleaned test demographics, ordered by state and county, into pandas.
testDemo = (
    spark.read.options(header = 'True')
    .csv("/mnt/healthcare/modeldataIn/CleanedData/TestDemographics.csv")
    .sort('State', 'County')
    .toPandas()
)

# Making sure the test data only contains counties that also appear in the model data.
# The training county set is built once instead of being re-sorted and rebuilt
# on every loop iteration.
# NOTE(review): matching is on the County value alone; if County does not embed
# the state, same-named counties in different states are treated as one — confirm.
trainCounties = set(health_model['County'].unique())
counties = [c for c in testDemo['County'].sort_values().unique() if c not in trainCounties]

# Drop every test row whose county has no counterpart in the model data.
indeces = list(testDemo[testDemo.County.isin(counties)].index)
testDemo.drop(index = indeces, inplace = True)

**Pushing to SQL**

In [0]:
# Connection settings for the SQL Server database that receives the model tables.
database = "Healthcare-Capstone"
server = "gen10-data-fundamentals-22-07-sql-server.database.windows.net"

k_user = "HealthKylee"
# SECURITY: the password used to be hard-coded here. It is now read from a
# Databricks secret scope so it is never stored in the notebook source or history.
# NOTE(review): the scope/key names below are placeholders — create the secret
# (or point these at the existing one) and rotate the previously exposed password.
k_password = dbutils.secrets.get(scope = "healthcare-capstone", key = "sql-password")

In [0]:
# Convert the pandas frames to Spark DataFrames so they can be written through JDBC.
# NOTE(review): the names are rebound from pandas to Spark objects and the writes
# use append mode, so re-running this cell is not idempotent — confirm that is acceptable.
health_model = spark.createDataFrame(health_model)
testDemo = spark.createDataFrame(testDemo)

jdbcUrl = f"jdbc:sqlserver://{server}:1433;databaseName={database};"

# Write each frame, sorted by state and county, to its own table; rows are appended.
for table, sparkFrame in [('dbo.HealthModel', health_model), ("dbo.TestDemo", testDemo)]:
    jdbcWriter = sparkFrame.sort('State', 'County').write.format('jdbc')
    jdbcWriter = jdbcWriter.option("url", jdbcUrl).mode("append")
    jdbcWriter = jdbcWriter.option("dbtable", table)
    jdbcWriter = jdbcWriter.option("user", k_user).option("password", k_password)
    jdbcWriter = jdbcWriter.option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    jdbcWriter.save()