In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
sns.set()

import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
# Inject a small stylesheet that centers notebook cell outputs.
# A triple-quoted string is used instead of a raw string with trailing
# backslashes: inside r"..." a backslash followed by a newline is kept as
# those two literal characters, which leaves stray "\" characters in the CSS.
display(HTML("""<style>
.output {
    display: flex;
    align-items: center;
    text-align: center;
}
</style>"""))

## Inferential Statistics

In [9]:
# Load the zip-code level data and drop the "Unnamed: 0" column
# (presumably an index column written out by an earlier export -- confirm).
# The drop is chained rather than done in place, so `heart_prev` is bound
# exactly once to its final shape.
heart_prev = pd.read_csv("heart_disease.csv").drop("Unnamed: 0", axis=1)
display(heart_prev.head())

# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
heart_prev.info()

Unnamed: 0,Zip Code,Smoking Prevalence,Hypertension Prevalence,Obesity Prevalence,Sedentarism Prevalence,Cholesterol Prevalence,Diabetes Prevalence,Heart Disease Prevalence,Restaurant Count,Population,Population Group,Restaurant Group,median_household_income,Income Group
0,1104,24.269272,33.257045,37.437441,37.721353,38.39299,14.837576,8.816472,5.0,6726,<10000,0-7,32273.0,<50000
1,1105,28.596151,32.879892,42.176132,43.694299,37.409454,15.840128,7.849705,4.0,6075,<10000,0-7,18402.0,<50000
2,1108,23.844497,30.236068,35.381122,34.251449,35.500119,12.581968,6.943035,2.0,7038,<10000,0-7,34064.0,<50000
3,1109,23.329629,32.28536,37.212078,33.779183,34.154532,13.474276,6.706302,2.0,7577,<10000,0-7,33376.0,<50000
4,1119,21.656562,30.13657,33.192432,29.277953,34.842082,11.433115,6.894488,2.0,5361,<10000,0-7,46055.0,<50000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3659 entries, 0 to 3658
Data columns (total 14 columns):
Zip Code                    3659 non-null int64
Smoking Prevalence          3659 non-null float64
Hypertension Prevalence     3659 non-null float64
Obesity Prevalence          3659 non-null float64
Sedentarism Prevalence      3659 non-null float64
Cholesterol Prevalence      3659 non-null float64
Diabetes Prevalence         3659 non-null float64
Heart Disease Prevalence    3659 non-null float64
Restaurant Count            3659 non-null float64
Population                  3659 non-null int64
Population Group            3659 non-null object
Restaurant Group            3659 non-null object
median_household_income     3659 non-null float64
Income Group                3659 non-null object
dtypes: float64(9), int64(2), object(3)
memory usage: 400.3+ KB


None

### Income groups and heart disease

In [10]:
# Mean heart disease prevalence per income group.
# A descriptive name is used instead of `_`, which IPython uses for the most
# recent cell output; assigning to it shadows that convenience variable.
income_group_means = (
    heart_prev.groupby("Income Group")["Heart Disease Prevalence"]
    .mean()
    .reset_index()
)
display(income_group_means)

Unnamed: 0,Income Group,Heart Disease Prevalence
0,<50000,6.478394
1,>=50000,4.992856


We saw that the mean prevalence of heart disease is higher in zip codes with a median household income below 50000.

#### Is the observed difference in the samples due to chance?

*  __Null Hypothesis__: There is no difference in the mean heart disease prevalence between groups.
*  __Alternative Hypothesis__: The mean heart disease prevalence is higher when median household income is <50000.

alpha = 0.05

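To test the hypotheses, we shift both groups so that they share the overall mean (simulating the null hypothesis), draw bootstrap samples from each shifted group, and compare the bootstrap differences in means to the observed difference in means.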

In [11]:
# Functions to draw bootstrap replicates
def bootstrap_replicate_1d(data, func, rng=None):
    """Compute one bootstrap replicate of a statistic from 1-D data.

    Parameters
    ----------
    data : array-like
        One-dimensional observations to resample.
    func : callable
        Statistic applied to the resampled array (e.g. ``np.mean``).
    rng : numpy.random.RandomState or numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, exactly as before this parameter was added.

    Returns
    -------
    The value of ``func`` evaluated on a sample of ``len(data)`` items drawn
    from ``data`` with replacement.
    """
    # Both the ``np.random`` module and RNG objects expose ``choice``.
    sampler = np.random if rng is None else rng
    return func(sampler.choice(data, size=len(data)))


def draw_bs_reps(data, func, size=1, rng=None):
    """Draw ``size`` bootstrap replicates of ``func`` applied to ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional observations to resample.
    func : callable
        Statistic computed on every resample; expected to return a scalar.
    size : int, default 1
        Number of replicates to generate.
    rng : numpy.random.RandomState or numpy.random.Generator, optional
        Source of randomness forwarded to ``bootstrap_replicate_1d``. Pass a
        seeded instance to make the replicates reproducible.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size,)`` holding one replicate per entry.
    """
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func, rng=rng)
    return bs_replicates

# Seed NumPy's global random state so the bootstrap replicates, and therefore
# the reported p-value, are reproducible when the notebook is re-run
np.random.seed(42)

# Compute the mean of heart disease prevalence
mean_prev = np.mean(heart_prev["Heart Disease Prevalence"])

# Separate heart disease prevalence by low and high income
# (.loc selects rows and column in one step instead of chained indexing)
low_income_prev = heart_prev.loc[heart_prev["Income Group"] == "<50000", "Heart Disease Prevalence"]
high_income_prev = heart_prev.loc[heart_prev["Income Group"] == ">=50000", "Heart Disease Prevalence"]

# Calculate the observed difference in means
empirical_diff_means = np.mean(low_income_prev) - np.mean(high_income_prev)

# Generate shifted arrays: both groups are re-centered on the overall mean,
# so the resampled data satisfy the null hypothesis of equal means
low_income_shifted = low_income_prev - np.mean(low_income_prev) + mean_prev
high_income_shifted = high_income_prev - np.mean(high_income_prev) + mean_prev

# Compute 10,000 bootstrap replicates from shifted arrays
bs_replicates_l = draw_bs_reps(low_income_shifted, np.mean, size=10000)
bs_replicates_h = draw_bs_reps(high_income_shifted, np.mean, size=10000)

# Get replicates of difference of means
bs_replicates = bs_replicates_l - bs_replicates_h

# Compute and print p-value: the fraction of replicates whose difference is
# at least as large as the observed one (one-sided test)
p = np.sum(bs_replicates >= empirical_diff_means) / len(bs_replicates)
print('p-value =', p)


p-value = 0.0


A p-value of 0.0 means that none of the 10,000 bootstrap replicates produced a difference in means at least as large as the observed one (p < 0.0001). Since this is below alpha = 0.05, we reject the null hypothesis in favor of the alternative: the mean heart disease prevalence is higher in zip codes where the median household income is below 50000.

### Population groups and heart disease

In [12]:
# Mean heart disease prevalence per population group.
# A descriptive name is used instead of `_`, which IPython uses for the most
# recent cell output; assigning to it shadows that convenience variable.
population_group_means = (
    heart_prev.groupby("Population Group")["Heart Disease Prevalence"]
    .mean()
    .reset_index()
)
display(population_group_means)

Unnamed: 0,Population Group,Heart Disease Prevalence
0,<10000,5.791016
1,>=10000,4.848598


We saw that the mean prevalence of heart disease is higher in zip codes with a population below 10000.

#### Is the observed difference in the samples due to chance?

*  __Null Hypothesis__: There is no difference in the mean heart disease prevalence between groups.
*  __Alternative Hypothesis__: The mean heart disease prevalence is higher when population <10000.

alpha = 0.05

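To test the hypotheses, we shift both groups so that they share the overall mean (simulating the null hypothesis), draw bootstrap samples from each shifted group, and compare the bootstrap differences in means to the observed difference in means.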

In [13]:
# NOTE: bootstrap_replicate_1d() and draw_bs_reps() are defined once, in the
# income-group test cell above, and reused here. Redefining identical copies
# in this cell would only shadow the earlier definitions.

# Seed NumPy's global random state so the bootstrap replicates, and therefore
# the reported p-value, are reproducible when the notebook is re-run
np.random.seed(42)

# Compute the mean of heart disease prevalence
mean_prev = np.mean(heart_prev["Heart Disease Prevalence"])

# Separate heart disease prevalence by population
# (.loc selects rows and column in one step instead of chained indexing)
low_pop_prev = heart_prev.loc[heart_prev["Population Group"] == "<10000", "Heart Disease Prevalence"]
high_pop_prev = heart_prev.loc[heart_prev["Population Group"] == ">=10000", "Heart Disease Prevalence"]

# Calculate the observed difference in means
empirical_diff_means = np.mean(low_pop_prev) - np.mean(high_pop_prev)

# Generate shifted arrays: both groups are re-centered on the overall mean,
# so the resampled data satisfy the null hypothesis of equal means
low_pop_shifted = low_pop_prev - np.mean(low_pop_prev) + mean_prev
high_pop_shifted = high_pop_prev - np.mean(high_pop_prev) + mean_prev

# Compute 10,000 bootstrap replicates from shifted arrays
bs_replicates_l = draw_bs_reps(low_pop_shifted, np.mean, size=10000)
bs_replicates_h = draw_bs_reps(high_pop_shifted, np.mean, size=10000)

# Get replicates of difference of means
bs_replicates = bs_replicates_l - bs_replicates_h

# Compute and print p-value: the fraction of replicates whose difference is
# at least as large as the observed one (one-sided test)
p = np.sum(bs_replicates >= empirical_diff_means) / len(bs_replicates)
print('p-value =', p)


p-value = 0.0


A p-value of 0.0 means that none of the 10,000 bootstrap replicates produced a difference in means at least as large as the observed one (p < 0.0001). Since this is below alpha = 0.05, we reject the null hypothesis in favor of the alternative: the mean heart disease prevalence is higher in zip codes where the population is below 10000.

#### Summary

The following features have moderate to strong correlations with heart disease prevalence and will be useful for our model:

1. High cholesterol prevalence
2. Hypertension prevalence
3. Diabetes prevalence
4. Sedentarism prevalence
5. Obesity prevalence
6. Smoking prevalence
7. Median household income
8. Population

Restaurant count per zip code may not be helpful for our model. It has a weak correlation with heart disease prevalence in populations >= 10000.