# Inferential Statistics

In [1]:
# package imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# switch matplotlib to its built-in 'ggplot' style sheet for the plots in this notebook
plt.style.use('ggplot')

## Import clean DataFrames from 2.0-jkg-data-wrangling

In [2]:
# Load the three interim DataFrames that were saved as pickle files.
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that come from a trusted source (here: this project's own interim data).
df_clean, df_patients, df_appointments = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/interim/clean_df.pickle',
        '../data/interim/patients_df.pickle',
        '../data/interim/appointments_df.pickle',
    )
)

## Means

In [3]:
# Compare the mean age of male and female patients.
# df_clean can hold several rows per Patient_ID (one per appointment) with
# differing Age values, so each patient contributes a single age: the
# minimum Age found across their rows.

# split the rows by gender
df_male = df_clean.loc[df_clean['Gender'] == 'M']
df_female = df_clean.loc[df_clean['Gender'] == 'F']

# one age per patient (minimum across that patient's rows), as numpy arrays
ages_male = df_male.groupby('Patient_ID')['Age'].min().values
ages_female = df_female.groupby('Patient_ID')['Age'].min().values

# average the per-patient ages within each gender
mean_age_male = ages_male.mean()
mean_age_female = ages_female.mean()

# report both means, rounded to one decimal place
print(
    'The mean patient age is', round(mean_age_male, 1),
    'years for males and', round(mean_age_female, 1),
    'years for females.',
)

The mean patient age is 33.3 years for males and 38.6 years for females.


## Pearson Correlation Coefficient

### Hypertension and Diabetes

In [4]:
# Correlation between the Hypertension and Diabetes columns of the
# patients dataframe.

# pull both columns out as numpy arrays
patients_hypertension = df_patients['Hypertension'].values
patients_diabetes = df_patients['Diabetes'].values

# np.corrcoef returns the 2x2 correlation matrix of the two arrays;
# the off-diagonal element [0, 1] is the Pearson r between them
r_hypertension_diabetes = np.corrcoef(
    patients_hypertension,
    patients_diabetes,
)[0, 1]

# report r to four decimal places
print(
    'The Pearson Correlation Coefficient between hypertension and diabetes is r =',
    round(r_hypertension_diabetes, 4),
)

# follow with a cross-tab of the same two columns; normalize=True shows each
# cell as a fraction of all rows and margins=True adds the 'All' totals
print(
    '\nCrosstab:\n',
    pd.crosstab(
        df_patients['Hypertension'],
        df_patients['Diabetes'],
        margins=True,
        normalize=True,
    ),
    sep='\n',
)

The Pearson Correlation Coefficient between hypertension and diabetes is r = 0.4274

Crosstab:

Diabetes             0         1       All
Hypertension                              
0             0.790125  0.013371  0.803496
1             0.138991  0.057513  0.196504
All           0.929116  0.070884  1.000000


### SMS Sent and No-show

In [5]:
# Correlation between SMS_sent and No_show in the appointments dataframe.

# SMS_sent is used as-is; No_show is recoded from 'Yes'/'No' to 1/0 so that
# a correlation can be computed on it
appt_sms = df_appointments['SMS_sent'].values
appt_noshow = df_appointments['No_show'].map({'Yes': 1, 'No': 0}).values

# np.corrcoef returns the 2x2 correlation matrix of the two arrays;
# the off-diagonal element [0, 1] is the Pearson r between them
r_sms_noshow = np.corrcoef(
    appt_sms,
    appt_noshow,
)[0, 1]

# report r to four decimal places
print(
    'The Pearson Correlation Coefficient between SMS_sent and No_show is r =',
    round(r_sms_noshow, 4),
)

# follow with a cross-tab of the same two columns; normalize=True shows each
# cell as a fraction of all rows and margins=True adds the 'All' totals
print(
    '\nCrosstab:\n',
    pd.crosstab(
        df_appointments['SMS_sent'],
        df_appointments['No_show'],
        margins=True,
        normalize=True,
    ),
    sep='\n',
)

The Pearson Correlation Coefficient between SMS_sent and No_show is r = 0.1264

Crosstab:

No_show         No       Yes       All
SMS_sent                              
0         0.565563  0.113411  0.678974
1         0.232504  0.088521  0.321026
All       0.798067  0.201933  1.000000


### Gender and No-show

In [6]:
# Correlation between Gender and No_show in the appointments dataframe.

# recode both columns as 0/1 numpy arrays:
# Gender 'M' -> 1, 'F' -> 0 and No_show 'Yes' -> 1, 'No' -> 0
appt_gender = df_appointments['Gender'].map({'M': 1, 'F': 0}).values
appt_noshow = df_appointments['No_show'].map({'Yes': 1, 'No': 0}).values

# np.corrcoef returns the 2x2 correlation matrix of the two arrays;
# the off-diagonal element [0, 1] is the Pearson r between Gender and No_show
r_gender_noshow = np.corrcoef(
    appt_gender,
    appt_noshow,
)[0, 1]

# report r to four decimal places
print(
    'The Pearson Correlation Coefficient between Gender and No_show is r =',
    round(r_gender_noshow, 4),
)

# follow with a cross-tab of the same two columns; normalize=True shows each
# cell as a fraction of all rows and margins=True adds the 'All' totals
print(
    '\nCrosstab:\n',
    pd.crosstab(
        df_appointments['Gender'],
        df_appointments['No_show'],
        margins=True,
        normalize=True,
    ),
    sep='\n',
)

The Pearson Correlation Coefficient between Gender and No_show is r = -0.0041

Crosstab:

No_show        No       Yes       All
Gender                               
F        0.517937  0.132040  0.649977
M        0.280131  0.069892  0.350023
All      0.798067  0.201933  1.000000
