In [2]:
import pandas as pd
import scipy.stats as stats

# Read the diabetes dataset CSV (relative path) into the working DataFrame.
csv_path = "../data/Diabetes_US_Hospitals_dataset/diabetic_data.csv"
df = pd.read_csv(csv_path)

# Leave the first rows as the cell's last expression so they render as output.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


Univariate Test for Normality Application


In [8]:
# One-Sample T-Test (Testing average hospital stay against 5 days)
# H0: the mean of `time_in_hospital` equals HYPOTHESIZED_MEAN_STAY.
HYPOTHESIZED_MEAN_STAY = 5

# Build the sample once so the reported mean and the test use the same rows.
stay_sample = df["time_in_hospital"].dropna()
stay_mean = stay_sample.mean()

t_stat, p_value = stats.ttest_1samp(stay_sample, HYPOTHESIZED_MEAN_STAY)

# Show the observed mean and sample size next to the statistic: the sign of t
# is only interpretable relative to the observed mean.
print(
    f"Sample mean stay: {stay_mean:.3f} (n={len(stay_sample)}) "
    f"vs hypothesized mean {HYPOTHESIZED_MEAN_STAY}"
)
# NOTE: p is printed unformatted, so an extremely small p-value shows as 0.0.
print(f"One-Sample T-Test: t={t_stat}, p={p_value}")

One-Sample T-Test: t=-64.54873206547254, p=0.0


In [12]:
# Independent Sample T-Test (Comparing medication counts between genders)
# Boolean masks selecting the rows for each of the two gender labels compared.
is_male = df["gender"].eq("Male")
is_female = df["gender"].eq("Female")

# Medication counts per group, with missing counts removed.
male_meds = df.loc[is_male, "num_medications"].dropna()
female_meds = df.loc[is_female, "num_medications"].dropna()

# equal_var=False: the test does not assume equal variances in the two groups.
welch_result = stats.ttest_ind(male_meds, female_meds, equal_var=False)
ind_stat, ind_p = welch_result
print(f"Independent Sample T-Test: t={ind_stat}, p={ind_p}")

Independent Sample T-Test: t=-6.99456730357579, p=2.6778773925414267e-12


In [5]:
# Mapping categorical glucose values to numeric values.
# The numbers are coarse stand-ins for each category, not measured values.
glucose_mapping = {
    "None": None,  # No measurement available -> treated as missing
    "Norm": 100,   # Normal glucose level (approximate)
    ">200": 200,   # High glucose level
    ">300": 300    # Very high glucose level
}

a1c_mapping = {
    "None": None,  # No measurement available -> treated as missing
    "Norm": 5.5,   # Normal A1C level
    ">7": 7.5,     # Elevated A1C
    ">8": 8.5      # Very high A1C
}

# Convert categorical glucose and A1C values to numerical.
# Any category that is not a key of the mapping becomes NaN via .map().
df["max_glu_serum_numeric"] = df["max_glu_serum"].map(glucose_mapping)
df["A1Cresult_numeric"] = df["A1Cresult"].map(a1c_mapping)

# Keep only rows where BOTH measurements are available; a paired test needs
# a value from each column on every row.
df_cleaned = df.dropna(subset=["max_glu_serum_numeric", "A1Cresult_numeric"])

# Perform paired t-test.
# NOTE(review): the two columns are encoded on different numeric scales
# (100-300 vs 5.5-8.5 per the mappings above), so the paired difference is
# dominated by the chosen encodings -- confirm this comparison is intended.
t_stat, p_value = stats.ttest_rel(
    df_cleaned["max_glu_serum_numeric"],
    df_cleaned["A1Cresult_numeric"],
)

# Display results. The p-value uses scientific notation: fixed-point `.4f`
# rounds any p below 5e-5 to "0.0000", which reads as an exact zero.
print(f"Paired t-test Results:\nT-statistic = {t_stat:.4f}, P-value = {p_value:.4e}")


Paired t-test Results:
T-statistic = 40.5821, P-value = 0.0000
