In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## 1. Initial Data Loading

### 1.1 Data Structure Analysis

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/diabetic_data.csv')

In [147]:
# Preview the first five rows (the DataFrame.head default).
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [148]:
# (number of rows, number of columns)
df.shape

(101766, 48)

In [96]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

### 1.2 Target Variable Analysis

#### Readmission of Patients
##### 53.91% No Readmission
##### 34.93% Readmission after 30 Days
##### 11.16% Readmission within 30 Days

In [97]:
# Relative frequency of each readmission class (fractions summing to 1).
df["readmitted"].value_counts(normalize=True)

readmitted
NO     0.539119
>30    0.349282
<30    0.111599
Name: proportion, dtype: float64

## 2. Data Quality Assessment

### 2.1 Missing Data Analysis

In [99]:
# Re-inspect the first rows to see how missing values are encoded in the raw data.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


*Note: the dataset uses '?' instead of null to mark missing values.*

In [100]:
# '?' is this dataset's missing-value placeholder; convert it to NaN so that
# pandas' null-aware tools (isnull, value_counts(dropna=...)) recognise it.
df = df.replace('?', np.nan)

In [101]:
# Percentage of missing values per column, largest share first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .div(len(df))
    .mul(100)
)

weight                      96.858479
max_glu_serum               94.746772
A1Cresult                   83.277322
medical_specialty           49.082208
payer_code                  39.557416
race                         2.233555
diag_3                       1.398306
diag_2                       0.351787
diag_1                       0.020636
patient_nbr                  0.000000
time_in_hospital             0.000000
admission_source_id          0.000000
num_lab_procedures           0.000000
encounter_id                 0.000000
admission_type_id            0.000000
discharge_disposition_id     0.000000
gender                       0.000000
age                          0.000000
number_inpatient             0.000000
number_emergency             0.000000
number_outpatient            0.000000
num_medications              0.000000
num_procedures               0.000000
number_diagnoses             0.000000
metformin                    0.000000
repaglinide                  0.000000
nateglinide 

#### 2.1.1 Exploring Type of Missing Data

##### 2.1.1.1 Weight

The high share of missing values is likely because, prior to the HITECH legislation of the American Recovery and Reinvestment Act of 2009, hospitals and clinics were not required to capture weight in a structured format.


In [102]:
# Percentage of rows per weight bucket; NaN is kept so the missing share is visible.
(
    df["weight"]
    .value_counts(dropna=False)
    .sort_values(ascending=False)
    .div(len(df))
    .mul(100)
)

weight
NaN          96.858479
[75-100)      1.312816
[50-75)       0.881434
[100-125)     0.614154
[125-150)     0.142484
[25-50)       0.095317
[0-25)        0.047167
[150-175)     0.034393
[175-200)     0.010809
>200          0.002948
Name: count, dtype: float64

##### 2.1.1.2 Glucose Serum Test Result
Null values indicate that the test was not administered.

In [103]:
# Percentage of rows per glucose serum result; NaN is kept so the missing share is visible.
(
    df["max_glu_serum"]
    .value_counts(dropna=False)
    .sort_values(ascending=False)
    .div(len(df))
    .mul(100)
)

max_glu_serum
NaN     94.746772
Norm     2.551933
>200     1.459230
>300     1.242065
Name: count, dtype: float64

##### 2.1.1.3 A1C Result
Null values indicate that the test was not administered.

In [104]:
# Percentage of rows per A1C result; NaN is kept so the missing share is visible.
(
    df["A1Cresult"]
    .value_counts(dropna=False)
    .sort_values(ascending=False)
    .div(len(df))
    .mul(100)
)

A1Cresult
NaN     83.277322
>8       8.073423
Norm     4.903406
>7       3.745848
Name: count, dtype: float64

##### 2.1.1.4 Medical Specialty of Admitting Physician
Null values are most likely the result of an unspecialized/general-specialty physician.

In [105]:
# Percentage of rows per medical specialty; NaN is kept so the missing share is visible.
(
    df["medical_specialty"]
    .value_counts(dropna=False)
    .sort_values(ascending=False)
    .div(len(df))
    .mul(100)
)

medical_specialty
NaN                       49.082208
InternalMedicine          14.381031
Emergency/Trauma           7.433720
Family/GeneralPractice     7.310890
Cardiology                 5.259124
                            ...    
Proctology                 0.000983
Speech                     0.000983
SportsMedicine             0.000983
Perinatology               0.000983
Neurophysiology            0.000983
Name: count, Length: 73, dtype: float64

In [106]:
# Percentage of rows per payer code; NaN is kept so the missing share is visible.
(
    df["payer_code"]
    .value_counts(dropna=False)
    .sort_values(ascending=False)
    .div(len(df))
    .mul(100)
)

payer_code
NaN    39.557416
MC     31.876069
HM      6.165124
SP      4.920111
BC      4.574219
MD      3.470707
CP      2.489043
UN      2.405519
CM      1.903386
OG      1.015074
PO      0.581727
DM      0.539473
CH      0.143466
WC      0.132657
OT      0.093351
MP      0.077629
SI      0.054046
FR      0.000983
Name: count, dtype: float64

In [107]:
# Cramér's V between payer_code and readmitted: an effect size in [0, 1] derived from
# the chi-square statistic of their contingency table (0 = no association).
# pandas, numpy and scipy.stats are already imported in the first cell, so nothing
# is re-imported here.
#
# NOTE: pd.crosstab leaves out rows whose payer_code is NaN, so this measures the
# association among *recorded* payer codes only; it does not test whether the
# missingness of payer_code is related to readmission.
contingency = pd.crosstab(df["payer_code"], df["readmitted"])

# Only the chi-square statistic is needed; the p-value, degrees of freedom and
# expected frequencies returned alongside it are not used.
chi2 = stats.chi2_contingency(contingency)[0]

n = contingency.sum().sum()   # total number of observations in the table
r, k = contingency.shape      # number of row categories, number of column categories
phi2 = chi2 / n
cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))

print("Cramer's V:", cramers_v)

Cramer's V: 0.06422447751635021


### 2.3 Duplicate Analysis

In [111]:
# Count rows that are exact duplicates of an earlier row across all columns.
# NOTE(review): encounter_id looks like a per-row identifier, so full-row duplicates
# are unlikely by construction. Consider also checking repeated patients with
# df.duplicated(subset="patient_nbr"). TODO confirm.
df.duplicated().sum()

np.int64(0)

### 2.4 Outlier Detection

In [124]:
# Split the columns by dtype so numeric and non-numeric features can be examined separately.
df_numerical = df.select_dtypes(include=[np.number])
df_categorical = df.select_dtypes(exclude=[np.number])

# Show the shapes so the split can be checked by eye: the two column counts
# should add up to the column count of df.
print(df_numerical.shape)
print(df_categorical.shape)
print(df.shape)

(101766, 13)
(101766, 35)
(101766, 48)


4. Data Type Corrections
4.1 Categorical Variable Encoding
4.2 Numerical Variable Transformations
Age groupings
Medication counts
Length of stay categories
5. Export Cleaned Dataset

1. Target Variable Deep Dive
1.1 Readmission Rate Analysis

# Overall readmission statistics
2. Univariate Analysis
2.1 Demographic Variables

Age distribution and readmission rates
Gender patterns
Race/ethnicity analysis

2.2 Clinical Variables

3. Bivariate Analysis
3.1 Clinical Factors vs Readmissions
3.2 Hospital-Level Analysis
3.3 Medication Patterns

4. Key Statistical Relationships
4.1 Correlation Analysis
4.2 Chi-square Tests for Categorical Variables
4.3 ANOVA for Continuous Variables

5. Visualization Summary

Key charts and interpretations
Most significant findings

## 3. Initial Observations & Data Quality Issues

### 3.1 Missing Data Treatment

In [None]:
# Remove weight (~97% missing) and payer_code (~40% missing; only a weak association
# with readmitted, Cramér's V ≈ 0.06, was measured earlier in this notebook).
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
df = df.drop(columns=['weight', 'payer_code'], errors='ignore')

### 3.2 Outlier Treatment

## 4. Data Type Corrections

### 4.1 Categorical Variable Encoding

### 4.2 Numerical Variable Transformations

## 5. Exploratory Data Analysis

### 5.1 Univariate Analysis

#### 5.1.1 Demographic Variables

#### 5.1.2 Clinical Variables

#### 5.1.3 Operational Variables

### 5.2 Bivariate Analysis

#### 5.2.1 Clinical Factors vs Readmissions

#### 5.2.2 Hospital-Level Analysis

#### 5.2.3 Medication Patterns

### 5.3 Key Statistical Relationships

#### 5.3.1 Correlation Analysis

#### 5.3.2 Chi-square Tests for Categorical Variables

#### 5.3.3 ANOVA for Continuous Variables

### 5.4 Visualization Summary
Key charts and interpretations \
Most significant findings

## 6. Actionable Recommendations

### 6.1 Clinical Interventions

Enhanced discharge planning for high-medication patients
Medication reconciliation protocols


### 6.2 Operational Improvements

Weekend discharge protocols
Risk-based follow-up scheduling


### 6.3 Strategic Initiatives

Best practice sharing across hospitals
Quality improvement programs

## 7. Success Metrics & Monitoring

KPI definitions
Monitoring dashboard requirements
Review schedule

## 8. Limitations & Future Work
Data limitations
Additional analysis opportunities
Long-term research questions