In [8]:
#Import libraries
import pandas as pd

In [9]:
#Load the data and view
dat = pd.read_csv("hospital_readmissions.csv")
dat.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [35]:
#Fix age range column and create a midpoint for optimal data analysis
print(dat['age'].dtype)

def calculate_midpoint(age_range):
    lower, upper = age_range.strip('[]()').split('-')
    return (int(lower) + int(upper)) / 2

# Apply the function to the 'age' column
dat['age_midpoint'] = dat['age'].apply(calculate_midpoint)

#Drop the old columns
dat.drop(['age'], axis = 1)

dat.head()

object


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted,age_midpoint
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,0,1,0,75.0
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,0,1,0,75.0
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,1,1,1,55.0
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,1,1,1,75.0
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,0,1,0,65.0


In [28]:
len(dat)

dat = dat.dropna()
len(dat)

#Drop N/A values (there were none)

25000

In [31]:
#Several vars have yes/no values, let's encode them as 0/1

def encode_yes_no(value):
    if value == 'yes':
        return 1
    elif value == 'no':
        return 0
    return value

# Apply encoding to each column that contains 'yes' or 'no'
for col in dat.columns:
    if dat[col].dtype == 'object':  # Ensuring we're looking at object type columns
        unique_vals = dat[col].unique()
        if set(unique_vals).issubset({'yes', 'no'}):  # Check if column values are only 'yes' or 'no'
            dat[col] = dat[col].apply(encode_yes_no)

# Display the updated DataFrame
dat.head()


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted,age_midpoint
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,0,1,0,75.0
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,0,1,0,75.0
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,1,1,1,55.0
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,1,1,1,75.0
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,0,1,0,65.0


In [30]:
#Make a dictionary of all the unique values of every column 

unique_values = {col: dat[col].unique().tolist() for col in dat.columns}

# Identify non-numeric columns by checking the data type of the first element in each list
categorical_data = {col: vals for col, vals in unique_values.items() if isinstance(dat[col].iloc[0], str)}

# Convert the dictionary to a DataFrame for a nicer display
categorical_df = pd.DataFrame(list(categorical_data.items()), columns=['Column', 'Unique Values'])
categorical_df


Unnamed: 0,Column,Unique Values
0,age,"[[70-80), [50-60), [60-70), [40-50), [80-90), ..."
1,medical_specialty,"[Missing, Other, InternalMedicine, Family/Gene..."
2,diag_1,"[Circulatory, Other, Injury, Digestive, Respir..."
3,diag_2,"[Respiratory, Other, Circulatory, Injury, Diab..."
4,diag_3,"[Other, Circulatory, Diabetes, Respiratory, In..."
5,glucose_test,"[no, normal, high]"
6,A1Ctest,"[no, normal, high]"


Now the data has met the following requirements:
1. Dropped missing values
2. Age is now a numerical variable
3. One-hot encodings of binary variables
4. Summary of the unique categorical variables

In [36]:
#Export the data as a csv and rename it (This is the data to be worked with from this point on)

dat.to_csv('readmissions_data_clean.csv', index=False)  