#### Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_excel('Healthcare_dataset_only.xlsx')

# Replace the placeholder labels that stand for "unknown" with NaN.
# The result is assigned back to the column instead of calling
# ``df[col].replace(..., inplace=True)``: that form operates on a temporary
# Series, which raises a FutureWarning on pandas 2.x and leaves ``df``
# unchanged once Copy-on-Write is active (the default from pandas 3.0).
# NOTE(review): the 'Ntm_Speciality' placeholder is lower-case while the
# others are capitalised; replace() matches the value exactly, so confirm
# the spelling against the data.
unknown_labels = {
    'Ethnicity': 'Unknown',
    'Race': 'Other/Unknown',
    'Region': 'Other/Unknown',
    'Ntm_Speciality': 'unknown',
}
for column, label in unknown_labels.items():
    df[column] = df[column].replace(label, np.nan)

# Remove the columns that are not carried into the cleaned dataset
unwanted_columns = [
    'Risk_Segment_During_Rx',
    'Tscore_Bucket_During_Rx',
    'Change_Risk_Segment',
]
df.drop(columns=unwanted_columns, inplace=True)

# Integer codes for the two "prior NTM" columns; labels that are not listed
# in a mapping are left untouched by replace().
prior_ntm_codes = {
    'Risk_Segment_Prior_Ntm': {'VLR_LR': 1, 'HR_VHR': 0},
    'Tscore_Bucket_Prior_Ntm': {'>-2.5': 1, '<=-2.5': 0},
}
for column, codes in prior_ntm_codes.items():
    df[column] = df[column].replace(codes)

# Frame-wide substitution: every cell equal to 'Y' becomes 1 and every cell
# equal to 'N' becomes 0, whichever column it is in.
yes_no_codes = {'Y': 1, 'N': 0}
df = df.replace(yes_no_codes)

# Age buckets are coded from the oldest group (0) to the youngest (3)
age_bucket_codes = {'>75': 0, '65-75': 1, '55-65': 2, '<55': 3}
df['Age_Bucket'] = df['Age_Bucket'].replace(age_bucket_codes)


# Impute missing values with the column mode.
# The filled Series is assigned back to the column instead of calling
# ``df[col].fillna(..., inplace=True)``: that form fills a temporary Series,
# which raises a FutureWarning on pandas 2.x and leaves ``df`` unchanged once
# Copy-on-Write is active (the default from pandas 3.0).
cols_to_impute = ['Ethnicity', 'Race', 'Region', 'Ntm_Speciality']
for col in cols_to_impute:
    # mode() skips NaN by default and returns the modes sorted, so [0] picks
    # the most frequent label (the first in sort order when there is a tie).
    df[col] = df[col].fillna(df[col].mode()[0])


# Integer codes for the remaining categorical columns. replace() leaves any
# label that is not listed in a column's mapping as it is.
# NOTE(review): 'Region' is encoded here but is dropped further down in this
# script before the CSV is written - confirm which of the two is intended.
category_codes = {
    'Ntm_Speciality_Bucket': {
        'OB/GYN/Others/PCP/Unknown': 0,
        'Endo/Onc/Uro': 1,
        'Rheum': 2,
    },
    'Change_T_Score': {
        'No change': 0,
        'Unknown': 1,
        'Worsened': 2,
        'Improved': 3,
    },
    'Region': {
        'West': 0,
        'Midwest': 1,
        'South': 2,
        'Northeast': 3,
    },
    'Race': {
        'Caucasian': 0,
        'Asian': 1,
        'African American': 2,
    },
}
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)

# Build one indicator frame per column below. drop_first=True omits the first
# category level, and no prefix is passed, so the indicator columns are named
# after the remaining category labels themselves.
dummy_sources = [
    'Gender',
    'Adherent_Flag',
    'Persistency_Flag',
    'Ethnicity',
    'Ntm_Specialist_Flag',
]
gender, adherent, persistent, ethnicity, specialist = [
    pd.get_dummies(df[source], drop_first=True) for source in dummy_sources
]

# Remove the columns that were just encoded, together with the other columns
# that are left out of the output file.
columns_to_remove = dummy_sources + ['Region', 'Ptid', 'Ntm_Speciality']
df.drop(columns=columns_to_remove, inplace=True)

# Append the indicator columns to the right of the remaining columns
indicator_frames = [gender, adherent, persistent, ethnicity, specialist]
df = pd.concat([df, *indicator_frames], axis='columns')

# Save cleaned dataset as CSV (the row index is not written)
df.to_csv('cleaned_dataset_final.csv', index=False)