In [None]:
import pandas as pd
import numpy as np

# Sample patient records used to demonstrate the cleaning steps below.
# Note that 'dob' deliberately mixes several date layouts and that
# 'Age' is stored as strings with one missing entry.
data = {
    'PatientID': [1, 2, 3, 4, 5],
    'dob': ['1992-07-10', '1986/05/15', '20-11-1978', 'February 20, 2001', '1992-07-10'],
    'Age': ['29', '34', '43', None, '29'],
    'Diagnosis': ['Cancer', 'Flu', 'Asthma', 'Cold', 'Diabetes'],
    'Medical_History': ['Hypertension', 'None', 'Asthma', 'Cold', 'None']
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Standardize column names
df.rename(columns={
    'dob': 'date_of_birth',
    'Age': 'age',
    'Medical_History': 'medical_history'
}, inplace=True)

# Convert date formats.
# The column mixes several layouts. A single pd.to_datetime() call infers
# one format (from the first value on pandas 2.x) and coerces every value
# that does not match it to NaT, silently discarding valid birth dates.
# Instead, parse the column once per known layout and keep the first
# successful parse for each row. Explicit formats also remove any
# day/month ambiguity (e.g. '20-11-1978' is day-month-year).
KNOWN_DATE_FORMATS = (
    '%Y-%m-%d',    # 1992-07-10
    '%Y/%m/%d',    # 1986/05/15
    '%d-%m-%Y',    # 20-11-1978
    '%B %d, %Y',   # February 20, 2001
)

raw_dob = df['date_of_birth']
parsed_dob = None
for date_format in KNOWN_DATE_FORMATS:
    attempt = pd.to_datetime(raw_dob, format=date_format, errors='coerce')
    # Only fill rows that no earlier format managed to parse.
    parsed_dob = attempt if parsed_dob is None else parsed_dob.fillna(attempt)

# Values matching none of the known formats remain NaT.
df['date_of_birth'] = parsed_dob

# Convert age to numeric type (unparseable or missing values become NaN)
df['age'] = pd.to_numeric(df['age'], errors='coerce')

# Remove duplicate patient records (first occurrence of each PatientID wins)
df = df.drop_duplicates(subset='PatientID')

# Handle missing values
df['Diagnosis'] = df['Diagnosis'].fillna('Unknown')
# dropna only removes real missing values (NaN/None); the literal string
# 'None' in medical_history is an ordinary value and is kept.
df = df.dropna(subset=['medical_history'])

# Final cleaned data
print("\nCleaned Data:")
print(df)

# Save cleaned data to a CSV file
df.to_csv('cleaned_healthcare_data.csv', index=False)
print("\nData saved to 'cleaned_healthcare_data.csv'")


Original Data:
   PatientID                dob   Age Diagnosis Medical_History
0          1         1992-07-10    29    Cancer    Hypertension
1          2         1986/05/15    34       Flu            None
2          3         20-11-1978    43    Asthma          Asthma
3          4  February 20, 2001  None      Cold            Cold
4          5         1992-07-10    29  Diabetes            None

Cleaned Data:
   PatientID date_of_birth   age Diagnosis medical_history
0          1    1992-07-10  29.0    Cancer    Hypertension
1          2           NaT  34.0       Flu            None
2          3           NaT  43.0    Asthma          Asthma
3          4           NaT   NaN      Cold            Cold
4          5    1992-07-10  29.0  Diabetes            None

Data saved to 'cleaned_healthcare_data.csv'
