In [2]:
import pandas as pd
import numpy as np

Load the Dataset

In [3]:
df = pd.read_csv('heart_disease_uci.csv')

print(f"Initial Dataset Shape: {df.shape}")

Initial Dataset Shape: (920, 16)


In [3]:
df.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


Data Preprocessing

In [6]:
# Check and Remove Duplicate Records
duplicates_count = df.duplicated().sum()
if duplicates_count > 0:
    print(f"Found {duplicates_count} duplicate records. Removing them...")
    df = df.drop_duplicates()
    print(f"Shape after removing duplicates: {df.shape}")
else:
    print("No duplicate records found.")

# Handle Target Variable 'num'
# Mapping 0 (No Disease) to 0, and 1-4 (Disease) to 1 for high Recall
df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)

# Handle Missing Values
# Clinical data often contains NaNs. We fill them to maintain instance count > 500.
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype == 'bool':
        # Categorical: Fill with most frequent value (Mode)
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        # Numerical: Fill with Median
        df[col] = df[col].fillna(df[col].median())

# Final Check
print("\n--- Final Preprocessing Summary ---")
print(f"Total Rows: {df.shape[0]}")
print(f"Total Columns: {df.shape[1]}")
print(f"Target Distribution:\n{df['target'].value_counts(normalize=True)}")

# Save for the Modeling Step
df.to_csv('Heart_Disease_Cleaned.csv', index=False)
print("\nSuccess: 'Heart_Disease_Cleaned.csv' is ready for modeling.")

No duplicate records found.

--- Final Preprocessing Summary ---
Total Rows: 920
Total Columns: 17
Target Distribution:
target
1    0.553261
0    0.446739
Name: proportion, dtype: float64

Success: 'Heart_Disease_Cleaned.csv' is ready for modeling.


  df[col] = df[col].fillna(df[col].mode()[0])
