# Data Cleaning and Preprocessing

### Import Libraries and Load Dataset

In [138]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression


file_path = "heart.csv"  
data = pd.read_csv(file_path)


print("First 5 rows of the dataset:")
print(data.head())

print("\nDataset Info:")
print(data.info())



First 5 rows of the dataset:
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------  

In [142]:
print("\nSummary Statistics:")
print(data.describe())


print("\nMissing Values:")
print(data.isnull().sum())


duplicates = data.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")


Summary Statistics:
              Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.432617   18.514154   109.384145    0.423046   25.460334   
min     28.000000    0.000000     0.000000    0.000000   60.000000   
25%     47.000000  120.000000   173.250000    0.000000  120.000000   
50%     54.000000  130.000000   223.000000    0.000000  138.000000   
75%     60.000000  140.000000   267.000000    0.000000  156.000000   
max     77.000000  200.000000   603.000000    1.000000  202.000000   

          Oldpeak  HeartDisease  
count  918.000000    918.000000  
mean     0.887364      0.553377  
std      1.066570      0.497414  
min     -2.600000      0.000000  
25%      0.000000      0.000000  
50%      0.600000      1.000000  
75%      1.500000      1.000000  
max      6.200000      1.000000  

Missing Values:
Age               0
Sex  

### Handling Missing Values and Encoding Categorical Variables

In [146]:

numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


data[numerical_cols] = data[numerical_cols].replace(0, np.nan)  # 
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())  


data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])


label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  


categorical_columns = ['ChestPainType', 'Sex', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [148]:

print("\nUpdated Dataset Head:")
print(data.head())

print("\nRemaining Missing Values:")
print(data.isnull().sum())



Updated Dataset Head:
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    1              1      140.0        289.0        1.0           1   
1   49    0              2      160.0        180.0        1.0           1   
2   37    1              1      130.0        283.0        1.0           2   
3   48    0              0      138.0        214.0        1.0           1   
4   54    1              2      150.0        195.0        1.0           1   

   MaxHR  ExerciseAngina   Oldpeak  ST_Slope  HeartDisease  
0    172               0  1.481091         2             0  
1    156               0  1.000000         1             1  
2     98               0  1.481091         2             0  
3    108               1  1.500000         1             1  
4    122               0  1.481091         2             0  

Remaining Missing Values:
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
R

### Validation for Numerical Columns to Determine Outliers

In [150]:

for col in numerical_cols:
    print(f"Unique values in {col}:")
    print(data[col].unique())


Unique values in Age:
[40 49 37 48 54 39 45 58 42 38 43 60 36 44 53 52 51 56 41 32 65 35 59 50
 47 31 46 57 55 63 66 34 33 61 29 62 28 30 74 68 72 64 69 67 73 70 77 75
 76 71]
Unique values in RestingBP:
[140.         160.         130.         138.         150.
 120.         110.         136.         115.         100.
 124.         113.         125.         145.         112.
 132.         118.         170.         142.         190.
 135.         180.         108.         155.         128.
 106.          92.         200.         122.          98.
 105.         133.          95.          80.         137.
 185.         165.         126.         152.         116.
 132.54089422 144.         154.         134.         104.
 139.         131.         141.         178.         146.
 158.         123.         102.          96.         143.
 172.         156.         114.         127.         101.
 174.          94.         148.         117.         192.
 129.         164.        ]
Unique values 

### Observations from the Results
RestingBP and Cholesterol:

Some values are highly unusual or unrealistic, such as:
RestingBP values like 0.
Cholesterol values as high as 603 or as low as 100.
These need domain-specific thresholds (e.g., minimum/maximum values) to identify outliers and potentially remove them.

FastingBS:

Only one unique value (1.0). This feature might not be informative as it doesn't vary. It may need further investigation or exclusion during feature selection.
Oldpeak:

Contains negative values (e.g., -2.6, -1.5), which are invalid for this feature (it represents depression in ST-segment, and negatives don’t make sense). These values should be removed or corrected.

### Handle Outliers

Outliers were removed based on domain-specific thresholds: RestingBP: Values between 50 and 200 were kept. Cholesterol: Values between 100 and 500 were retained. Oldpeak: Negative values were removed.

In [152]:

data = data[(data['RestingBP'] > 50) & (data['RestingBP'] <= 200)]  
data = data[(data['Cholesterol'] >= 100) & (data['Cholesterol'] <= 500)]  
data = data[data['Oldpeak'] >= 0]  

In [160]:

print("\nUpdated Unique Values for Numerical Features After Outlier Removal:")
for col in ['RestingBP', 'Cholesterol', 'Oldpeak']:
    print(f"{col}: {data[col].unique()}")


Updated Unique Values for Numerical Features After Outlier Removal:
RestingBP: [140.         160.         130.         138.         150.
 120.         110.         136.         115.         100.
 124.         113.         125.         112.         132.
 170.         142.         118.         190.         135.
 180.         108.         145.         155.         128.
 106.          92.         200.         122.          98.
 105.         133.          95.          80.         137.
 185.         165.         126.         152.         116.
 132.54089422 144.         154.         134.         104.
 139.         131.         141.         178.         146.
 158.         102.          96.         143.         172.
 156.         114.         127.         101.         174.
  94.         148.         117.         192.         123.
 129.         164.        ]
Cholesterol: [289.         180.         283.         214.         195.
 339.         237.         208.         207.         284.
 211.    

### Drop Uninformative Columns
I will drop FastingBs because it has no variance and all the values are 1.
I dropped the columns of ST_Slope_str and ChestPaintType because they are irrelevant. These features were already label encoded so this feature adds no information.

In [156]:

if 'FastingBS' in data.columns:
    print("\nUnique values and their counts in FastingBS:")
    print(data['FastingBS'].value_counts())

 
    if data['FastingBS'].nunique() == 1:
        data.drop(columns=['FastingBS'], inplace=True)
        print("\nDropped 'FastingBS' as it has no variance.")
else:
    print("'FastingBS' column is not in the DataFrame.")


redundant_columns = ['ST_Slope_str', 'ChestPainType_str']
columns_to_drop = [col for col in redundant_columns if col in data.columns]

if columns_to_drop:
    data.drop(columns=columns_to_drop, inplace=True)
    print(f"Dropped columns: {columns_to_drop}")
else:
    print(f"No redundant columns to drop: {redundant_columns}")






'FastingBS' column is not in the DataFrame.
No redundant columns to drop: ['ST_Slope_str', 'ChestPainType_str']


 ### Feature Encoding and Transformation

In [165]:

categorical_columns = ['ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numerical_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns), 
        ('cat', OneHotEncoder(), categorical_columns) 
    ])

In [158]:

data.to_csv("cleaned_heart_disease_dataset.csv", index=False)