In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

In [31]:
# Load the dataset
file_path = 'heart_disease_uci.csv'
heart_df= pd.read_csv(file_path)
heart_df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [32]:
# Step 1: Data Exploration
print("First 5 Rows:")
print(heart_df.head())
print("\nData Types and Missing Values:")
print(heart_df.info())
print("\nMissing Values:")
print(heart_df.isnull().sum())

First 5 Rows:
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal  

In [33]:
# Step 2: Handling Missing Data
# Impute missing values in 'thal' and 'ca' using the most frequent values
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(sparse=False)
encoded_df=encoder.fit_transform(heart_df[['thal','ca']])
pd.DataFrame(encoded_df,columns=encoder.get_feature_names_out(['thal','ca']))




Unnamed: 0,thal_fixed defect,thal_normal,thal_reversable defect,thal_nan,ca_0.0,ca_1.0,ca_2.0,ca_3.0,ca_nan
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
915,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
916,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
917,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
918,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [34]:
# Step 3: Feature Creation
# Age Groups: Categorizing into <40, 40-60, and >60
heart_df['age_group'] = pd.cut(heart_df['age'], bins=[0, 40, 60, heart_df['age'].max()], labels=['<40', '40-60', '>60'])

# Cholesterol Level: Categorizing cholesterol levels into Low, Normal, High
heart_df['chol_level'] = pd.cut(heart_df['chol'], bins=[0, 200, 240, heart_df['chol'].max()], labels=['Low', 'Normal', 'High'])

# IsRisk: Creating a binary feature for high-risk patients based on conditions
heart_df['is_risk'] = ((heart_df['chol_level'] == 'High') | (heart_df['trestbps'] > 130) | (heart_df['age'] > 60)).astype(int)

print("\nNew Features (First 5 Rows):")
print(heart_df[['age', 'age_group', 'chol', 'chol_level', 'trestbps', 'is_risk']].head())


New Features (First 5 Rows):
   age age_group   chol chol_level  trestbps  is_risk
0   63       >60  233.0     Normal     145.0        1
1   67       >60  286.0       High     160.0        1
2   67       >60  229.0     Normal     120.0        1
3   37       <40  250.0       High     130.0        1
4   41     40-60  204.0     Normal     130.0        0


In [35]:
# Step 4: Feature Transformation
# Convert categorical features to numerical using Label Encoding
label_encoders = {}
categorical_cols = ['sex', 'cp', 'thal', 'age_group']

for col in categorical_cols:
    le = LabelEncoder()
    heart_df[col] = le.fit_transform(heart_df[col])
    label_encoders[col] = le 

# Normalize numerical features like chol, trestbps, and thalch using MinMaxScaler
scaler = MinMaxScaler()
heart_df[['chol', 'trestbps', 'thalch']] = scaler.fit_transform(heart_df[['chol', 'trestbps', 'thalch']])

print("\nTransformed Features (First 5 Rows):")
print(heart_df[['sex', 'cp', 'thal', 'age_group', 'chol', 'trestbps', 'thalch']].head())


Transformed Features (First 5 Rows):
   sex  cp  thal  age_group      chol  trestbps    thalch
0    1   3     0          2  0.386401     0.725  0.633803
1    1   0     1          2  0.474295     0.800  0.338028
2    1   0     2          2  0.379768     0.600  0.485915
3    1   2     1          1  0.414594     0.650  0.894366
4    0   1     1          0  0.338308     0.650  0.788732


In [36]:
# Step 5: Feature Interaction
# BP-Chol Interaction: Multiply trestbps and chol to create an interaction feature
heart_df['bp_chol_interaction'] = heart_df['trestbps'] * heart_df['chol']

# Exercise-Induced Angina Risk: Create a binary feature based on exang and thalach threshold
heart_df['exang_risk'] = ((heart_df['exang'] == 1) & (heart_df['thalch'] < 0.5)).astype(int)

print("\nInteraction Features (First 5 Rows):")
print(heart_df[['bp_chol_interaction', 'exang_risk']].head())




Interaction Features (First 5 Rows):
   bp_chol_interaction  exang_risk
0             0.280141           0
1             0.379436           1
2             0.227861           1
3             0.269486           0
4             0.219900           0


In [37]:
# Step 6: Feature Selection
heart_df = heart_df.drop(columns=['id'])

print("\nFinal Dataset (First 5 Rows):")
print(heart_df.head())


Final Dataset (First 5 Rows):
   age  sex    dataset  cp  trestbps      chol    fbs         restecg  \
0   63    1  Cleveland   3     0.725  0.386401   True  lv hypertrophy   
1   67    1  Cleveland   0     0.800  0.474295  False  lv hypertrophy   
2   67    1  Cleveland   0     0.600  0.379768  False  lv hypertrophy   
3   37    1  Cleveland   2     0.650  0.414594  False          normal   
4   41    0  Cleveland   1     0.650  0.338308  False  lv hypertrophy   

     thalch  exang  oldpeak        slope   ca  thal  num  age_group  \
0  0.633803  False      2.3  downsloping  0.0     0    0          2   
1  0.338028   True      1.5         flat  3.0     1    2          2   
2  0.485915   True      2.6         flat  2.0     2    1          2   
3  0.894366  False      3.5  downsloping  0.0     1    0          1   
4  0.788732  False      1.4    upsloping  0.0     1    0          0   

  chol_level  is_risk  bp_chol_interaction  exang_risk  
0     Normal        1             0.280141    