# Heart Disease Data

In [2]:
import os

import pandas as pd
from sklearn.impute import SimpleImputer


Column Descriptions:
    
- id (Unique id for each patient)

- age (Age of the patient in years)

- dataset (place of study)

- sex (Male/Female)

- cp chest pain type ([typical angina, atypical angina, non-anginal, asymptomatic])

- trestbps (resting blood pressure, in mm Hg on admission to the hospital)

- chol (serum cholesterol in mg/dl)

- fbs (True if fasting blood sugar > 120 mg/dl)

- restecg (resting electrocardiographic results) [normal, stt abnormality, lv hypertrophy]

- thalch: maximum heart rate achieved

- exang: exercise-induced angina (True/ False)

- oldpeak: ST depression induced by exercise relative to rest

- slope: the slope of the peak exercise ST segment

- ca: number of major vessels (0-3) colored by fluoroscopy

- thal: [normal; fixed defect; reversible defect (spelled "reversable defect" in the data)]

- num: the predicted attribute (target) [0 = no heart disease; 1, 2, 3, 4 = stages of heart disease]

The dataset is from 4 hospitals:

- V.A. Long Beach 

- Cleveland Clinic Foundation

- Hungary

- Switzerland


In [4]:
# Location of the raw CSV. Set the HEART_DISEASE_CSV environment variable to
# point at your own copy of the file; the original author's path is kept as
# the fallback so existing setups keep working unchanged.
DATA_PATH = os.environ.get("HEART_DISEASE_CSV", "C:/Users/user/Desktop/heart_disease_uci.csv")
df = pd.read_csv(DATA_PATH)
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(920, 16)

In [4]:
# We have 920 rows and 16 columns

In [5]:
# Column names
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [6]:
# Count the missing values in each column
df.isna().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [7]:
# Missing values: trestbps 59, chol 30, fbs 90, restecg 2, thalch 55, exang 55, oldpeak 62, slope 309, thal 486.
# ca has the most missing values (611 of 920 rows), so we drop it.

In [8]:
# Check the data type and non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [9]:
# We have 8 numeric and 8 categorical (object) columns, 920 observations, and several columns with missing values

In [10]:
# Number of distinct values in each column
df.nunique()

id          920
age          50
sex           2
dataset       4
cp            4
trestbps     61
chol        217
fbs           2
restecg       3
thalch      119
exang         2
oldpeak      53
slope         3
ca            4
thal          3
num           5
dtype: int64

In [11]:
# Numeric columns: fill missing values with the column median.
# NOTE(review): only NaN is treated as missing here. The summary statistics
# show a minimum of 0 for trestbps and chol, which looks like a disguised
# missing value and is NOT imputed by this step - TODO confirm and handle.
numeric_columns = ['trestbps', 'chol', 'thalch', 'oldpeak']
numeric_imputer = SimpleImputer(strategy='median')
df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

# Categorical columns: fill missing values with the most frequent value (mode).
categorical_columns = ['fbs', 'restecg', 'exang', 'slope', 'thal']
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

# 'ca' is missing in 611 of 920 rows, so it is dropped rather than imputed.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second run no longer raises KeyError once 'ca' is gone.
df = df.drop(columns=['ca'], errors='ignore')

In [12]:
# ca has a lot of missing values, so we drop it; numeric columns are imputed with the median and categorical columns with the mode

In [13]:
# Re-count the missing values per column after imputation
df.isna().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
thal        0
num         0
dtype: int64

In [14]:
# We see that no column has missing values any more

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,num
count,920.0,920.0,920.0,920.0,920.0,920.0,920.0
mean,460.5,53.51087,131.995652,199.908696,137.692391,0.853261,0.995652
std,265.725422,9.424685,18.4513,109.040171,25.145235,1.058049,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0
25%,230.75,47.0,120.0,177.75,120.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,1.0
75%,690.25,60.0,140.0,267.0,156.0,1.5,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,4.0


In [None]:
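# We have 920 patients (observations), not all of whom have heart disease
# average age 53 to 54, minimum age 28 and maximum age 77
# average resting blood pressure 132.0 mm Hg, minimum 0 and maximum 200
# average cholesterol level 199.9 mg/dl, minimum 0 and maximum 603
# average maximum heart rate achieved 137.7, minimum 60 and maximum 202
# average ST depression 0.85, minimum -2.6 and maximum 6.2
# Note: a resting blood pressure or cholesterol of 0 is not physiologically plausible; these zeros probably stand for missing values and should be checked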




In [16]:

# Human-readable label for each value of the `num` target
# (0 = no heart disease, 1-4 = stages of heart disease).
# The labels carry no trailing whitespace, so exact comparisons such as
# df['heart_disease_category'] == '1 stage' work, also after the frame has
# been written to and read back from CSV.
stage_labels = {
    0: 'No disease',
    1: '1 stage',
    2: '2 stage',
    3: '3 stage',
    4: '4 stage',
}
df['heart_disease_category'] = df['num'].map(stage_labels)

# Show the numeric code next to its label to check the mapping
# (bare expression -> rich DataFrame display instead of print()).
df[['num', 'heart_disease_category']]


     num heart_disease_category
0      0             No disease
1      2               2 stage 
2      1               1 stage 
3      0             No disease
4      0             No disease
..   ...                    ...
915    1               1 stage 
916    0             No disease
917    2               2 stage 
918    0             No disease
919    1               1 stage 

[920 rows x 2 columns]


In [None]:
# Create a new column called heart_disease_category, which converts the numeric codes in num to descriptive text labels

In [17]:
# The numeric target is now represented by heart_disease_category, so remove it
df = df.drop('num', axis=1)

In [None]:
# Drop the num column

In [13]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,thal,heart_disease_category
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,fixed defect,No disease
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,normal,2 stage
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,reversable defect,1 stage
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,normal,No disease
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,normal,No disease


In [14]:
# Write the cleaned frame to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv('heart_disease_data.csv', index=False)

In [None]:
# Save the cleaned data as a CSV file