In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Read the CSV that sits next to the notebook into the working DataFrame.
file_path = './heart_disease_uci.csv'
df = pd.read_csv(file_path)

# First rows, rendered with the rich notebook display.
head_rows = df.head()
print("### 데이터 상위 5개 행 ###")
display(head_rows)

# Column dtypes and non-null counts; info() prints its own report.
print("\n### 데이터 기본 정보 ###")
df.info()

# Descriptive statistics for the numeric columns.
numeric_summary = df.describe()
print("\n### 수치형 데이터 통계 요약 ###")
display(numeric_summary)

# Number of missing values in each column.
missing_per_column = df.isna().sum()
print("\n### 컬럼별 결측치 개수 ###")
print(missing_per_column)

### 데이터 상위 5개 행 ###


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0



### 데이터 기본 정보 ###
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB

### 수치형 데이터 통계 요약 ###


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0



### 컬럼별 결측치 개수 ###
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


## 2. 데이터 전처리 (Preprocessing)

In [None]:
# Duplicate-row check -- currently disabled: every statement in this cell is
# commented out, so running the cell does nothing.
# NOTE(review): the "#0" line below is presumably the duplicate count seen when
# the check was last run, and the output saved under this cell appears to come
# from an earlier run with the last two prints enabled -- confirm, then either
# re-enable the check or delete the cell.
#print(f"중복된 행 개수: {df.duplicated().sum()}")
#0

#df.drop_duplicates(inplace=True)

#print(f"중복 제거 후 행 개수: {len(df)}")
#print(f"제거 후 중복된 행 개수: {df.duplicated().sum()}")

중복 제거 후 행 개수: 920
제거 후 중복된 행 개수: 0


In [None]:
# Handle abnormal zero values: a 0 in these columns is treated as a missing
# measurement, converted to NaN, and then imputed with the column median.
zero_as_missing_cols = ['trestbps', 'chol']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

for col in zero_as_missing_cols:
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form is chained assignment on a
    # column selection, which is deprecated in pandas 2.x and leaves `df`
    # unchanged once Copy-on-Write is enabled (the default in pandas 3.0).
    # The median is computed after the zeros were turned into NaN, so the
    # placeholder zeros do not pull it down.
    df[col] = df[col].fillna(df[col].median())

print("\n### 결측치 처리 확인 ###")
print(df.isnull().sum())


### 결측치 처리 확인 ###
id            0
age           0
sex           0
dataset       0
cp            0
trestbps      0
chol          0
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [10]:
# Categorical columns: replace missing values with the most frequent value (mode).
categorical_cols = ['fbs', 'restecg', 'exang', 'slope', 'thal']

for col in categorical_cols:
    # mode() returns a Series (several entries when values tie); use the first.
    mode_val = df[col].mode()[0]
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form is chained assignment on a
    # column selection, which is deprecated in pandas 2.x and leaves `df`
    # unchanged once Copy-on-Write is enabled (the default in pandas 3.0).
    df[col] = df[col].fillna(mode_val)
    print(f"'{col}' 범주형 컬럼 결측치 처리, 대체된 값: {mode_val}")



'fbs' 범주형 컬럼 결측치 처리, 대체된 값: False
'restecg' 범주형 컬럼 결측치 처리, 대체된 값: normal
'exang' 범주형 컬럼 결측치 처리, 대체된 값: False
'slope' 범주형 컬럼 결측치 처리, 대체된 값: flat
'thal' 범주형 컬럼 결측치 처리, 대체된 값: normal


In [11]:
# Numerical columns: replace missing values with the column median.
# NOTE(review): the null counts printed earlier in this notebook show 'ca'
# missing in 611 of 920 rows, so most of that column becomes the imputed
# median -- confirm this is acceptable for the downstream analysis.
numerical_cols = ['thalch', 'oldpeak', 'ca']

for col in numerical_cols:
    median_val = df[col].median()
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form is chained assignment on a
    # column selection, which is deprecated in pandas 2.x and leaves `df`
    # unchanged once Copy-on-Write is enabled (the default in pandas 3.0).
    df[col] = df[col].fillna(median_val)
    print(f"'{col}' 수치형 컬럼 결측치 처리 완료. 대체된 값: {median_val}")


'thalch' 수치형 컬럼 결측치 처리 완료. 대체된 값: 140.0
'oldpeak' 수치형 컬럼 결측치 처리 완료. 대체된 값: 0.5
'ca' 수치형 컬럼 결측치 처리 완료. 대체된 값: 0.0


In [12]:
# Final check: per-column count of values that are still missing.
remaining_missing = df.isna().sum()
print(remaining_missing)

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


## 3. 탐색적 데이터 분석 (EDA)