In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# 1. 데이터 로드
df = pd.read_csv('diabetes.csv')

# 2. 결측치 처리 (0값을 NaN으로 대체 후 평균으로 대체)
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

# 결측치 평균으로 대체
for col in cols_with_zero:
    df[col].fillna(df[col].mean(), inplace=True)

# 3. 이상치 처리 (SkinThickness, Insulin 상위 1% 이상치를 평균으로 대체)
for col in ['SkinThickness', 'Insulin']:
    upper_threshold = df[col].quantile(0.99)
    mean_value = df[col].mean()
    df[col] = df[col].apply(lambda x: mean_value if x > upper_threshold else x)

# 4. 정규화 (Age 열만 MinMaxScaler 사용)
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])  # 2D로 넣어야 함

# 5. EDA 수행
print("각 열의 결측치 개수 :")
print(df.isnull().sum())

print(" Outcome 별 Glucose 평균 :")
print(df.groupby('Outcome')['Glucose'].mean())

print("전처리 후 상위 5개 행 :")
print(df.head())


각 열의 결측치 개수 :
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
 Outcome 별 Glucose 평균 :
Outcome
0    110.710121
1    142.165573
Name: Glucose, dtype: float64
전처리 후 상위 5개 행 :
   Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \
0            6    148.0           72.0       35.00000  155.548223  33.6   
1            1     85.0           66.0       29.00000  155.548223  26.6   
2            8    183.0           64.0       29.15342  155.548223  23.3   
3            1     89.0           66.0       23.00000   94.000000  28.1   
4            0    137.0           40.0       35.00000  168.000000  43.1   

   DiabetesPedigreeFunction       Age  Outcome  
0                     0.627  0.483333        1  
1                     0.351  0.166667       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
