In [None]:
import pandas as pd
import numpy as np

# Build a small example frame: two numeric features and two categorical
# features (each containing missing values) plus a binary target column `y`.
NA = np.nan
data = {
    '연속형1': [10, 12, NA, 15, 18, 20, NA, 25],
    '연속형2': [1.1, NA, 2.3, 3.4, 4.5, NA, 6.7, 7.8],
    '범주형1': ['A', 'B', 'A', NA, 'C', 'B', 'A', 'C'],
    '범주형2': ['X', 'Y', NA, 'X', 'Z', 'Y', 'Z', 'X'],
    'y': [0, 0, 1, 1, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

# Show the raw values first, then the dtype / non-null summary.
print("원본 데이터프레임:", df, sep="\n")

df.info()

원본 데이터프레임:
   연속형1  연속형2 범주형1 범주형2  y
0  10.0   1.1    A    X  0
1  12.0   NaN    B    Y  0
2   NaN   2.3    A  NaN  1
3  15.0   3.4  NaN    X  1
4  18.0   4.5    C    Z  1
5  20.0   NaN    B    Y  0
6   NaN   6.7    A    Z  1
7  25.0   7.8    C    X  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   연속형1    6 non-null      float64
 1   연속형2    6 non-null      float64
 2   범주형1    7 non-null      object 
 3   범주형2    7 non-null      object 
 4   y       8 non-null      int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 452.0+ bytes


In [None]:
# Separate the target column from the predictors.
y = df['y']
X_train = df.drop(columns='y')

In [None]:
# Group the feature names by dtype so each group can be preprocessed separately.
ob_cols = X_train.select_dtypes(include='object').columns    # object (string) columns
num_cols = X_train.select_dtypes(include='number').columns   # numeric columns

print(num_cols, ob_cols)

Index(['연속형1', '연속형2'], dtype='object') Index(['범주형1', '범주형2'], dtype='object')


In [None]:
# Impute missing values, driven by the column lists computed above instead of
# hard-coded column names, so added or renamed columns are handled automatically.

# Numeric columns: fill with the column median.
for col in num_cols:
    X_train[col] = X_train[col].fillna(X_train[col].median())

# Categorical columns: fill with the most frequent value.
# mode() can return several values on a tie; [0] takes the first one.
for col in ob_cols:
    X_train[col] = X_train[col].fillna(X_train[col].mode()[0])

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   연속형1    8 non-null      float64
 1   연속형2    8 non-null      float64
 2   범주형1    8 non-null      object 
 3   범주형2    8 non-null      object 
dtypes: float64(2), object(2)
memory usage: 388.0+ bytes


In [None]:
# Correlation of each numeric column with the target.
# Both checks in this cell read the raw `df`, not the imputed X_train.
target = df['y']
correlation_with_y = df[num_cols].corrwith(target)

print(correlation_with_y)

# Mean of y within each level of every categorical column.
for col in ob_cols:
    mean_y_by_level = df.groupby(col)['y'].mean()
    print(mean_y_by_level)

연속형1   -0.023466
연속형2   -0.045158
dtype: float64
범주형1
A    0.666667
B    0.000000
C    0.500000
Name: y, dtype: float64
범주형2
X    0.333333
Y    0.000000
Z    1.000000
Name: y, dtype: float64


In [None]:
# Label-encode the categorical columns (for categories that have an order).
# This is an alternative to one-hot encoding: use one or the other for a
# given column, not both. The encoded frame is therefore written to a copy,
# leaving X_train's original category values untouched.
# LabelEncoder assigns integer codes in sorted order of the labels, so the
# codes only reflect a real ordering when it matches that sort order.

from sklearn.preprocessing import LabelEncoder

X_train_le = X_train.copy()
label_encoders = {}  # column name -> fitted encoder, for reuse on other data

for col in ob_cols:
    le = LabelEncoder()
    X_train_le[col] = le.fit_transform(X_train_le[col])
    label_encoders[col] = le

# For a held-out frame, reuse the fitted encoders (transform only, never refit),
# e.g. label_encoders[col].transform(held_out[col]) for each col in ob_cols.

In [None]:
# One-hot encode the categorical columns; each one is replaced by one
# indicator column per category, and X_train is rebound to the result.
X_train = pd.get_dummies(data=X_train, columns=list(ob_cols))
X_train

Unnamed: 0,연속형1,연속형2,범주형1_A,범주형1_B,범주형1_C,범주형2_X,범주형2_Y,범주형2_Z
0,10.0,1.1,True,False,False,True,False,False
1,12.0,3.95,False,True,False,False,True,False
2,16.5,2.3,True,False,False,True,False,False
3,15.0,3.4,True,False,False,True,False,False
4,18.0,4.5,False,False,True,False,False,True
5,20.0,3.95,False,True,False,False,True,False
6,16.5,6.7,True,False,False,False,False,True
7,25.0,7.8,False,False,True,True,False,False


In [None]:
# Standardize the numeric columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on the training rows only, then apply that same fitted transform
# to both the training and the test features.
scaler.fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
# NOTE(review): X_test is not defined in any cell visible in this notebook;
# it must exist (with the same numeric columns) before this line can run.
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# The target was stored as `y` when it was separated from the features;
# `y_train` does not exist until this split creates it, so `y` is the input.
# Run this cell once: it rebinds X_train to the smaller training part.
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

In [None]:
# How to look up the available evaluation metrics:
# import sklearn.metrics; dir(sklearn.metrics)

In [None]:
# Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Keep hard labels and probabilities in separate variables:
#   - accuracy_score / f1_score compare class labels   -> use y_pred
#   - roc_auc_score expects a score per sample         -> use y_proba[:, 1]
# Passing the probability matrix to accuracy_score raises an error.
y_pred = rf.predict(X_val)
y_proba = rf.predict_proba(X_val)  # one column of probabilities per class

score = accuracy_score(y_val, y_pred)

score

In [None]:
# Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

# fit() returns the estimator itself, so construction and training are chained.
# Note that this rebinds `rf`, replacing whatever model was stored there before.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_val)

# Validation metrics (computed independently of each other).
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
rmse = root_mean_squared_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

print(mse, rmse, mae, r2)

In [None]:
# Final prediction and export to CSV.
# NOTE(review): X_test is not defined in any cell visible in this notebook;
# presumably it must receive the same preprocessing as X_train — confirm.
# `rf` is whichever model was assigned last; when the cells run top to bottom
# that is the RandomForestRegressor.
y_pred = rf.predict(X_test)
result = pd.DataFrame(y_pred, columns=['pred'])
result.to_csv('result.csv', index=False)