In [19]:
import pandas as pd

# Fetch the four smoke-dataset CSV files (train/test features and labels).
# The tuple order below must match the order of the target names on the left.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/x_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/y_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/x_test.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/smoke/y_test.csv',
    )
)

In [20]:
# Columns excluded from the feature set:
#   - categorical columns with a fairly large number of unique values are dropped;
#   - '구강검진수검여부' has only a single unique value, so it is removed as well.
drop_columns = ['ID', '구강검진수검여부']

# Apply the same column removal to both feature frames; the originals are left untouched.
x_train_drop, x_test_drop = (
    frame.drop(columns=drop_columns) for frame in (x_train, x_test)
)

In [21]:
# Reduce each label frame to its '흡연상태' column, which is the target used for
# the stratified split, model fitting and scoring in the cells below.
# The names are rebound in place, so the isinstance guards make this cell safe to
# re-run: once y_train / y_test are already Series, indexing them by the column
# name again would raise a KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['흡연상태']
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['흡연상태']

In [22]:
# One-hot encode the non-numeric columns of the training features.
x_train_dummy = pd.get_dummies(x_train_drop)

# Encode the test features independently, then align them to the training columns:
#   - columns are put in the training order, and test-only columns are discarded;
#   - a training column whose category never occurs in the test split is created
#     and filled with 0 instead of raising a KeyError, as plain column selection would.
x_test_dummy = pd.get_dummies(x_test_drop).reindex(
    columns=x_train_dummy.columns,
    fill_value=0,
)

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the training rows as a validation set.
# Stratifying on y_train keeps the label proportions the same in both partitions;
# the fixed random_state makes the split reproducible.
X_t, X_v, y_t, y_v = train_test_split(
    x_train_dummy,
    y_train,
    test_size=0.33,
    random_state=42,
    stratify=y_train,
)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed and otherwise library-default settings,
# fitted on the training partition only (X_v / y_v stay unseen).
rfc = RandomForestClassifier(random_state=23)
rfc.fit(X=X_t, y=y_t)

In [25]:
# Predicted labels for the training (ptl) and validation (pvl) partitions.
ptl, pvl = (rfc.predict(features) for features in (X_t, X_v))

# Column index 1 of predict_proba for the same partitions (ptp / pvp);
# these scores feed roc_auc_score in the reporting cell.
ptp, pvp = (rfc.predict_proba(features)[:, 1] for features in (X_t, X_v))

In [26]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

# Metrics on the partition the model was fitted on.
# Label-based metrics use ptl; ROC AUC uses the probability scores ptp.
print(
    f'train accuracy score : {accuracy_score(y_t, ptl)}',
    f'train f1 score : {f1_score(y_t, ptl)}',
    f'train recall score : {recall_score(y_t, ptl)}',
    f'train precision score : {precision_score(y_t, ptl)}',
    f'train roc_auc score : {roc_auc_score(y_t, ptp)}',
    sep='\n',
)

# The same metrics on the held-out validation partition (pvl / pvp).
print(
    f'validation accuracy score : {accuracy_score(y_v, pvl)}',
    f'validation f1 score : {f1_score(y_v, pvl)}',
    f'validation recall score : {recall_score(y_v, pvl)}',
    f'validation precision score : {precision_score(y_v, pvl)}',
    f'validation roc_auc score : {roc_auc_score(y_v, pvp)}',
    sep='\n',
)

train accuracy score : 1.0
train f1 score : 1.0
train recall score : 1.0
train precision score : 1.0
train roc_auc score : 1.0
validation accuracy score : 0.75637624974495
validation f1 score : 0.6791472590469366
validation recall score : 0.7025574499629355
validation precision score : 0.657246879334258
validation roc_auc score : 0.834807387299372


In [27]:
# Score the fitted model on the separate test files.
# NOTE(review): the recorded test scores are noticeably higher than the recorded
# validation scores; possibly the test rows overlap with rows the model was
# trained on — confirm before trusting these numbers.
p_test_p = rfc.predict_proba(x_test_dummy)[:, 1]  # column index 1, used for ROC AUC
p_test_l = rfc.predict(x_test_dummy)              # predicted labels

print(
    f'test accuracy score : {accuracy_score(y_test, p_test_l)}',
    f'test f1 score : {f1_score(y_test, p_test_l)}',
    f'test recall score : {recall_score(y_test, p_test_l)}',
    f'test precision score : {precision_score(y_test, p_test_l)}',
    f'test roc_auc score : {roc_auc_score(y_test, p_test_p)}',
    sep='\n',
)

test accuracy score : 0.9219858156028369
test f1 score : 0.8949595068294451
test recall score : 0.9022666341701194
test precision score : 0.8877697841726618
test roc_auc score : 0.9767054348258329
