In [1]:
import sqlite3

# Open the SQLite database that holds the waitlist data; `con` is reused by
# the query cell further down.
con = sqlite3.connect('./data/waitlist.db')

# Cursor on the same connection, available for ad-hoc SQL statements.
cur = con.cursor()

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [3]:
# Pull the whole kidney_waitlist table into a DataFrame over the open
# SQLite connection.
df = pd.read_sql_query(sql="SELECT * FROM kidney_waitlist", con=con)

# Preview the first five rows.
df.head(5)

Unnamed: 0,patient_id,age_cat,dialysis_duration,gender,underlying_disease,blood_group,gestation,prior_transplant,cPRA_cat,HLA_A1,...,HLA_B1,HLA_B2,HLA_DR1,HLA_DR2,DR_00,B_00,A_00,death,transplanted_or_not,waiting_time
0,1,Over60,1.0,M,others,A,No,No,Zero,1,...,44,51,3,7,Heterozygote,Heterozygote,Heterozygote,No,No,13.07
1,2,From18to60,4.0,M,diabetes,A,No,No,Zero,1,...,18,35,11,0,Homozygote,Heterozygote,Heterozygote,Yes,No,68.87
2,3,From18to60,2.0,M,HUS,O,No,No,From50To80,24,...,14,18,1,15,Heterozygote,Heterozygote,Heterozygote,No,No,12.17
3,4,From18to60,17.0,M,diabetes,O,No,Yes,ZeroTo50,24,...,14,18,1,15,Heterozygote,Heterozygote,Heterozygote,No,No,12.17
4,5,Over60,68.0,M,HUS,A,No,No,Zero,24,...,14,27,13,15,Heterozygote,Heterozygote,Heterozygote,Yes,No,6.47


In [4]:
# Summarize the frame: column names, non-null counts and dtypes. The
# object-dtype columns listed here are the ones the encoding cell converts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48153 entries, 0 to 48152
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           48153 non-null  int64  
 1   age_cat              48153 non-null  object 
 2   dialysis_duration    48153 non-null  float64
 3   gender               48153 non-null  object 
 4   underlying_disease   48153 non-null  object 
 5   blood_group          48153 non-null  object 
 6   gestation            48153 non-null  object 
 7   prior_transplant     48153 non-null  object 
 8   cPRA_cat             48153 non-null  object 
 9   HLA_A1               48153 non-null  int64  
 10  HLA_A2               48153 non-null  int64  
 11  HLA_B1               48153 non-null  int64  
 12  HLA_B2               48153 non-null  int64  
 13  HLA_DR1              48153 non-null  int64  
 14  HLA_DR2              48153 non-null  int64  
 15  DR_00                48153 non-null 

In [5]:
# Target is the waiting_time column; every remaining column is a feature.
# NOTE(review): the features still include patient_id, death and
# transplanted_or_not -- presumably a row identifier and outcome fields;
# confirm they are really meant to be model inputs.
y = df.loc[:, 'waiting_time']
x = df.drop('waiting_time', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Encode categorical (object-dtype) feature columns as integer codes.
# LabelEncoder is already imported in the imports cell at the top.
cat_features = x.select_dtypes(include=['object']).columns

# Work on explicit copies: x_train / x_test are row subsets produced by
# train_test_split, and assigning columns into them may trigger pandas'
# SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()

# Keep one fitted encoder per column. Refitting a single shared encoder
# would leave only the last column's mapping available after the loop,
# making it impossible to encode new rows consistently or to invert codes.
label_encoders = {}

for column in cat_features:
    le = LabelEncoder()
    # Learn the category -> code mapping on the training split only ...
    x_train[column] = le.fit_transform(x_train[column])
    # ... and reuse it for the test split. transform() raises ValueError if
    # the test split contains a category never seen during fit.
    x_test[column] = le.transform(x_test[column])
    label_encoders[column] = le

In [7]:
# Check the encoding: the columns that were object dtype should now show an
# integer dtype, with no missing values introduced.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38522 entries, 7671 to 15795
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           38522 non-null  int64  
 1   age_cat              38522 non-null  int32  
 2   dialysis_duration    38522 non-null  float64
 3   gender               38522 non-null  int32  
 4   underlying_disease   38522 non-null  int32  
 5   blood_group          38522 non-null  int32  
 6   gestation            38522 non-null  int32  
 7   prior_transplant     38522 non-null  int32  
 8   cPRA_cat             38522 non-null  int32  
 9   HLA_A1               38522 non-null  int64  
 10  HLA_A2               38522 non-null  int64  
 11  HLA_B1               38522 non-null  int64  
 12  HLA_B2               38522 non-null  int64  
 13  HLA_DR1              38522 non-null  int64  
 14  HLA_DR2              38522 non-null  int64  
 15  DR_00                38522 non-nu

In [8]:
# Three-stage pipeline: min-max scaling, median imputation of any missing
# values, then a random forest regressor.
# The hyperparameters are hard-coded -- presumably taken from an earlier
# RandomizedSearchCV run that is not part of this notebook; TODO confirm.
model = make_pipeline(
    MinMaxScaler(),
    SimpleImputer(strategy='median'),
    RandomForestRegressor(
        random_state=42,
        n_estimators=446,
        max_depth=10,
        max_features=0.3305222296947492,
    ),
)

# Fit on the training split, then predict waiting_time for the held-out
# rows (fit() returns the fitted pipeline itself, so the calls can chain).
y_pred = model.fit(x_train, y_train).predict(x_test)

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Regression metrics on the held-out split.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. error in the target's own units

# One print call; sep='\n' reproduces the line break that separate print()
# calls would emit between the entries.
print(
    f'mse: {mse}\n',
    f'rmse: {rmse}\n',
    f'mae: {mae}\n',
    f'r2: {r2}',
    sep='\n',
)

mse: 846.4701743420312

rmse: 29.09416048525943

mae: 21.392314376612926

r2: 0.10830889093970864


In [10]:
import os
import pickle

# Persist the fitted pipeline to disk.
# NOTE: the categorical label encoding is applied to x_train / x_test
# outside of this pipeline, so it is not part of the pickled object; inputs
# must be encoded the same way before calling predict() on the loaded model.

# Make sure the target directory exists first: open(..., 'wb') raises
# FileNotFoundError for a missing directory, which would otherwise only
# surface after the model has already been trained.
os.makedirs('./model', exist_ok=True)

with open('./model/model.pkl', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)