In [1]:
!pip install pandas
!pip install numpy
!pip install -U scikit-learn



In [4]:
import sqlite3

con=sqlite3. connect('../data/waitlist.db')
cur=con.cursor()

In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [6]:
df = pd.read_sql_query("SELECT * FROM kidney_waitlist", con)
df.head()

Unnamed: 0,patient_id,age_cat,dialysis_duration,gender,underlying_disease,blood_group,gestation,prior_transplant,cPRA_cat,HLA_A1,...,HLA_B1,HLA_B2,HLA_DR1,HLA_DR2,DR_00,B_00,A_00,death,transplanted_or_not,waiting_time
0,1,Over60,1.0,M,others,A,No,No,Zero,1,...,44,51,3,7,Heterozygote,Heterozygote,Heterozygote,No,No,13.07
1,2,From18to60,4.0,M,diabetes,A,No,No,Zero,1,...,18,35,11,0,Homozygote,Heterozygote,Heterozygote,Yes,No,68.87
2,3,From18to60,2.0,M,HUS,O,No,No,From50To80,24,...,14,18,1,15,Heterozygote,Heterozygote,Heterozygote,No,No,12.17
3,4,From18to60,17.0,M,diabetes,O,No,Yes,ZeroTo50,24,...,14,18,1,15,Heterozygote,Heterozygote,Heterozygote,No,No,12.17
4,5,Over60,68.0,M,HUS,A,No,No,Zero,24,...,14,27,13,15,Heterozygote,Heterozygote,Heterozygote,Yes,No,6.47


In [7]:
df['blood_group'].unique()

array(['A', 'O', 'B', 'AB'], dtype=object)

In [8]:
df.loc[[7671]].T

Unnamed: 0,7671
patient_id,7672
age_cat,From18to60
dialysis_duration,9.0
gender,F
underlying_disease,GNS
blood_group,O
gestation,No
prior_transplant,Yes
cPRA_cat,Over80
HLA_A1,1


In [9]:
# separate feature / target
y = df['waiting_time']
x = df.drop(columns=['waiting_time', 'patient_id', 'death',\
                     'transplanted_or_not', 'HLA_A1', 'HLA_A2',\
                     'HLA_B1', 'HLA_B2', 'HLA_DR1', 'HLA_DR2', 'prior_transplant', 'gender'])

In [10]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48153 entries, 0 to 48152
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age_cat             48153 non-null  object 
 1   dialysis_duration   48153 non-null  float64
 2   underlying_disease  48153 non-null  object 
 3   blood_group         48153 non-null  object 
 4   gestation           48153 non-null  object 
 5   cPRA_cat            48153 non-null  object 
 6   DR_00               48153 non-null  object 
 7   B_00                48153 non-null  object 
 8   A_00                48153 non-null  object 
dtypes: float64(1), object(8)
memory usage: 3.3+ MB


In [11]:
# separate train / test
x_train, x_test,y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=42)

In [12]:
x_train.head().T

Unnamed: 0,7671,30097,20348,17366,17050
age_cat,From18to60,From18to60,From18to60,Over60,From18to60
dialysis_duration,9.0,3.0,2.0,25.0,5.0
underlying_disease,GNS,HUS,others,diabetes,HUS
blood_group,O,O,A,O,O
gestation,No,No,No,Yes,No
cPRA_cat,Over80,Zero,Zero,Zero,Zero
DR_00,Heterozygote,Heterozygote,Heterozygote,Heterozygote,Heterozygote
B_00,Heterozygote,Heterozygote,Heterozygote,Heterozygote,Heterozygote
A_00,Heterozygote,Heterozygote,Heterozygote,Heterozygote,Heterozygote


In [13]:
# 범주형 데이터 인코딩
from sklearn.preprocessing import LabelEncoder

cat_features = x.select_dtypes(include=['object']).columns
le = LabelEncoder()

for column in cat_features:
    x_train[column] = le.fit_transform(x_train[column])
    x_test[column] = le.transform(x_test[column])

In [14]:
# 인코딩 확인
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38522 entries, 7671 to 15795
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age_cat             38522 non-null  int32  
 1   dialysis_duration   38522 non-null  float64
 2   underlying_disease  38522 non-null  int32  
 3   blood_group         38522 non-null  int32  
 4   gestation           38522 non-null  int32  
 5   cPRA_cat            38522 non-null  int32  
 6   DR_00               38522 non-null  int32  
 7   B_00                38522 non-null  int32  
 8   A_00                38522 non-null  int32  
dtypes: float64(1), int32(8)
memory usage: 1.8 MB


In [15]:
x_train.head().T

Unnamed: 0,7671,30097,20348,17366,17050
age_cat,1.0,1.0,1.0,2.0,1.0
dialysis_duration,9.0,3.0,2.0,25.0,5.0
underlying_disease,0.0,1.0,4.0,3.0,1.0
blood_group,3.0,3.0,0.0,3.0,3.0
gestation,0.0,0.0,0.0,1.0,0.0
cPRA_cat,1.0,2.0,2.0,2.0,2.0
DR_00,0.0,0.0,0.0,0.0,0.0
B_00,0.0,0.0,0.0,0.0,0.0
A_00,0.0,0.0,0.0,0.0,0.0


In [30]:
x_test.head()

Unnamed: 0,age_cat,dialysis_duration,underlying_disease,blood_group,gestation,cPRA_cat,DR_00,B_00,A_00
8755,1,7.0,0,3,0,2,0,0,0
15418,1,97.0,0,3,0,2,0,1,0
21100,2,5.0,4,3,0,2,0,0,0
19038,1,11.0,0,2,1,2,0,0,0
7155,1,1.0,4,3,0,2,1,1,1


In [32]:
x_test['age_cat'][:2]

8755     1
15418    1
Name: age_cat, dtype: int32

In [36]:
le = LabelEncoder()
df = pd.DataFrame(columns=['age_cat', 'underlying_disease',\
                 'blood_group', 'gestation',\
                  'cPRA_cat'])
df['dialysis_duration'] =  

for x in df:
    print(df[x])
      

KeyError: 'dialysis_duration'

In [16]:
x_test.head().T

Unnamed: 0,8755,15418,21100,19038,7155
age_cat,1.0,1.0,2.0,1.0,1.0
dialysis_duration,7.0,97.0,5.0,11.0,1.0
underlying_disease,0.0,0.0,4.0,0.0,4.0
blood_group,3.0,3.0,3.0,2.0,3.0
gestation,0.0,0.0,0.0,1.0,0.0
cPRA_cat,2.0,2.0,2.0,2.0,2.0
DR_00,0.0,0.0,0.0,0.0,1.0
B_00,0.0,1.0,0.0,0.0,1.0
A_00,0.0,0.0,0.0,0.0,1.0


In [17]:
x_test['cPRA_cat'].unique()

array([2, 3, 1, 0])

In [37]:
model = make_pipeline(
    MinMaxScaler(),
    SimpleImputer(strategy='median'), 
    RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42)
)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [19]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'mse: {mse}\n')
print(f'rmse: {rmse}\n')
print(f'mae: {mae}\n')
print(f'r2: {r2}')

mse: 888.2570265853877

rmse: 29.803641163210038

mae: 21.879004655199296

r2: 0.06428966173298523


In [20]:
import pickle

with open('./model/model.pkl', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

FileNotFoundError: [Errno 2] No such file or directory: './model/model.pkl'

In [38]:
print(y_pred)

[38.22496636 47.08202763 35.30563783 ... 35.72312082 59.99350223
 39.11865056]


In [None]:
print(y_test)

In [None]:
print(y_train)