In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import pickle
import math


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, mean_absolute_error,mean_squared_error

In [2]:
# Directory holding the CSV files read by this notebook; defined once so the
# three reads below cannot drift apart.
DATA_DIR = "../input/syntheacovid100k/100k_synthea_covid19_csv"

encounters = pd.read_csv(f"{DATA_DIR}/encounters.csv")

patients = pd.read_csv(f"{DATA_DIR}/patients.csv")
# .copy() makes the column subset an independent frame, so the assignment to
# MARITAL below does not raise pandas' SettingWithCopyWarning.
patients = patients[['Id','GENDER','MARITAL','ETHNICITY','STATE','CITY']].copy()
# Missing marital status is replaced by an explicit label.
patients['MARITAL'] = patients['MARITAL'].fillna('Not Applicable')


conditions = pd.read_csv(f"{DATA_DIR}/conditions.csv")
# Independent copy for the same reason: later code may assign into this frame.
conditions = conditions[['ENCOUNTER', 'PATIENT','CODE']].copy()

In [3]:
def process_condition_categorical(conditions, max_conditions=15):
    """Pivot the long conditions table into one row per encounter.

    Parameters
    ----------
    conditions : pandas.DataFrame
        Must provide the columns ``ENCOUNTER``, ``PATIENT`` and ``CODE``,
        one row per recorded condition. The frame is left unmodified.
    max_conditions : int, default 15
        Number of ``COND_CODE{i}`` columns to emit. Codes beyond this count
        for a single encounter are discarded.

    Returns
    -------
    pandas.DataFrame
        Columns ``ENCOUNTER``, ``PATIENT`` (first non-null patient value of
        the encounter) and ``COND_CODE0`` .. ``COND_CODE{max_conditions-1}``.
        Codes appear in their original row order as integers; slots with no
        code are filled with ``-1``.

    Raises
    ------
    ValueError
        If the string form of a code cannot be parsed by ``int``.
    """
    # Work on a derived frame so the caller's CODE column keeps its dtype.
    per_encounter = (
        conditions
        .assign(CODE=conditions['CODE'].astype(str))
        .groupby('ENCOUNTER')
        .agg({'CODE': list, 'PATIENT': 'first'})
        .reset_index()
    )

    # Remove the list column up front; what remains is ENCOUNTER, PATIENT.
    code_lists = per_encounter.pop('CODE')

    for slot in range(max_conditions):
        per_encounter[f'COND_CODE{slot}'] = pd.Series(
            [int(codes[slot]) if slot < len(codes) else -1 for codes in code_lists],
            index=per_encounter.index,
            dtype='int64',
        )
    return per_encounter
# One row per encounter with its condition codes spread over the COND_CODE* columns.
condition_group = process_condition_categorical(conditions)

In [4]:
# Attach patient attributes to each encounter, then the per-encounter
# condition codes. Both joins are inner joins (the pandas default, spelled
# out here), so encounters without a match in either table are dropped.
# 'Id_x' is the left-hand 'Id' column after the first merge suffixes the
# clashing 'Id' names.
encounters = (
    encounters
    .merge(patients, how='inner', left_on='PATIENT', right_on='Id')
    .merge(condition_group, how='inner', left_on='Id_x', right_on='ENCOUNTER')
)

In [5]:
# Keep only the target, the encounter/patient attributes and the fifteen
# condition-code slots; everything else from the merges is discarded.
encounters = encounters[[
    'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST', 'REASONCODE',
    'GENDER', 'MARITAL', 'ETHNICITY', 'STATE', 'CITY', 'ENCOUNTER',
    'PATIENT_y',
    *(f'COND_CODE{slot}' for slot in range(15)),
]]

In [6]:
# Column holding the value to predict, and the recoding of its two expected
# values into a binary label.
TARGET_COL = 'BASE_ENCOUNTER_COST'
COST_TO_LABEL = {129.16: 1, 77.49: 0}

# Features are every retained column except the target. Deriving them from the
# frame avoids maintaining a second hand-written copy of the column list.
# NOTE(review): ENCOUNTER and PATIENT_y look like identifiers rather than
# predictive attributes -- confirm that using them as features is intended.
X = encounters.drop(columns=[TARGET_COL])
y = encounters[TARGET_COL].map(COST_TO_LABEL)

# Series.map yields NaN for any cost that is not a key of COST_TO_LABEL; stop
# here with a clear message instead of passing NaN labels to the classifier.
unmapped_costs = encounters.loc[y.isna(), TARGET_COL]
if not unmapped_costs.empty:
    raise ValueError(
        f"{len(unmapped_costs)} rows have a {TARGET_COL} outside {sorted(COST_TO_LABEL)}: "
        f"{unmapped_costs.unique()[:10].tolist()}"
    )

In [7]:
# 80/20 shuffled split. The seed is written as the integer 1: the previous
# `random_state=True` was a bool standing in for that same integer, so the
# resulting split is unchanged but the intent is now explicit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=1
)

# Partition the feature names by dtype: object columns are treated as
# categoricals later on, everything else as numeric. Despite its name,
# int_cols holds every non-object column, not just integer ones.
is_object_col = X_train.dtypes == object
int_cols = X_train.columns[~is_object_col]
cat_cols = X_train.columns[is_object_col]

In [8]:
# Work on explicit copies: assigning columns into the frames returned by the
# split can raise pandas' SettingWithCopyWarning, and copying makes it
# unambiguous that only these two frames are modified.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the min-max scaler on the training rows only, then apply the same
# fitted transformation to both partitions.
scaler = MinMaxScaler()
scaler.fit(X_train[int_cols])
X_train[int_cols] = scaler.transform(X_train[int_cols])
X_test[int_cols] = scaler.transform(X_test[int_cols])

# Object columns become pandas categoricals before being handed to LightGBM.
X_train[cat_cols] = X_train[cat_cols].astype('category')

clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [9]:
# Cast the held-out object columns to categoricals, as was done for the
# training frame, then score the fitted classifier on the held-out rows.
X_test[cat_cols] = X_test[cat_cols].astype('category')
y_pred = clf.predict(X_test)
# The cell's value is the (accuracy, confusion matrix) pair.
(accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred))

(1.0,
 array([[  4583,      0],
        [     0, 104956]]))

In [10]:
# Persist the fitted scaler and classifier next to the notebook so they can
# be reloaded together later.
for artifact_path, artifact in (
    ('encounter_data_scaler.pkl', scaler),
    ('encounter_model.pkl', clf),
):
    with open(artifact_path, 'wb') as fid:
        pickle.dump(artifact, fid)