In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import pickle
import math


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, mean_absolute_error,mean_squared_error

In [2]:
procedures = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/procedures.csv")
procedures.drop(columns = ['DATE','PATIENT','DESCRIPTION', 'REASONDESCRIPTION'], inplace = True)
procedures = procedures.fillna(0)

patients = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/patients.csv")
patients = patients[['Id','GENDER','MARITAL','ETHNICITY','STATE','CITY']]
patients['MARITAL'] = patients['MARITAL'].fillna('Not Applicable')


conditions = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/conditions.csv")
conditions = conditions[['ENCOUNTER', 'PATIENT','CODE']]

In [3]:
def process_categorical(df, df_name, max_feature = None):
    df.groupby('ENCOUNTER').agg({'CODE':'count'})
    df['CODE'] = df['CODE'].astype(str)

    if 'BASE_COST' in df.columns:
        df_group = df.groupby('ENCOUNTER').agg({'CODE' : ','.join ,'BASE_COST' : 'sum'}).reset_index()
    else:
        df_group = df.groupby('ENCOUNTER').agg({'CODE' : ','.join }).reset_index()
    df_group['CODE'] = df_group['CODE'].apply(lambda x : x.split(','))

    to_delete = ['CODE']

    if max_feature is None:
        df_group['len'] = df_group['CODE'].apply(lambda x : len(x))
        max_feature = int(df['len'].max())
        to_delete.append('len')
      
    for i in range(0,max_feature):
        df_group[f'{df_name}_CODE{i}'] = -1
        df_group[f'{df_name}_CODE{i}'] = df_group['CODE'].apply(lambda x  : -1 if len(x) < i+1 else int(x[i]))  
      
    df_group.drop(columns = to_delete, inplace = True)
    return df_group

def process_condition_categorical(conditions):
    conditions['CODE'] = conditions['CODE'].astype(str)
    conditions_group = conditions.groupby('ENCOUNTER').agg({'CODE' : ','.join, 'PATIENT' : ','.join}).reset_index()
    conditions_group['CODE'] = conditions_group['CODE'].apply(lambda x : x.split(','))
    conditions_group['PATIENT'] = conditions_group['PATIENT'].apply(lambda x : x.split(',')[0])
    
    for i in range(0,15):
        conditions_group[f'COND_CODE{i}'] = -1
        conditions_group[f'COND_CODE{i}'] = conditions_group['CODE'].apply(lambda x  : -1 if len(x) < i+1 else int(x[i]))    
    conditions_group.drop(columns = ['CODE'], inplace = True)
    return conditions_group

In [4]:
procedure_group = process_categorical(procedures, "PROC",12)
condition_group = process_condition_categorical(conditions)

In [5]:
patients.columns =['PATIENT', 'GENDER', 'MARITAL', 'ETHNICITY', 'STATE', 'CITY']
merged = condition_group.merge(patients, on = 'PATIENT' )
target_merged = merged.merge(procedure_group, on = 'ENCOUNTER')

In [6]:
df = target_merged.iloc[:, 2:]
df.columns

Index(['COND_CODE0', 'COND_CODE1', 'COND_CODE2', 'COND_CODE3', 'COND_CODE4',
       'COND_CODE5', 'COND_CODE6', 'COND_CODE7', 'COND_CODE8', 'COND_CODE9',
       'COND_CODE10', 'COND_CODE11', 'COND_CODE12', 'COND_CODE13',
       'COND_CODE14', 'GENDER', 'MARITAL', 'ETHNICITY', 'STATE', 'CITY',
       'BASE_COST', 'PROC_CODE0', 'PROC_CODE1', 'PROC_CODE2', 'PROC_CODE3',
       'PROC_CODE4', 'PROC_CODE5', 'PROC_CODE6', 'PROC_CODE7', 'PROC_CODE8',
       'PROC_CODE9', 'PROC_CODE10', 'PROC_CODE11'],
      dtype='object')

In [7]:
cost = df[['BASE_COST']]
df = df[['COND_CODE0', 'COND_CODE1', 'COND_CODE2', 'COND_CODE3', 'COND_CODE4',
       'COND_CODE5', 'COND_CODE6', 'COND_CODE7', 'COND_CODE8', 'COND_CODE9',
       'COND_CODE10', 'COND_CODE11', 'COND_CODE12', 'COND_CODE13',
       'COND_CODE14', 'GENDER', 'MARITAL', 'ETHNICITY', 'STATE', 'CITY',
       'PROC_CODE0', 'PROC_CODE1', 'PROC_CODE2', 'PROC_CODE3',
       'PROC_CODE4', 'PROC_CODE5', 'PROC_CODE6', 'PROC_CODE7', 'PROC_CODE8',
       'PROC_CODE9', 'PROC_CODE10', 'PROC_CODE11']]

In [8]:
x_train, x_test = df.iloc[:-30000, :], df.iloc[-30000:,:]
y_train, y_test = cost.iloc[:-30000, :], cost.iloc[-30000:,:]

In [9]:
def train_and_scale(x_train, x_test, y_train, y_test,names):
    cat_columns = x_train.dtypes[x_train.dtypes == object].keys()
    num_columns = x_train.dtypes[x_train.dtypes != object].keys()
    
    data_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()
    data_scaler.fit(x_train[num_columns])
    target_scaler.fit(y_train)

    x_train[num_columns] = data_scaler.transform(x_train[num_columns])
    x_test[num_columns] = data_scaler.transform(x_test[num_columns])

    y_train = target_scaler.transform(y_train)
    y_test = target_scaler.transform(y_test)

    x_train[cat_columns] = x_train[cat_columns].apply(lambda x : x.astype('category'))
    x_test[cat_columns] = x_test[cat_columns].apply(lambda x : x.astype('category'))


    params = {
        'task': 'train', 
        'boosting': 'gbdt',
        'objective': 'regression',
        'num_leaves': 10,
        'learnnig_rage': 0.05,
        'metric': {'l2','l1'},
        'verbose': -1
    }
    lgb_train = lgb.Dataset(x_train, y_train.reshape(1,-1)[0])
    lgb_eval = lgb.Dataset(x_test, y_test.reshape(1,-1)[0]
                           , reference=lgb_train)

    model = lgb.train(params,
                     train_set=lgb_train,
                     valid_sets=lgb_eval,
                     early_stopping_rounds=30)

    preds = model.predict(x_test)
    preds = target_scaler.inverse_transform(preds.reshape(-1, 1))
    target = target_scaler.inverse_transform(y_test)
    print(math.sqrt(mean_squared_error(target, preds)),mean_absolute_error(target,preds))

    with open(f'price_{names}_data_scaler.pkl', 'wb') as fid:
        pickle.dump(data_scaler, fid) 
    with open(f'price_{names}_target_scaler.pkl', 'wb') as fid:
        pickle.dump(target_scaler, fid) 
    with open(f'price_{names}_model.pkl', 'wb') as fid:
        pickle.dump(model, fid) 


In [10]:
train_and_scale(x_train, x_test, y_train, y_test,'procedure')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]






[1]	valid_0's l1: 0.0670391	valid_0's l2: 0.00865235
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l1: 0.0610818	valid_0's l2: 0.00716112
[3]	valid_0's l1: 0.0555711	valid_0's l2: 0.00593859
[4]	valid_0's l1: 0.0507923	valid_0's l2: 0.00494929
[5]	valid_0's l1: 0.0463401	valid_0's l2: 0.00413444
[6]	valid_0's l1: 0.0422297	valid_0's l2: 0.00347081
[7]	valid_0's l1: 0.0387512	valid_0's l2: 0.0029357
[8]	valid_0's l1: 0.0353831	valid_0's l2: 0.00247954
[9]	valid_0's l1: 0.032368	valid_0's l2: 0.00212182
[10]	valid_0's l1: 0.0296549	valid_0's l2: 0.00181881
[11]	valid_0's l1: 0.027238	valid_0's l2: 0.00157289
[12]	valid_0's l1: 0.0251746	valid_0's l2: 0.00137322
[13]	valid_0's l1: 0.0233224	valid_0's l2: 0.00121056
[14]	valid_0's l1: 0.0215372	valid_0's l2: 0.00107352
[15]	valid_0's l1: 0.0199342	valid_0's l2: 0.0009626
[16]	valid_0's l1: 0.0185171	valid_0's l2: 0.000871324
[17]	valid_0's l1: 0.0173072	valid_0's l2: 0.000798563
[18]	valid_0's l1: 0.0161758	val

# Medicine Pricing

In [11]:
medications = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/medications.csv")
medications = medications[['PATIENT', 'ENCOUNTER' , 'CODE', 'TOTALCOST', 'REASONCODE']]

medications['CODE'] = medications['CODE'].astype(str)
fin = medications.groupby('ENCOUNTER').agg({'CODE' : ','.join,'PATIENT' : ','.join ,'TOTALCOST' : 'sum'}).reset_index()
fin['CODE'] = fin['CODE'].apply(lambda x : x.split(','))
fin['PATIENT'] = fin['PATIENT'].apply(lambda x : x.split(',')[0])

for i in range(0,17):
    fin[f'MED_CODE{i}'] = -1
    fin[f'MED_CODE{i}'] = fin['CODE'].apply(lambda x  : -1 if len(x) < i+1 else int(x[i]))  
fin.drop(columns = ['CODE'], inplace = True)

In [12]:
med_merged = target_merged.merge(fin, on = 'ENCOUNTER')

cost_med = med_merged[['TOTALCOST']]
df_med = med_merged[['COND_CODE0', 'COND_CODE1', 'COND_CODE2', 'COND_CODE3', 'COND_CODE4',
       'COND_CODE5', 'COND_CODE6', 'COND_CODE7', 'COND_CODE8', 'COND_CODE9',
       'COND_CODE10', 'COND_CODE11', 'COND_CODE12', 'COND_CODE13',
       'COND_CODE14', 'GENDER', 'MARITAL', 'ETHNICITY', 'STATE', 'CITY',
       'BASE_COST', 'PROC_CODE0', 'PROC_CODE1', 'PROC_CODE2', 'PROC_CODE3',
       'PROC_CODE4', 'PROC_CODE5', 'PROC_CODE6', 'PROC_CODE7', 'PROC_CODE8',
       'PROC_CODE9', 'PROC_CODE10', 'PROC_CODE11', 'MED_CODE0',
       'MED_CODE1', 'MED_CODE2', 'MED_CODE3', 'MED_CODE4', 'MED_CODE5',
       'MED_CODE6', 'MED_CODE7', 'MED_CODE8', 'MED_CODE9', 'MED_CODE10',
       'MED_CODE11', 'MED_CODE12', 'MED_CODE13', 'MED_CODE14', 'MED_CODE15',
       'MED_CODE16']]

x_med_train, x_med_test = df_med.iloc[:-20000, :], df_med.iloc[-20000:,:]
y_med_train, y_med_test = cost_med.iloc[:-20000, :], cost_med.iloc[-20000:,:]

In [13]:
train_and_scale(x_med_train, x_med_test, y_med_train, y_med_test,'medication')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


[1]	valid_0's l1: 0.0352007	valid_0's l2: 0.00293619
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l1: 0.0328863	valid_0's l2: 0.00258721
[3]	valid_0's l1: 0.0307938	valid_0's l2: 0.00230486
[4]	valid_0's l1: 0.0290675	valid_0's l2: 0.00208229
[5]	valid_0's l1: 0.0273172	valid_0's l2: 0.00189016
[6]	valid_0's l1: 0.0257287	valid_0's l2: 0.00173224
[7]	valid_0's l1: 0.0245582	valid_0's l2: 0.00160004
[8]	valid_0's l1: 0.0234679	valid_0's l2: 0.00149727
[9]	valid_0's l1: 0.0225131	valid_0's l2: 0.00141294
[10]	valid_0's l1: 0.0216347	valid_0's l2: 0.00133927
[11]	valid_0's l1: 0.0208942	valid_0's l2: 0.00128003
[12]	valid_0's l1: 0.0201383	valid_0's l2: 0.00122429
[13]	valid_0's l1: 0.0194834	valid_0's l2: 0.00118364
[14]	valid_0's l1: 0.0189039	valid_0's l2: 0.00115016
[15]	valid_0's l1: 0.0183628	valid_0's l2: 0.0011229
[16]	valid_0's l1: 0.0178507	valid_0's l2: 0.00109209
[17]	valid_0's l1: 0.0174342	valid_0's l2: 0.00107068
[18]	valid_0's l1: 0.0169501	va