# Rehospitalisation -> admitted again within 30 days

In [1]:
import pandas as pd
import numpy as np 
import lightgbm as lgb 

from tqdm import tqdm


from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix


import pickle

In [2]:
encounters = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/encounters.csv")
encounters['START'] = pd.to_datetime(encounters['START'])
encounters['STOP'] = pd.to_datetime(encounters['STOP'])

encounters = encounters.iloc[:100000]

In [3]:
negtable = table = pd.DataFrame()
grouped_df = encounters.groupby('PATIENT')
for (patient_id, df) in tqdm(grouped_df):
    df.sort_values(by = ['START', 'STOP'],inplace = True)
    df['shifted_stop'] = df['STOP'].shift(1)
    df['diff'] = df['START'] - df['shifted_stop']
    df['diff_days'] = df['diff'].apply(lambda x : x.days if type(x) != type(1) else 50)
    # rehospital = sample[sample['diff_days'] <= 30]
    df['diff_days_1'] = df['diff_days'].shift(-1).fillna(False)
    negtable = pd.concat([negtable,df[df['diff_days'] > 30]])
    df = df[(df['diff_days'] <= 30).shift(-1).fillna(False)]
    
    table = pd.concat([table,df])
    

100%|██████████| 3839/3839 [01:20<00:00, 47.85it/s]


In [4]:
table.drop(columns = ['START', 'STOP', 'PATIENT', 'ORGANIZATION',
       'PROVIDER', 'PAYER','REASONDESCRIPTION', 'shifted_stop', 'diff'], inplace = True)
negtable.drop(columns = [ 'START', 'STOP', 'PATIENT', 'ORGANIZATION',
       'PROVIDER', 'PAYER','REASONDESCRIPTION', 'shifted_stop', 'diff'], inplace = True)

In [5]:
final_table = pd.concat([table, negtable])
final_table['diff_days'].fillna(0, inplace = True)
final_table['diff_days_1'] = final_table['diff_days_1'].apply(lambda x : 1 if x <= 30 else 0 )
final_table['REASONCODE'].fillna(-1, inplace = True)

In [6]:
conditions = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/conditions.csv")
conditions_desc = conditions.groupby('ENCOUNTER')['DESCRIPTION'].apply(lambda x: " ".join(x)).reset_index()
conditions_desc.columns = ['Id', 'Condition']


medications = pd.read_csv("../input/syntheacovid100k/100k_synthea_covid19_csv/medications.csv")
medication_desc = medications.groupby('ENCOUNTER')['DESCRIPTION'].apply(lambda x: " ".join(x)).reset_index()
medication_desc.columns = ['Id', 'Medication']


In [7]:
final_table = final_table.merge(medication_desc, on = 'Id')
final_table = final_table.merge(conditions_desc, on = 'Id')

In [8]:


class CategoricalStringToEmbedding:
    def __init__(self, num_features, max_words,texts):
        self.texts = texts
        self.num_features = num_features
        self.max_words = max_words
        self.max_features = self.max_words
        self.tokenizer = Tokenizer(num_words=max_words, 
                     filters='!"#$%&()*+,/:;<=>?@[\\]^`{|}~\t\n',)
        self.tokenizer.fit_on_texts(self.texts)
        self.sequences = self.tokenizer.texts_to_sequences(self.texts)
        self.data = pad_sequences(self.sequences, maxlen=self.num_features, padding='post', truncating='post', value=0.0)
        print(self.data.shape)
        self.embedding = Sequential()
        self.embedding.add(Embedding(self.max_features, 1, input_length=self.max_words))
    
    def create_embeddings(self,placeholder):
        output_array = self.embedding.predict(self.data)
        print(output_array.shape)
        VectFeat = []

        for rowNum in range(output_array.shape[0]):
            VectFeat.append(output_array[rowNum].flatten())

        VectFeat = pd.DataFrame(VectFeat)
        VectFeat.columns=[f"{placeholder}"+str(i) for i in range(0, self.num_features)]

        return VectFeat

In [9]:
medication_embedding = CategoricalStringToEmbedding(num_features = 20, max_words = 200, texts = final_table['Medication'].tolist())
medication_values = medication_embedding.create_embeddings("medication")

condition_embedding = CategoricalStringToEmbedding(num_features = 20, max_words = 200, texts = final_table['Condition'].tolist())
condition_values = condition_embedding.create_embeddings("condition")

(5611, 20)


2022-10-28 11:15:55.992609: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-10-28 11:15:56.114770: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


(5611, 20, 1)
(5611, 20)
(5611, 20, 1)


In [10]:
final_table = pd.concat([final_table,medication_values], axis = 1)
final_table = pd.concat([final_table, condition_values], axis = 1)
final_table= final_table.reset_index()
final_table.drop(columns = ['index', 'Id','DESCRIPTION','Condition','Medication'], inplace = True)

In [11]:
cat_cols = ['ENCOUNTERCLASS']
int_cols = ['CODE', 'BASE_ENCOUNTER_COST', 'TOTAL_CLAIM_COST', 'PAYER_COVERAGE','REASONCODE', 'diff_days',
           'condition0', 'condition1', 'condition2', 'condition3', 'condition4',
       'condition5', 'condition6', 'condition7', 'condition8', 'condition9',
       'condition10', 'condition11', 'condition12', 'condition13',
       'condition14', 'condition15', 'condition16', 'condition17',
       'condition18', 'condition19','medication0', 'medication1', 'medication2', 'medication3',
       'medication4', 'medication5', 'medication6', 'medication7',
       'medication8', 'medication9', 'medication10', 'medication11',
       'medication12', 'medication13', 'medication14', 'medication15',
       'medication16', 'medication17', 'medication18', 'medication19']



X = final_table[cat_cols + int_cols]
y = final_table['diff_days_1']


X_train, X_test,y_train, y_test = train_test_split(X, y , test_size = 0.2, shuffle = True, random_state = True)

In [12]:
scaler = MinMaxScaler()
scaler.fit(X_train[int_cols])
X_train[int_cols] = scaler.transform(X_train[int_cols])
X_test[int_cols] = scaler.transform(X_test[int_cols])

X_train[cat_cols] = X_train[cat_cols].apply(lambda x : x.astype('category'))

clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [13]:
X_test[cat_cols] = X_test[cat_cols].apply(lambda x : x.astype('category'))
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred),confusion_matrix(y_test, y_pred)

(0.9296527159394479,
 array([[461,  35],
        [ 44, 583]]))

In [14]:
with open('rehospitalisation_data_scaler.pkl', 'wb') as fid:
    pickle.dump(scaler, fid) 

with open('rehospitalisation_model.pkl', 'wb') as fid:
    pickle.dump(clf, fid) 

In [15]:
!ls

__notebook__.ipynb		   rehospitalisation_model.pkl
rehospitalisation_data_scaler.pkl
