In [1]:
import os
import pandas as pd
from matplotlib import pyplot as plt   
              

In [2]:
def load_datasets(parquets_folder, load_test=False):
    """Load the hospital parquet splits stored in ``parquets_folder``.

    Parameters
    ----------
    parquets_folder : str or path-like
        Folder that contains the ``hospital_*_data.parquet`` files.
    load_test : bool, default False
        When False, the train and validation splits are loaded.
        When True, only the test split is loaded.

    Returns
    -------
    tuple or DataFrame
        ``(train_data, val_data)`` when ``load_test`` is False,
        otherwise ``test_data``.

    Raises
    ------
    FileNotFoundError
        If any of the requested split files is missing from the folder.
    """
    # Only the splits that will actually be returned are checked and read
    if load_test:
        filenames = ['hospital_test_data.parquet']
    else:
        filenames = ['hospital_train_data.parquet', 'hospital_val_data.parquet']

    file_paths = [os.path.join(parquets_folder, name) for name in filenames]

    # Fail early with an explicit error instead of reaching the return
    # statement with an unassigned variable
    missing = [name for name, file_path in zip(filenames, file_paths)
               if not os.path.isfile(file_path)]
    if missing:
        raise FileNotFoundError(
            '{} not available in the specified folder: {}'.format(
                ', '.join(missing), parquets_folder))

    loaded = []
    for name, file_path in zip(filenames, file_paths):
        print('Loading dataset: ', name)
        loaded.append(pd.read_parquet(file_path))

    return loaded[0] if load_test else tuple(loaded)
        
                    

In [3]:
# NOTE(review): hard-coded absolute, machine-specific path. The relative 'data/...' paths used in later cells resolve against this working directory.
%cd /Users/josefinadallavia/Documents/MIM/Tesis/A-Machine-Learning-Approach-for-Prediction-of-Hospital-Bed-Availability/

/Users/josefinadallavia/Documents/MIM/Tesis/A-Machine-Learning-Approach-for-Prediction-of-Hospital-Bed-Availability


In [4]:
# Read the train and validation splits (the test split is not loaded here)
train_set, val_set = load_datasets('data/hospital_dataset', load_test=False)

Loading dataset:  hospital_train_data.parquet
Loading dataset:  hospital_val_data.parquet


In [5]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [6]:
# Add the weekday number (stored as a string) of each date column to the training frame
for weekday_col, date_col in (('admission_weekday', 'admission_date'),
                              ('date_weekday', 'date')):
    parsed_dates = pd.to_datetime(train_set[date_col])
    train_set[weekday_col] = parsed_dates.dt.weekday.astype('str')

In [7]:
# Add the weekday number (stored as a string) of each date column to the validation frame
for weekday_col, date_col in (('admission_weekday', 'admission_date'),
                              ('date_weekday', 'date')):
    parsed_dates = pd.to_datetime(val_set[date_col])
    val_set[weekday_col] = parsed_dates.dt.weekday.astype('str')

In [8]:
# Split the training columns into two feature lists:
#   categorical -> object columns with fewer than 10 distinct values
#   numerical   -> float/int columns, leaving out the 'discharge' column
# Object columns with 10 or more distinct values end up in neither list.
categorical, numerical = [], []
for col in train_set.columns:
    column_dtype = train_set[col].dtype
    if column_dtype == 'object':
        if train_set[col].nunique() < 10:
            categorical.append(col)
        continue
    if col != 'discharge' and column_dtype in ['float', 'int']:
        numerical.append(col)

In [9]:
# Display the columns that were classified as categorical
categorical

['gender',
 'entity_group',
 'origin',
 'isolation',
 'ARM_TEP',
 'CEC_TEP',
 'request_sector',
 'admission_weekday',
 'date_weekday']

In [10]:
# Display the columns that were classified as numerical
numerical

['hosp_day_number',
 'images_count',
 'images_cumulative',
 'images_emergencies',
 'images_requester_roles_count',
 'images_requesters_count',
 'images_study_types_count',
 'labos_count',
 'labos_cumulative',
 'labos_emergencies',
 'labos_requester_roles_count',
 'labos_requesters_count',
 'labos_set_count',
 'labos_set_cumulative',
 'sectors_count',
 'sectors_last_stay',
 'surgeries_count',
 'surgeries_cumulative',
 'surgeries_post_surgery_duration',
 'surgeries_pre_surgery_duration',
 'surgeries_prep_duration',
 'surgeries_services_count',
 'surgeries_surgery_delay',
 'surgeries_surgery_duration',
 'surgeries_types_count',
 'emergency_service',
 'new_born_weight',
 'new_born_gestation_age',
 'PIM2TEP',
 'high_risk_TEP',
 'low_risk_TEP',
 'patient_age']

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoricalSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks the columns named in ``keys`` out of a
    data frame and returns them cast to ``str``.
    Intended for the categorical columns of the data.
    """
    def __init__(self, keys):
        # Column label(s) used to index the frame in ``transform``
        self.keys = keys

    def fit(self, X, y=None):
        # Nothing is learned from the data
        return self

    def transform(self, X):
        selected = X[self.keys]
        return selected.astype(str)
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts the column named ``key`` from a data
    frame, returned through list indexing (``X[[key]]``).
    Intended for the numeric columns of the data.
    """
    def __init__(self, key):
        # Label of the column to extract in ``transform``
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data
        return self

    def transform(self, X):
        wanted_columns = [self.key]
        return X[wanted_columns]

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
# Categorical branch: select the categorical columns as strings, then one-hot encode them
categories = Pipeline(steps=[
    ('selector', CategoricalSelector(keys=categorical)),
    ('binarizer', OneHotEncoder()),
])

In [15]:
from sklearn.preprocessing import StandardScaler

# Numerical branch: select 'hosp_day_number' and standardise it
numbers = Pipeline(steps=[
    ('selector', NumberSelector(key='hosp_day_number')),
    ('standard', StandardScaler()),
])

In [16]:
from sklearn.pipeline import FeatureUnion

# Concatenate the outputs of the categorical and numerical branches
feats = FeatureUnion(transformer_list=[
    ('categories', categories),
    ('numerical', numbers),
])

In [17]:
# Wrap the feature union in a pipeline and fit it on the training frame,
# with missing values replaced by 0 before the transformers see them
feature_processing = Pipeline(steps=[('feats', feats)])
feature_processing.fit_transform(train_set.fillna(0))

<319150x47 sparse matrix of type '<class 'numpy.float64'>'
	with 3191500 stored elements in Compressed Sparse Row format>

In [18]:
import lightgbm as lgb

In [19]:
# The 'discharge' column is the label for both splits
y_train, y_val = train_set['discharge'], val_set['discharge']

In [20]:
# Feature frames: every column except the 'discharge' label
X_train = train_set.drop(columns='discharge')
X_val = val_set.drop(columns='discharge')

In [None]:
# Integer-encode the categorical columns using ONE mapping per column, learned
# from the training data and applied to both splits. Coding each split on its
# own can give the same integer a different meaning in train and validation
# whenever the two splits do not contain exactly the same set of values.
# Code 0 is reserved for missing values and for values not seen in training.
# The raw values are read from train_set / val_set (not modified by this
# cell), so re-running the cell produces the same result.
for col in categorical:
    levels = train_set[col].astype('category').cat.categories
    code_values = list(range(len(levels) + 1))

    # .codes is -1 for NaN / unknown levels, so +1 maps those to 0
    train_codes = pd.Categorical(train_set[col], categories=levels).codes + 1
    val_codes = pd.Categorical(val_set[col], categories=levels).codes + 1

    X_train[col] = pd.Categorical(train_codes, categories=code_values)
    X_val[col] = pd.Categorical(val_codes, categories=code_values)

In [None]:
# LightGBM training set: categorical columns first, then the numerical ones
train_features = categorical + numerical
train_data = lgb.Dataset(
    data=X_train[train_features],
    label=y_train,
    feature_name=train_features,
    categorical_feature=categorical,
)
# Persist the constructed Dataset next to the parquet files
train_data.save_binary('data/hospital_dataset/train.bin')




In [None]:
# Build the validation Dataset BEFORE saving it: previously the save_binary
# call sat in an earlier cell than the definition, which raises NameError on
# a fresh Restart & Run All.
# The features come from X_val (integer-coded categorical columns), matching
# how the training Dataset is built from X_train; val_set still holds the raw
# object-typed categorical columns.
validation_data = lgb.Dataset(X_val[categorical+numerical], 
                              label=y_val, 
                              reference=train_data,
                              feature_name=categorical+numerical, 
                              categorical_feature=categorical
                             )
validation_data.save_binary('data/hospital_dataset/validation.bin')