# Final Assignment B
Kamile Stankeviciute `ks830`

In [1]:
# Imports
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the raw diabetes encounters table.
# The frame is wide, so lift the column display limit before previewing it.
pd.options.display.max_columns = 500

diabetic = pd.read_csv(filepath_or_buffer='diabetes/diabetic_data_original.csv')
diabetic.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
# Inspect the range of encounter ids (ascending) to judge whether they look sequential.
diabetic.loc[:, 'encounter_id'].sort_values(ascending=True)

8             12522
9             15738
4             16680
10            28236
5             35754
            ...    
101761    443847548
101762    443847782
101763    443854148
101764    443857166
101765    443867222
Name: encounter_id, Length: 101766, dtype: int64

In [None]:
# Restrict to patients that appear in more than one encounter and list their
# visits grouped per patient, ordered by encounter id.
patient_nbr_counts = diabetic['patient_nbr'].value_counts()
repeat_patient_ids = patient_nbr_counts[patient_nbr_counts > 1].index
is_repeat_patient = diabetic['patient_nbr'].isin(repeat_patient_ids)
df = diabetic.loc[is_repeat_patient].sort_values(by=['patient_nbr', 'encounter_id'])

# Show up to 100 rows (instead of the truncated default) so that several
# patients' visit sequences are visible in one printout.
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100
print(df.loc[:, ['encounter_id', 'patient_nbr', 'readmitted']])

## Training task

*For the purposes of this assignment, we wish to predict the `time_in_hospital` value. It doesn’t make sense to predict the duration of a stay in hospital by using attributes recorded during that stay. Instead, we will restrict attention to **follow-on visits**, and seek to predict the length of a stay for follow-on visit on the basis of attributes recorded in the previous visit.*

*Follow-on* visits mean that the model will be trained on *patients who had follow-on visits*. I will select the patients who have been readmitted (only those `patient_nbr`s which appear more than once in the dataset), sort them by time (assuming the `encounter_id` is sorted and sequential), ignoring the patients who have not been readmitted. Then 
* the input features of a training instance will contain data from the earlier visit, 
* the target (label) of that instance will be the length of stay (`time_in_hospital`) of the next visit


Q: What about patients who had several follow-on visits? Can the first follow-on visit (which provides the ground truth for the initial visit) also serve as a training instance, with the second follow-on visit providing its ground truth? Is there data leakage? A: No, as long as only the time in hospital is taken from the later visit.

Q: Can I use the length of stay of the current visit as a feature? A: Yes, why not.

## Data preparation

*Don’t explain the code you used, but do explain what filtering or processing you applied, and report how many items there are in your dataset.*

* Selecting the patients with follow-on visits.
* Generating predicted variable (`next_time_in_hospital`) by extracting the `time_in_hospital` of the same `patient_nbr` from the next `encounter_id`.
* Scaling (normalisation): important for training stability.
* One-hot coding of categorical variables.
* Handling missing data: masks and codes.
* Feature engineering and selection should not be necessary with a deep learning framework, which, with sufficient model complexity, should learn to extract those features by itself.

**Q: Can I assume `encounter_id`s are chronological? A: The assumption is reasonable: when manually inspecting the dataset sorted by patient identifier and encounter, every entry of a patient has a "readmitted" outcome except the last one, whose outcome is "not readmitted".** It has since been confirmed that this is the correct assumption.

Processing can be described in the context of the last assignment.

In [None]:
# Build one training instance per (visit, following visit) pair of the same patient:
# the features describe the earlier visit, the target is the length of the following stay.
# (This replaces the housing-assignment template, which referenced the undefined names
# `pandas` / `housing` and columns that do not exist in the diabetic data.)
TARGET_COL = 'next_time_in_hospital'
ID_COLS = ['encounter_id', 'patient_nbr']
NUMERIC_COLS = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
                'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

# encounter_id is assumed to be chronological, so this puts each patient's visits in time order.
# A stray saved-index column is dropped defensively if the CSV happens to contain one.
visits = (diabetic
          .drop(columns=['Unnamed: 0'], errors='ignore')
          .sort_values(['patient_nbr', 'encounter_id']))

# Target: time_in_hospital of the same patient's next encounter (NaN for a patient's last visit).
visits[TARGET_COL] = visits.groupby('patient_nbr')['time_in_hospital'].shift(-1)
pairs = visits.dropna(subset=[TARGET_COL]).reset_index(drop=True)
# A patient with k visits contributes k-1 pairs.
assert len(pairs) == diabetic['patient_nbr'].duplicated().sum(), "Unexpected number of follow-on pairs"

# Numeric columns: standardise to zero mean and unit variance (population std, as np.std does).
numeric = pairs[NUMERIC_COLS].astype(np.float32)
numeric = (numeric - numeric.mean()) / numeric.std(ddof=0)

# Every remaining column is treated as categorical, including the integer *_id codes.
# Missing values ('?' in the raw file, or NaN after parsing) become their own 'missing'
# level, so the one-hot coding doubles as a missingness mask.
categorical_cols = [col for col in pairs.columns
                    if col not in ID_COLS + NUMERIC_COLS + [TARGET_COL]]
categorical = pairs[categorical_cols].astype(object)
categorical = categorical.mask(categorical.isna() | categorical.eq('?'), 'missing').astype(str)
one_hot = pd.get_dummies(categorical, dtype=np.float32)
# Indicator columns that never vary (e.g. a drug nobody was prescribed) carry no information.
one_hot = one_hot.loc[:, one_hot.min() != one_hot.max()]

# The target is left unscaled (in days) so that prediction errors stay interpretable.
df = pd.concat([numeric, one_hot, pairs[[TARGET_COL]].astype(np.float32)], axis=1)

X = df.drop(TARGET_COL, axis=1).values
Y = df[TARGET_COL].values

print(X.shape, Y.shape)