In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

## Load the data

In [2]:
# Main table: one row per hospital encounter.
dia_data0 = pd.read_csv('dataset_diabetes/diabetic_data.csv')
# Companion lookup file; presumably decodes the numeric *_id columns
# (it is not used by the cells shown in this notebook section).
ids_map = pd.read_csv('dataset_diabetes/IDs_mapping.csv')

## Look at some of the data

Most of the columns appear to be categorical (a few, such as `time_in_hospital` and `num_lab_procedures`, are numeric).

In [3]:
# Preview the first five rows of the raw table.
dia_data0.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# Inspect the diag_1 column; its codes are loaded as dtype object, not as numbers.
dia_data0.loc[:, 'diag_1']

0         250.83
1            276
2            648
3              8
4            197
           ...  
101761    250.13
101762       560
101763        38
101764       996
101765       530
Name: diag_1, Length: 101766, dtype: object

## Check for number of unique values for each potential feature

In [5]:
# Report the cardinality of every raw column so that constant columns and
# very high-cardinality columns stand out.
potential_features = dia_data0.columns
for col in potential_features:
    n_unique = len(dia_data0[col].unique())
    print(f'Number of unique {col}: {n_unique}')

Number of unique encounter_id: 101766
Number of unique patient_nbr: 71518
Number of unique race: 6
Number of unique gender: 3
Number of unique age: 10
Number of unique weight: 10
Number of unique admission_type_id: 8
Number of unique discharge_disposition_id: 26
Number of unique admission_source_id: 17
Number of unique time_in_hospital: 14
Number of unique payer_code: 18
Number of unique medical_specialty: 73
Number of unique num_lab_procedures: 118
Number of unique num_procedures: 7
Number of unique num_medications: 75
Number of unique number_outpatient: 39
Number of unique number_emergency: 33
Number of unique number_inpatient: 21
Number of unique diag_1: 717
Number of unique diag_2: 749
Number of unique diag_3: 790
Number of unique number_diagnoses: 16
Number of unique max_glu_serum: 4
Number of unique A1Cresult: 4
Number of unique metformin: 4
Number of unique repaglinide: 4
Number of unique nateglinide: 4
Number of unique chlorpropamide: 4
Number of unique glimepiride: 4
Number of

## Delete uninformative features

The 101,766 patient encounters all have the same value for 2 features:

'examide' and 'citoglipton'

These two features will not be useful so we can delete them. 

For simplicity, we will also delete the features that have a very large number of unique values and are uninformative as-is (diag_1, diag_2, and diag_3).

We will also delete any features that contain '?' placeholders, i.e. missing values (race, weight, payer_code, medical_specialty).

Later, we will also remove from the training data the encounter and patient identifiers and the Y value we want to predict ('encounter_id', 'patient_nbr', 'readmitted').

In [6]:
# Trim the raw table; the columns to drop are grouped by the reason for removal.
dia_data = dia_data0.drop(
    columns=[
        # same value for every encounter
        'examide', 'citoglipton',
        # contain '?' / missing values
        'weight', 'race', 'payer_code', 'medical_specialty',
        # hundreds of distinct codes each
        'diag_1', 'diag_2', 'diag_3',
    ]
)
dia_data.shape

(101766, 41)

In [7]:
# For each remaining column, print how many distinct values it has and then
# the distinct values themselves.
potential_features = dia_data.columns
for col in potential_features:
    distinct = dia_data[col].unique()
    print(f'Number of unique {col}: {len(distinct)}')
    print(distinct)

Number of unique encounter_id: 101766
[  2278392    149190     64410 ... 443854148 443857166 443867222]
Number of unique patient_nbr: 71518
[  8222157  55629189  86047875 ... 140199494 120975314 175429310]
Number of unique gender: 3
['Female' 'Male' 'Unknown/Invalid']
Number of unique age: 10
['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']
Number of unique admission_type_id: 8
[6 1 2 3 4 5 8 7]
Number of unique discharge_disposition_id: 26
[25  1  3  6  2  5 11  7 10  4 14 18  8 13 12 16 17 22 23  9 20 15 24 28
 19 27]
Number of unique admission_source_id: 17
[ 1  7  2  4  5  6 20  3 17  8  9 14 10 22 11 25 13]
Number of unique time_in_hospital: 14
[ 1  3  2  4  5 13 12  9  7 10  6 11  8 14]
Number of unique num_lab_procedures: 118
[ 41  59  11  44  51  31  70  73  68  33  47  62  60  55  49  75  45  29
  35  42  66  36  19  64  25  53  52  87  27  37  46  28  48  72  10   2
  65  67  40  54  58  57  43  32  83  34  39  69  38  56 

## Save trimmed data

The categorical features will be one-hot encoded later with scikit-learn's OneHotEncoder:

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

The potential features (X) are all remaining columns except the identifiers 'encounter_id' and 'patient_nbr' and the variable we want to predict, 'readmitted'; these three columns are removed below.

In [8]:
# Pull the two identifier columns and the prediction target out as separate
# NumPy arrays; every other column stays in X_df as a candidate feature.
id_and_target_cols = ['encounter_id', 'patient_nbr', 'readmitted']
encounter_id, patient_nbr, Y = (
    dia_data[name].to_numpy() for name in id_and_target_cols
)
X_df = dia_data.drop(columns=id_and_target_cols)

In [9]:
# Convert the feature frame to a plain NumPy array (column names are not kept).
# X_df mixes string columns (e.g. gender) with integer columns, so the result
# is expected to have dtype=object.
X_array = X_df.to_numpy()

In [10]:
# Write each array to its own .npy file next to the source CSVs.
# NOTE(review): X_array and Y hold strings / mixed values, so they are most
# likely object-dtype arrays; np.save pickles those, and np.load will then
# need allow_pickle=True -- confirm in whichever notebook reads these files.
for out_path, out_array in (
    ('dataset_diabetes/X_array.npy', X_array),
    ('dataset_diabetes/Y.npy', Y),
    ('dataset_diabetes/encounter_id.npy', encounter_id),
    ('dataset_diabetes/patient_nbr.npy', patient_nbr),
):
    np.save(out_path, out_array)