# Title

## Introduction

## Preprocessing

### Feature Reduction

The papers used to inform the elimination of columns were "Impact of HbA1c Measurement on Hospital Readmission Rates: Analysis of 70,000 Clinical Database Patient Records" found at https://downloads.hindawi.com/journals/bmri/2014/781670.pdf and "Risk factors associated with 30-day readmission and length of stay in patients with type 2 diabetes" found at https://www.sciencedirect.com/science/article/abs/pii/S1056872716307383.

#### Overview of Support from First article

Cut weight and payer code.  The paper finds gender not to be statistically significant, so cut that as well.  Use only one encounter per patient, ideally the first encounter, to ensure statistical independence; this may not be necessary for all models, but it is still ideal for our purposes.  Remove all encounters involving discharge to hospice or death.  The HbA1c measurement is more useful when treated as a binary variable indicating whether the test was administered at all, rather than using the results of the test.

In [122]:
#Import packages 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data, then discard weight, payer code, and gender.
df = pd.read_csv("diabetic_data_train.csv")
df = df.drop(columns=['weight', 'gender', 'payer_code'])

# Load the ID mapping file and slice it by row position into the lookup tables
# for admission type, discharge disposition, and admission source.
dfid = pd.read_csv("IDs_mapping.csv")
df_admissionTypeID = dfid.iloc[:8]
df_dischargeID = (
    dfid.iloc[10:40]
    .reset_index(drop=True)
    .set_axis(['discharge_disposition_id', "description"], axis=1)
    .dropna()  # rows with a missing value are removed from the discharge table
)
df_admissionSourceID = (
    dfid.iloc[42:]
    .reset_index(drop=True)
    .set_axis(['admission_source_id', "description"], axis=1)
)

# Keep a single encounter per patient: the first one listed for each patient_nbr.
df = (
    df
    .drop_duplicates(subset='patient_nbr', keep='first')
    .reset_index(drop=True)
)

#remove all encounters occurring in discharge to hospice or death
#the function commented out below searched the discharge codes to find the ones that contain hospice and expired,
#which can be interpreted as death.
#hospice=df_dischargeID['description'].str.contains('hospice')
#Hospice=df_dischargeID['description'].str.contains('Hospice')
#expired=df_dischargeID['description'].str.contains('expired')
#Expired=df_dischargeID['description'].str.contains('Expired')
#death_array=[]
#hospiceArr=df_dischargeID[hospice].discharge_disposition_id.to_numpy()
#for i in range(len(hospiceArr)):
#    death_array.append(hospiceArr[i])
#HospiceArr=df_dischargeID[Hospice].discharge_disposition_id.to_numpy()
#for i in range(len(HospiceArr)):
#    death_array.append(HospiceArr[i])
#expiredArr=df_dischargeID[expired].discharge_disposition_id.to_numpy()
#for i in range(len(expiredArr)):
#    death_array.append(expiredArr[i])
#ExpiredArr=df_dischargeID[Expired].discharge_disposition_id.to_numpy()
#for i in range(len(ExpiredArr)):
#    death_array.append(ExpiredArr[i])
#np.unique(np.array(death_array))

In [120]:
# Collect the discharge-disposition IDs whose description mentions hospice or
# an expired patient (interpreted as death).
# One case-insensitive pattern covers 'hospice'/'Hospice'/'expired'/'Expired';
# na=False makes a missing description count as "no match" instead of putting
# NaN into the boolean mask.
is_death_or_hospice = df_dischargeID['description'].str.contains(
    'hospice|expired', case=False, regex=True, na=False
)

# Each matching row contributes its ID exactly once, in table order.
death_array = df_dischargeID.loc[
    is_death_or_hospice, 'discharge_disposition_id'
].tolist()

# Display the sorted, de-duplicated IDs.
np.unique(np.array(death_array))

array(['11', '13', '14', '19', '20', '21'], dtype='<U2')

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
# Load diabetic_data.csv and show the non-null count of its first column.
df = pd.read_csv("diabetic_data.csv")
# count() returns a Series labelled by column name, so the first entry is taken
# by position with .iloc; plain [0] on a label-indexed Series is deprecated
# positional access.
df.count().iloc[0]

101766

In [12]:
# Show the column labels of the frame currently bound to df.
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [54]:
# Reload diabetic_data.csv and display it reduced to one row per patient_nbr
# (the first occurrence is the one kept). The reduced frame is only displayed;
# df itself still holds every row.
df = pd.read_csv("diabetic_data.csv")
(
    df
    .drop_duplicates(subset='patient_nbr')
    .reset_index(drop=True)
)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71513,443842016,183087545,Caucasian,Female,[70-80),?,1,1,7,9,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
71514,443842022,188574944,Other,Female,[40-50),?,1,1,7,14,...,No,Up,No,No,No,No,No,Ch,Yes,>30
71515,443842070,140199494,Other,Female,[60-70),?,1,1,7,2,...,No,Steady,No,No,No,No,No,No,Yes,>30
71516,443842340,120975314,Caucasian,Female,[80-90),?,1,1,7,5,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [40]:
# Display the distinct patient numbers, in order of first appearance, as a
# one-column frame.
pd.DataFrame({'patient_nbr': df['patient_nbr'].unique()})

Unnamed: 0,patient_nbr
0,8222157
1,55629189
2,86047875
3,82442376
4,42519267
...,...
71513,183087545
71514,188574944
71515,140199494
71516,120975314


In [29]:
# NOTE(review): df_unique_patient is not assigned in any cell visible here; confirm it is
# defined elsewhere, otherwise this cell raises NameError on a fresh kernel.
df_unique_patient.patient_nbr

0          8222157
1         55629189
2         86047875
3         82442376
4         42519267
           ...    
71513    183087545
71514    188574944
71515    140199494
71516    120975314
71517    175429310
Name: patient_nbr, Length: 71518, dtype: int64

In [None]:
# Display the frame currently bound to df.
df

In [4]:
# Display only the metformin, repaglinide, and nateglinide columns.
df.loc[:, ['metformin', 'repaglinide', 'nateglinide']]

Unnamed: 0,metformin,repaglinide,nateglinide
0,No,No,No
1,No,No,No
2,No,No,No
3,No,No,No
4,No,No,No
...,...,...,...
101761,Steady,No,No
101762,No,No,No
101763,Steady,No,No
101764,No,No,No


In [89]:
# Load the ID mapping file and slice it by row position into the admission
# type, discharge disposition, and admission source lookup tables.
dfid = pd.read_csv("IDs_mapping.csv")
df_admissionTypeID = dfid.iloc[0:8]

# The text column must be named "description": other cells in this notebook
# index df_dischargeID['description'].
df_dischargeID = dfid.iloc[10:40].reset_index(drop=True)
df_dischargeID.columns = ['discharge_disposition_id', "description"]
# Rows with a missing value are dropped so that string matching on the
# description column produces a purely boolean mask.
df_dischargeID = df_dischargeID.dropna()

df_admissionSourceID = dfid.iloc[42:].reset_index(drop=True)
df_admissionSourceID.columns = ['admission_source_id', "description"]