In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle

# Load the source CSV; the path is resolved relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../../../Sherry-K/data/new_data.csv')

In [136]:
# Work on an independent deep copy so `data` keeps the values exactly as loaded.
X = data.copy(deep=True)

In [137]:
# Report the frame's (rows, columns) and preview its first rows.
print(X.shape)
X.head()

(86500, 55)


Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,target,self_pay,medicaid/medicare,coverByInsurance,f_diag,s_diag,t_diag,clustering3,clustering4,clustering2
0,0,220288932,32885451,Caucasian,Male,[50-60),2,6,7,10,...,0,0,0,1,diseases of the circulatory system,neoplasms,neoplasms,0,1,1
1,1,18126858,4788405,Caucasian,Male,[50-60),1,1,7,5,...,0,0,0,0,diseases of the circulatory system,diseases of the circulatory system,diseases of the respiratory system,2,0,0
2,2,21616398,1660293,Caucasian,Female,[60-70),1,1,7,3,...,0,0,0,0,diseases of the digestive system,diseases of the digestive system,"endocrine, nutritional and metabolic diseases,...",2,0,0
3,3,30596088,79713063,Caucasian,Female,[70-80),1,3,7,5,...,0,0,0,0,"endocrine, nutritional and metabolic diseases,...","endocrine, nutritional and metabolic diseases,...",diseases of the genitourinary system,2,0,0
4,4,203180148,103254777,AfricanAmerican,Male,[30-40),1,6,7,10,...,0,0,1,0,diseases of the circulatory system,diseases of the genitourinary system,"endocrine, nutritional and metabolic diseases,...",0,1,1


In [138]:
## identifier columns are not used by the model, so remove them up front
X = X.drop(columns=['Unnamed: 0', 'encounter_id', 'patient_nbr'])

## exclude rows whose discharge disposition id is one of the deceased-patient codes
dead_ids = [11, 13, 14, 19, 20, 21]
X = X.loc[~X['discharge_disposition_id'].isin(dead_ids)]

In [139]:
## treat the id-coded columns as categorical by storing their values as strings
X = X.astype({
    'admission_type_id': str,
    'discharge_disposition_id': str,
    'admission_source_id': str,
})


In [140]:
# Inspect per-column dtypes after the id columns were cast to strings.
X.dtypes

race                        object
gender                      object
age                         object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone                object
rosiglitazone               object
acarbose            

In [141]:
# Seed the grouped specialty column with the raw specialty values.
X = X.assign(medical_specialty_new=X['medical_specialty'])


In [142]:
# Specialty labels that keep their own category. Note the list holds nine
# labels; every other value is relabelled 'Other' by the following cell.
top10_medical = [
    'No',
    'InternalMedicine',
    'Emergency/Trauma',
    'Family/GeneralPractice',
    'Cardiology',
    'Surgery-General',
    'Nephrology',
    'Orthopedics',
    'Orthopedics-Reconstructive',
]

In [143]:
# Keep the retained specialty labels, relabel every other value as 'Other',
# then discard the raw specialty column.
X['medical_specialty_new'] = X['medical_specialty_new'].where(
    X['medical_specialty_new'].isin(top10_medical), 'Other'
)
X = X.drop(columns=['medical_specialty'])

In [144]:
# Frequency of each label in the grouped specialty column.
X.medical_specialty_new.value_counts()

No                            41340
InternalMedicine              12066
Other                          7876
Emergency/Trauma               6299
Family/GeneralPractice         6179
Cardiology                     4495
Surgery-General                2599
Nephrology                     1319
Orthopedics                    1211
Orthopedics-Reconstructive     1048
Name: medical_specialty_new, dtype: int64

In [145]:
# Specialty frequencies in the frame as loaded (`data`), i.e. before any row
# filtering or grouping — presumably kept for comparison with the grouped counts.
data.medical_specialty.value_counts()

No                                      42471
InternalMedicine                        12408
Emergency/Trauma                         6429
Family/GeneralPractice                   6337
Cardiology                               4554
Surgery-General                          2635
Nephrology                               1384
Orthopedics                              1219
Orthopedics-Reconstructive               1051
Radiologist                               974
Pulmonology                               735
Psychiatry                                720
Urology                                   581
ObstetricsandGynecology                   575
Surgery-Cardiovascular/Thoracic           559
Gastroenterology                          468
Surgery-Vascular                          459
Surgery-Neuro                             393
PhysicalMedicineandRehabilitation         338
Oncology                                  298
Pediatrics                                207
Hematology/Oncology               

In [146]:
# Build `discharge_id`: the listed discharge codes stay as-is, any other code
# becomes 'Other'; the raw discharge column is dropped afterwards.
top10_discharge = ['1', '3', '6', '18', '2', '22', '5', '25', '4']
X = (
    X.assign(
        discharge_id=X['discharge_disposition_id'].where(
            X['discharge_disposition_id'].isin(top10_discharge), 'Other'
        )
    )
    .drop(columns=['discharge_disposition_id'])
)

In [147]:
# Frequency of each label in the grouped discharge column.
X.discharge_id.value_counts()

1        51279
3        11873
6        10881
18        3101
2         1815
22        1707
Other     1220
5         1004
25         843
4          709
Name: discharge_id, dtype: int64

In [148]:
# Build `admission_id`: the listed admission-source codes stay as-is, any other
# code becomes 'Other'; the raw admission-source column is dropped afterwards.
top10_admission = ['7', '1', '17', '4', '6', '2', '5']
X = (
    X.assign(
        admission_id=lambda frame: frame['admission_source_id'].mask(
            ~frame['admission_source_id'].isin(top10_admission), 'Other'
        )
    )
    .drop(columns=['admission_source_id'])
)

In [149]:
# Print the value frequencies of every column, one column after another.
for column_name in X.columns:
    print(X[column_name].value_counts())

Caucasian          63151
AfricanAmerican    15906
Other               3137
Hispanic            1704
Asian                534
Name: race, dtype: int64
Female             45416
Male               39013
Unknown/Invalid        3
Name: gender, dtype: int64
[70-80)     21482
[60-70)     18815
[50-60)     14512
[80-90)     13917
[40-50)      8133
[30-40)      3215
[90-100)     2235
[20-30)      1399
[10-20)       583
[0-10)        141
Name: age, dtype: int64
1    44490
3    15877
2    15481
6     4382
5     3910
8      267
7       16
4        9
Name: admission_type_id, dtype: int64
3     14856
2     14429
1     11751
4     11573
5      8253
6      6238
7      4840
8      3621
9      2463
10     1923
11     1502
12     1177
13      956
14      850
Name: time_in_hospital, dtype: int64
1      2676
43     2334
44     2110
45     1975
38     1878
40     1828
46     1822
41     1778
47     1762
39     1757
42     1745
37     1737
48     1706
49     1692
36     1653
50     1613
35     1589
51     15

In [150]:
# Names of the integer-typed columns, keeping only the first eight in column order.
to_standardize = X.select_dtypes(include=['int']).columns.tolist()[:8]

In [151]:
# Show which column names were selected.
to_standardize

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses']

In [152]:
# Inspect per-column dtypes before the medication columns are dropped.
X.dtypes

race                        object
gender                      object
age                         object
admission_type_id           object
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone                object
rosiglitazone               object
acarbose                    object
miglitol                    object
troglitazone                object
tolazamide          

In [153]:
## dropping medication columns that show few changes, along with 'change' and 'diabetesMed'
## ('examide' was listed twice in the original list; each column is now named once)
X = X.drop(
    columns=[
        'glimepiride',
        'acetohexamide',
        'citoglipton',
        'examide',
        'tolazamide',
        'glyburide-metformin',
        'glipizide-metformin',
        'glimepiride-pioglitazone',
        'metformin-rosiglitazone',
        'metformin-pioglitazone',
        'troglitazone',
        'change',
        'diabetesMed',
    ]
)


In [154]:
# Inspect per-column dtypes after the medication columns were dropped.
X.dtypes

race                     object
gender                   object
age                      object
admission_type_id        object
time_in_hospital          int64
num_lab_procedures        int64
num_procedures            int64
num_medications           int64
number_outpatient         int64
number_emergency          int64
number_inpatient          int64
number_diagnoses          int64
max_glu_serum            object
A1Cresult                object
metformin                object
repaglinide              object
nateglinide              object
chlorpropamide           object
glipizide                object
glyburide                object
tolbutamide              object
pioglitazone             object
rosiglitazone            object
acarbose                 object
miglitol                 object
insulin                  object
target                    int64
self_pay                  int64
medicaid/medicare         int64
coverByInsurance          int64
f_diag                   object
s_diag  

In [155]:
# Persist the current frame to the working directory, leaving the row index out of the file.
X.to_csv(path_or_buf='final_data.csv', index=False)

In [156]:
# Inspect per-column dtypes of the frame that was just written to disk.
X.dtypes

race                     object
gender                   object
age                      object
admission_type_id        object
time_in_hospital          int64
num_lab_procedures        int64
num_procedures            int64
num_medications           int64
number_outpatient         int64
number_emergency          int64
number_inpatient          int64
number_diagnoses          int64
max_glu_serum            object
A1Cresult                object
metformin                object
repaglinide              object
nateglinide              object
chlorpropamide           object
glipizide                object
glyburide                object
tolbutamide              object
pioglitazone             object
rosiglitazone            object
acarbose                 object
miglitol                 object
insulin                  object
target                    int64
self_pay                  int64
medicaid/medicare         int64
coverByInsurance          int64
f_diag                   object
s_diag  

In [131]:
# Integer-coded columns that the following cell re-types as strings.
cat_change = [
    'clustering3',
    'clustering4',
    'clustering2',
    'self_pay',
    'medicaid/medicare',
    'coverByInsurance',
    'target',
]

In [132]:
# Re-type every column named in `cat_change` as string in a single pass.
X = X.astype(dict.fromkeys(cat_change, str))

In [133]:
# Names of the columns that are still integer-typed.
col_num = X.select_dtypes(include=int).columns.tolist()

In [134]:
# Report the frame's (rows, columns).
X.shape

(84432, 39)

In [178]:
# Fit a standard scaler on the integer-typed columns of X.
# The columns are cast to float64 explicitly: StandardScaler works in floating
# point, and handing it int64 data makes it convert implicitly (older
# scikit-learn versions emit a data-conversion warning for that).
# NOTE(review): the scaler is fit on every row of X; if a train/test split
# happens later, consider fitting on the training rows only — confirm.
scaler = StandardScaler()
scaler.fit(X.select_dtypes(include=int).astype('float64'))

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [179]:
# Serialize the fitted scaler to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
scalerfile = 'scaler.sav'
with open(scalerfile, 'wb') as scaler_out:
    pickle.dump(scaler, scaler_out)

In [180]:
# Reload the scaler from `scalerfile`, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
with open(scalerfile, 'rb') as scaler_in:
    scaler = pickle.load(scaler_in)

In [187]:
# Standardize the integer-typed columns with the fitted scaler.
# Cast to float64 first so the scaler does not have to convert int64 input
# implicitly (older scikit-learn versions warn about that conversion).
X_standard = X.select_dtypes(include=int).astype('float64')
X_standard = scaler.transform(X_standard)

  


In [188]:
# Display the standardized array returned by the scaler.
X_standard

array([[ 1.89580349,  0.10834242, -0.1966791 , ..., -0.21289213,
         1.09342471,  0.82238937],
       [ 0.21067677, -0.19761845,  1.56292913, ..., -0.21289213,
        -0.50213764,  0.82238937],
       [-0.46337392,  0.21032937, -0.78321517, ..., -0.21289213,
         3.48676824,  0.82238937],
       ...,
       [ 2.23282884,  1.43417284, -0.1966791 , ..., -0.21289213,
        -0.50213764,  0.82238937],
       [-0.80039927, -0.19761845, -0.78321517, ..., -0.21289213,
        -0.50213764, -2.78175906],
       [-0.80039927,  0.46529676, -0.78321517, ..., -0.21289213,
        -0.50213764, -1.23712402]])