In [37]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [38]:
data = pd.read_csv('data/diabetic_data.csv')
data1 = pd.read_csv('data/IDs_mapping.csv')

In [39]:
print(data.shape)
data.head()

(101766, 50)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [40]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [41]:
# replace ? to NaN (missing value)
data= data.replace("?", np.NaN)

In [42]:
#columns contain missing value
data.columns[data.isna().any()].tolist()

['race',
 'weight',
 'payer_code',
 'medical_specialty',
 'diag_1',
 'diag_2',
 'diag_3']

In [49]:
#split admission id into multiple columns
data2 = data1[0:8]
data3 = data1[10:40]
data4 = data1[42:67]
data3 = data3.rename(index=str, columns={"admission_type_id":"discharge_disposition_id"})
data4 = data4.rename(index=str, columns={"admission_type_id":"admission_source_id"})
data2.admission_type_id = data2.admission_type_id.astype(int)
data3.discharge_disposition_id = data3.discharge_disposition_id.astype(int)
data4.admission_source_id = data4.admission_source_id.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [48]:
#merge two data and rename the columns
new_data = pd.merge(data, data2, on='admission_type_id', how='inner')
new_data = pd.merge(new_data, data3, on='discharge_disposition_id', how='inner')
new_data = pd.merge(new_data, data4, on='admission_source_id', how='inner')
new_data.rename(columns={'description_x':'admission_type_id','description_y':'discharge_disposition_id','description':'admission_source_id'}, inplace=True)
new_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id.1,discharge_disposition_id.1,admission_source_id.1
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,NO,,Not Mapped,Physician Referral
1,1968528,720936,Caucasian,Female,[70-80),,6,25,1,10,...,No,No,No,No,Ch,Yes,>30,,Not Mapped,Physician Referral
2,2223336,558360,AfricanAmerican,Female,[60-70),,6,25,1,9,...,No,No,No,No,No,No,NO,,Not Mapped,Physician Referral
3,2298006,2519748,Caucasian,Male,[60-70),,6,25,1,4,...,No,No,No,No,Ch,Yes,NO,,Not Mapped,Physician Referral
4,2356308,608841,AfricanAmerican,Female,[50-60),,6,25,1,1,...,No,No,No,No,No,Yes,>30,,Not Mapped,Physician Referral
5,2363592,1059561,Caucasian,Female,[50-60),,6,25,1,1,...,No,No,No,No,No,Yes,NO,,Not Mapped,Physician Referral
6,2398146,8147493,Caucasian,Male,[60-70),,6,25,1,1,...,No,No,No,No,No,Yes,NO,,Not Mapped,Physician Referral
7,2422806,3377124,Caucasian,Male,[40-50),,6,25,1,14,...,No,No,No,No,Ch,Yes,>30,,Not Mapped,Physician Referral
8,2466036,4311585,Caucasian,Male,[60-70),,6,25,1,10,...,No,No,No,No,Ch,Yes,>30,,Not Mapped,Physician Referral
9,2473188,981198,Caucasian,Female,[60-70),,6,25,1,5,...,No,No,No,No,No,Yes,>30,,Not Mapped,Physician Referral
