In [63]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [64]:
# Load the dataset from a CSV file; the path is relative to the working directory.
new_data = pd.read_csv('data/new_data.csv')

In [66]:
# Label-encode every categorical column of new_data in place.
# The three *_id columns are cast to categorical explicitly (presumably they
# are stored as numbers and would not be caught by the object-dtype selection
# below -- confirm against the CSV).
for id_col in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id'):
    new_data[id_col] = pd.Categorical(new_data[id_col])

# Every object-dtype column also becomes categorical.
cat_cols = list(new_data.select_dtypes(include=[object]).columns)
for col in cat_cols:
    new_data[col] = pd.Categorical(new_data[col])

# One encoder instance is re-fitted per column, so after the loop `le` only
# remembers the classes of the last column it saw.
le = preprocessing.LabelEncoder()

category_names = list(new_data.select_dtypes(include=['category']).columns)
col_to_encode = new_data[category_names]
for col in col_to_encode:
    new_data[col] = le.fit_transform(new_data[col])


In [35]:
##### Up-sample logistic regression  #####

In [36]:
# Up-sample the minority class (readmitin30days == 1) with replacement until
# it has as many rows as the majority class (readmitin30days == 0).
from sklearn.utils import resample

majority = new_data[new_data.readmitin30days == 0]
minority = new_data[new_data.readmitin30days == 1]

# Take the target size from the data instead of hard-coding a row count, so
# the classes stay balanced if the input file changes.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=123,
)
df_upsampled = pd.concat([majority, minority_upsampled])

# NOTE(review): the resampling is done on the full dataset. Any train/test
# split made from df_upsampled afterwards can place copies of the same
# minority row on both sides, which inflates test metrics. Consider splitting
# first and up-sampling only the training part.
df_upsampled.readmitin30days.value_counts()

1    90409
0    90409
Name: readmitin30days, dtype: int64

In [37]:
# Feature matrix for the up-sampled experiment: df_upsampled minus the two
# identifier columns and the target column.
droplis = ['encounter_id','patient_nbr','readmitin30days']
new_data_train = df_upsampled.drop(columns=droplis)

In [38]:
# Split the up-sampled data into training (80%) and testing (20%) sets.
y = df_upsampled.readmitin30days
# A fixed seed makes the split, and every metric computed from it, identical
# on each run; 123 matches the seed already used for resampling.
X_train, X_test, y_train, y_test = train_test_split(
    new_data_train, y, test_size=0.2, random_state=123
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(144654, 48) (144654,)
(36164, 48) (36164,)


In [39]:
## Train logistic regression on the up-sampled training split.
## C is scikit-learn's inverse regularisation strength, so C=1e4 means very
## weak regularisation.
decision_logit = linear_model.LogisticRegression(C=1e4).fit(X_train, y_train)

## Mean accuracy on the same data the model was trained on (not held out):
decision_logit.score(X=X_train, y=y_train)

0.6014420617473419

In [40]:
# ROC AUC of the up-sampled model on its training split, computed from the
# predicted probability of class 1.
from sklearn.metrics import roc_auc_score
upsample_AUC_train = roc_auc_score(
    y_true=y_train,
    y_score=decision_logit.predict_proba(X_train)[:, 1],
)
upsample_AUC_train

0.6449220929806081

In [41]:
# ROC AUC of the up-sampled model on its test split, computed from the
# predicted probability of class 1.
from sklearn.metrics import roc_auc_score
upsample_AUC_test = roc_auc_score(
    y_true=y_test,
    y_score=decision_logit.predict_proba(X_test)[:, 1],
)
upsample_AUC_test

0.6470790135849378

In [42]:
######### Logistic regression without up-sampling #########

In [43]:
# Feature matrix for the experiment without up-sampling: new_data minus the
# two identifier columns and the target column.
droplis = ['encounter_id','patient_nbr','readmitin30days']
new_data_train2 = new_data.drop(columns=droplis)

In [44]:
# Split the original (not up-sampled) data into training (80%) and testing
# (20%) sets.
y = new_data.readmitin30days
# A fixed seed makes the split, and every metric computed from it, identical
# on each run.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    new_data_train2, y, test_size=0.2, random_state=123
)
print(X2_train.shape, y2_train.shape)
print(X2_test.shape, y2_test.shape)

(81412, 48) (81412,)
(20354, 48) (20354,)


In [45]:
## Train logistic regression on the original (not up-sampled) training split.
## This rebinds `decision_logit`, replacing the model fitted on the
## up-sampled data.
decision_logit = linear_model.LogisticRegression(C=1e4).fit(X2_train, y2_train)

## Mean accuracy on the same data the model was trained on (not held out):
decision_logit.score(X=X2_train, y=y2_train)

0.8885053800422542

In [46]:
# ROC AUC of the model without up-sampling on its training split, computed
# from the predicted probability of class 1.
from sklearn.metrics import roc_auc_score
AUC_train = roc_auc_score(
    y_true=y2_train,
    y_score=decision_logit.predict_proba(X2_train)[:, 1],
)
AUC_train

0.6423030965111737

In [47]:
# ROC AUC of the model without up-sampling on its test split, computed from
# the predicted probability of class 1.
from sklearn.metrics import roc_auc_score
AUC_test = roc_auc_score(
    y_true=y2_test,
    y_score=decision_logit.predict_proba(X2_test)[:, 1],
)
AUC_test

0.6414869439170211

In [51]:
##########   RFE on normalized features   #############

In [52]:
# Feature matrix for the normalised-RFE experiment, built from the original
# (not up-sampled) data. `droplis` is reused from the cell that defined it
# earlier in the notebook.
new_data_train3 = new_data.drop(columns=droplis)

In [53]:
# Add a binary column of pure noise to test against (presumably as a baseline
# for the feature ranking below -- confirm the intent).
# A dedicated seeded generator makes the column identical on every run.
noise_rng = np.random.RandomState(123)
new_data_train3['random'] = noise_rng.randint(0, 2, new_data_train3.shape[0])
new_data_train3['random'].value_counts()

1    51064
0    50702
Name: random, dtype: int64

In [54]:
# Rescale new_data_train3 with scikit-learn's Normalizer and wrap the result
# back into a DataFrame that keeps the original column names.
# NOTE(review): Normalizer rescales each *row* (sample) to unit norm, not each
# column. If per-feature scaling was intended, StandardScaler or MinMaxScaler
# would be the usual choice -- confirm which one is wanted.
from sklearn.preprocessing import Normalizer
transformer = Normalizer()
normal = pd.DataFrame(
    transformer.fit_transform(new_data_train3),
    columns=new_data_train3.columns,
)
target = new_data.readmitin30days

In [55]:
# Rank the normalised features with recursive feature elimination (RFE)
# around a default logistic regression.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn; passing it
# positionally, as RFE(model1, 1), raises a TypeError there.
rfe = RFE(model1, n_features_to_select=1)
# Fit against `target`, the labels taken from new_data next to `normal`,
# rather than the global `y`, which is whatever split cell ran last.
rfe = rfe.fit(normal, target)
print(rfe.n_features_)
print(rfe.support_)
print(rfe.ranking_)

1
[False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False]
[26 31 27 19 18 37 20 34 33 22 30 36  6  1 15  5 25  2 24 14 12  9 45 23
 17 40  7 10 11 13 42 41 49 48 21 16 47 44 43 46  3  8 28 32 35 39 38  4
 29]


In [56]:
# Table of feature name -> RFE rank for the normalised run, best rank first.
# The names must come from `normal`, the frame `rfe` was fitted on: it holds
# the extra 'random' column, so using X_train.columns (one column fewer)
# leaves the last rank paired with a NaN name.
df = pd.DataFrame({"Col": normal.columns, "rfe": rfe.ranking_})
df = df.sort_values(by="rfe", ascending=True)

In [57]:
######## RFE without normalization ##########

In [58]:
# Feature matrix for the un-normalised RFE experiment, built from the original
# (not up-sampled) data. `droplis` is reused from the cell that defined it
# earlier in the notebook.
new_data_train4 = new_data.drop(columns=droplis)

In [59]:
# Rank the raw (un-normalised) features with recursive feature elimination
# (RFE) around a default logistic regression, eliminating down to one feature.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn; passing it
# positionally, as RFE(model, 1), raises a TypeError there.
rfe1 = RFE(model, n_features_to_select=1)
# new_data_train4 is derived from new_data, so take the labels from new_data
# directly instead of the global `y`, which is whatever split cell ran last.
rfe1 = rfe1.fit(new_data_train4, new_data.readmitin30days)
print(rfe1.n_features_)
print(rfe1.support_)
print(rfe1.ranking_)

1
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False]
[43 34 21 31 29 39 32 45 46 33 41 44 25  4 20 30 28 10  9 35  2 15 23 36
 26  3 17 18  6  1 13  5 47 48 27 37 11 22 14 24 16  8 42 38 40 12 19  7]


In [60]:
# Table of feature name -> RFE rank for the un-normalised run, best rank first.
# The names come from new_data_train4, the frame `rfe1` was fitted on, rather
# than X_train, which belongs to the up-sampled experiment.
df1 = pd.DataFrame({"Col": new_data_train4.columns, "rfe1": rfe1.ranking_})
df1 = df1.sort_values(by="rfe1", ascending=True)

In [61]:
###### compare two rfe ########

In [62]:
# Put the two ranking tables side by side (rows are matched on the shared row
# index) and display them ordered by the normalised-run rank.
result = pd.concat((df, df1), axis="columns")
result.sort_values("rfe")

Unnamed: 0,Col,rfe,Col.1,rfe1
13,number_inpatient,1,number_inpatient,4.0
17,metformin,2,metformin,10.0
40,change,3,change,16.0
47,coverByInsurance,4,coverByInsurance,7.0
15,max_glu_serum,5,max_glu_serum,30.0
12,number_emergency,6,number_emergency,25.0
26,pioglitazone,7,pioglitazone,17.0
41,diabetesMed,8,diabetesMed,8.0
21,glimepiride,9,glimepiride,15.0
27,rosiglitazone,10,rosiglitazone,18.0
