# Logistic Regression

## Resampling: upsampling (minority class resampled with replacement)
## Feature Selection: Wrapper Method
## Data: df3_grouped

In [1]:
from sklearn import __version__

# Record the library version so the results below can be tied to it.
version_message = 'The scikit-learn version is {}.'.format(__version__)
print(version_message)

The scikit-learn version is 0.19.1.


### Import Relevant Libraries

In [21]:
import pandas as pd, numpy as np #basic packages

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE

from sklearn.metrics import accuracy_score, precision_score, log_loss, confusion_matrix

from sklearn.utils import resample

import time

# Import Data and Define Global Variables

In [22]:
# Load the pre-processed dataset; the first CSV column is used as the row index.
df = pd.read_csv("data_folder/df3_grouped.csv", index_col = 0)
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Asian,...,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted
0,0.024369,0.999109,0.0,0.024369,0.0,0.0,0.0,0.024369,0,0,...,0,0,1,0,0,0,1,1,0,0
1,0.048069,0.945362,0.0,0.288416,0.0,0.0,0.0,0.144208,0,0,...,1,0,1,0,0,1,0,0,1,0
2,0.105409,0.579751,0.263523,0.68516,0.105409,0.0,0.052705,0.316228,1,0,...,0,0,1,0,0,0,1,0,1,0
3,0.042201,0.928427,0.021101,0.33761,0.0,0.0,0.0,0.147704,0,0,...,1,0,1,0,0,1,0,0,1,0
4,0.019277,0.983135,0.0,0.154217,0.0,0.0,0.0,0.096386,0,0,...,0,0,1,0,0,1,0,0,1,0


In [23]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Resample with Upsampling

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): no `stratify` argument is passed, so the class ratio is not
# forced to match between the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [25]:
# Class distribution of the training labels before any resampling.
print(y_train.value_counts())

0    63304
1     7932
Name: readmitted, dtype: int64


In [29]:
# Count the training labels per class once and reuse the result.
class_counts = y_train.value_counts()
print(class_counts)

# Midpoint between the two class sizes (half of the total number of rows).
n_balance = int((class_counts[0] + class_counts[1]) / 2)
print(n_balance)

0    63304
1     7932
Name: readmitted, dtype: int64
35618


In [30]:
# Re-attach the target column to the training features so both can be
# resampled together, row by row.
df_train = X_train.join(y_train)

In [31]:
# Preview the joined training frame (features plus the target column).
df_train.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Asian,...,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted
15992,0.204039,0.918176,0.0,0.285655,0.0,0.0,0.0,0.183635,0,0,...,0,0,1,0,0,1,0,0,1,0
10606,0.097503,0.950654,0.0,0.268133,0.0,0.0,0.0,0.121879,0,0,...,0,0,1,0,0,0,1,1,0,0
64779,0.026279,0.972338,0.052559,0.183956,0.0,0.0,0.0,0.131397,0,0,...,0,0,1,0,0,0,1,1,0,0
83257,0.152195,0.856095,0.0,0.456584,0.0,0.019024,0.076097,0.171219,0,0,...,0,0,1,0,0,1,0,0,1,0
4204,0.221389,0.940902,0.0,0.221389,0.0,0.0,0.0,0.129143,0,0,...,0,0,1,0,0,0,1,0,1,0


In [32]:
# Separate majority and minority classes
df_majority = df_train[df_train.readmitted == 0]
df_minority = df_train[df_train.readmitted == 1]

# Upsample the minority class: draw rows with replacement until it is as
# large as the majority class. The target size is read from the data instead
# of being hard-coded, so the cell stays correct if the split or the dataset
# changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match majority class size
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_train_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts; both classes should now have the same size
df_train_upsampled.readmitted.value_counts()

1    63304
0    63304
Name: readmitted, dtype: int64

# Separate Training Data into X and y again

In [33]:
# Split the balanced training frame back into features (all but the last
# column) and target (last column). This replaces the earlier X_train/y_train.
X_train, y_train = df_train_upsampled.iloc[:, :-1], df_train_upsampled.iloc[:, -1]

In [34]:
# Sanity check: the rebuilt training objects have the same container types
# as the untouched test objects.
same_target_type = type(y_test) == type(y_train)
same_feature_type = type(X_test) == type(X_train)
same_target_type & same_feature_type

True

In [35]:
# Confirm the class counts of the training target after resampling.
y_train.value_counts()

1    63304
0    63304
Name: readmitted, dtype: int64

# Wrapper Method Feature Selection

In [36]:
def top_40(model, X, y, n_features=40):
    """Pick features with recursive feature elimination (RFE).

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RFE`` to rank the features.
    X : table of features
        Passed unchanged to ``RFE.fit``.
    y : target values
        Passed unchanged to ``RFE.fit``.
    n_features : int, default 40
        How many features RFE should keep. The default reproduces the
        original fixed behaviour.

    Returns
    -------
    list of int
        Positional column indices of the selected features, in ascending
        order.

    Side effects
    ------------
    Prints the elapsed wall-clock time in seconds.
    """
    start = time.time()

    # Pass the feature count by keyword: newer scikit-learn releases no longer
    # accept it positionally, while older ones accept the keyword as well.
    selector = RFE(model, n_features_to_select=n_features)
    selector.fit(X, y)

    # support_ is a boolean mask over columns; keep the positions that are set.
    keep = [idx for idx, selected in enumerate(selector.support_) if selected]

    end = time.time()

    print("Runtime: %g" % (end - start))

    return keep

# Logistic Regression Algorithm

In [37]:
def lr(X_train, y_train, X_test):
    """Fit a default ``LogisticRegression`` and predict labels for ``X_test``.

    Parameters
    ----------
    X_train : training features
    y_train : training target
    X_test : features to predict on

    Returns
    -------
    The predictions produced by the fitted model's ``predict`` for ``X_test``.

    Side effects
    ------------
    Prints the elapsed wall-clock time (fit + predict) in seconds.
    """
    t0 = time.time()

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    elapsed = time.time() - t0

    print("Runtime: %g" % elapsed)

    return predictions

In [38]:
# Run the RFE feature selection with logistic regression as the ranking
# estimator. This is the slow step of the notebook: the recorded run
# reported a runtime of about 727 seconds.
lr_keep = top_40(LogisticRegression(), X_train, y_train)

Runtime: 726.653


In [39]:
# Restrict both splits to the selected columns, then fit and predict.
X_train_selected = X_train.iloc[:, lr_keep]
X_test_selected = X_test.iloc[:, lr_keep]
lr_pred = lr(X_train_selected, y_train, X_test_selected)

Runtime: 0.514167


# Results

In [40]:
# Score the held-out predictions.
test_accuracy = accuracy_score(y_test, lr_pred)
test_precision = precision_score(y_test, lr_pred)
print("Accuracy:" , test_accuracy," Precision:" , test_precision)

Accuracy: 0.711529642974  Precision: 0.187963821892


In [41]:
# Confusion matrix of the test predictions, flattened row by row
# (first row, then second row) so it lines up with the labels below.
cf = confusion_matrix(y_test, lr_pred)
cfa = [count for row in (cf[0], cf[1]) for count in row]
label = ["tn", "fp", "fn", "tp"]

In [42]:
# Print each confusion-matrix cell next to its label.
for cell_name, cell_count in zip(label, cfa):
    print(cell_name, cell_count)

tn 20102
fp 7003
fn 1804
tp 1621
