# SVC
# Resample — SMOTE
### Import Libraries

conda install -c glemaitre imbalanced-learn 


conda update scikit-learn

In [3]:
# Report the installed scikit-learn version so readers can match the environment.
from sklearn import __version__

version_message = 'The scikit-learn version is {}.'.format(__version__)
print(version_message)

The scikit-learn version is 0.19.1.


In [4]:
import pandas as pd, numpy as np #basic packages

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE

from sklearn.metrics import accuracy_score, precision_score, log_loss, confusion_matrix

from imblearn.over_sampling import SMOTE

import time

# Import Data and Define Global Variables

In [5]:
# Load the dataset; the first CSV column is used as the row index.
DATA_PATH = "data_folder/df3_grouped.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Asian,...,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted
0,0.024369,0.999109,0.0,0.024369,0.0,0.0,0.0,0.024369,0,0,...,0,0,1,0,0,0,1,1,0,0
1,0.048069,0.945362,0.0,0.288416,0.0,0.0,0.0,0.144208,0,0,...,1,0,1,0,0,1,0,0,1,0
2,0.105409,0.579751,0.263523,0.68516,0.105409,0.0,0.052705,0.316228,1,0,...,0,0,1,0,0,0,1,0,1,0
3,0.042201,0.928427,0.021101,0.33761,0.0,0.0,0.0,0.147704,0,0,...,1,0,1,0,0,1,0,0,1,0
4,0.019277,0.983135,0.0,0.154217,0.0,0.0,0.0,0.096386,0,0,...,0,0,1,0,0,1,0,0,1,0


In [6]:
# Every column except the last is a feature; the last column (`readmitted`) is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Total column count, target included.
df.shape[1]

255

# Imbalanced Data

In [7]:
# `y` is already a pandas Series, so count the rows per class directly.
y.value_counts()

0    90409
1    11357
Name: readmitted, dtype: int64

# Resample with SMOTE

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Inspect the container type of the held-out target; it is compared against
# the resampled training target further down.
type(y_test)

pandas.core.series.Series

In [10]:
# Oversample with SMOTE on the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)

# `fit_sample` was renamed to `fit_resample` in later imbalanced-learn releases,
# so prefer the new name and fall back to the old one when it is absent.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample
X_resampled, y_resampled = resample(X_train, y_train)

# The resampler may hand back bare arrays, so re-attach the original column
# labels and target name. This keeps the resampled training data aligned with
# X_test / y_test by label as well as by position.
X_train = pd.DataFrame(X_resampled, columns=X.columns)
y_train = pd.Series(y_resampled, name=y.name)

In [None]:
# pd.concat expects a list of objects as its first argument, and axis=1 places
# the target beside the features instead of stacking it underneath them.
# The target is named explicitly so its column label cannot collide with a
# feature column label.
smote_train = pd.concat([X_train, y_train.rename(y.name)], axis=1)

smote_train.head()

In [11]:
# Confirm the resampled training objects use the same container types as the test objects.
same_target_type = type(y_test) == type(y_train)
same_feature_type = type(X_test) == type(X_train)
same_target_type and same_feature_type

True

In [12]:
# Class counts of the training target after SMOTE resampling.
y_train.value_counts()

1    63304
0    63304
dtype: int64

# Wrapper Method Feature Selection

In [13]:
def top_40(model, X, y, n_features=40):
    """Return the positions of the features kept by recursive feature elimination.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RFE`` to rank the features.
    X : array-like
        Feature matrix passed to ``RFE.fit``.
    y : array-like
        Target passed to ``RFE.fit``.
    n_features : int, default 40
        Number of features RFE is asked to retain. The default matches the
        original hard-coded value.

    Returns
    -------
    list of int
        Positions ``i`` for which the fitted selector's ``support_[i]`` is
        truthy, in ascending order.
    """
    # Pass the feature count by keyword: newer scikit-learn releases reject
    # the positional form.
    selector = RFE(model, n_features_to_select=n_features)
    fitted = selector.fit(X, y)

    return [position for position, kept in enumerate(fitted.support_) if kept]

# SVC (Linear Kernel) on Logistic-Regression-Selected Features

In [21]:
def rf(X_train, y_train, X_test):
    """Fit a linear-kernel SVC on the training data and predict the test set.

    Despite its name, this function builds an ``SVC(kernel="linear")``.
    It prints the wall-clock time spent on fit + predict as
    ``Runtime: <seconds>``.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``SVC.fit``.
    X_test
        Features passed to ``SVC.predict``.

    Returns
    -------
    The predictions returned by ``SVC.predict(X_test)``.
    """
    started_at = time.time()

    classifier = SVC(kernel = "linear")
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    finished_at = time.time()
    elapsed = finished_at - started_at

    print("Runtime: %g" % elapsed)

    return y_pred

In [15]:
# Rank features with a logistic-regression-driven RFE on the resampled training set.
ranking_model = LogisticRegression()
rf_keep = top_40(ranking_model, X_train, y_train)

# Persist the kept column positions (not names) for later reference.
kept_positions = pd.Series(rf_keep)
kept_positions.to_csv("top40_features_lr.csv")

In [22]:
# Train and predict using only the RFE-selected columns, chosen by position.
X_train_selected = X_train.iloc[:, rf_keep]
X_test_selected = X_test.iloc[:, rf_keep]
rf_pred = rf(X_train_selected, y_train, X_test_selected)

Runtime: 1356.8


# Results

In [23]:
# Score the held-out predictions.
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred)
print("Accuracy:" , accuracy," Precision:" , precision)

Accuracy: 0.791025221094  Precision: 0.216356306393


In [24]:
# Flatten the first two rows of the confusion matrix, row by row, so the
# entries line up with `label`.
cf = confusion_matrix(y_test, rf_pred)
label = ["tn", "fp", "fn", "tp"]
cfa = [*cf[0], *cf[1]]

In [25]:
# Print each confusion-matrix cell next to its label.
for position, name in enumerate(label):
    print(name, cfa[position])

tn 23023
fp 4082
fn 2298
tp 1127
