In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import f1_score
from time import time
from operator import itemgetter
from scipy.stats import randint
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import  cross_val_score
import os
import subprocess
from time import time
from operator import itemgetter
from scipy.stats import randint
import pandas as pd
import numpy as np
from pprint import pprint


filename = 'df_imputed_scaled_OHE_reduced.pkl'
df = pd.read_pickle(filename)

X = df[df.columns[:-1]]
y = pd.Series(df['heartdisease'])


train_X,test_X,train_y,test_y =train_test_split(X,y,test_size=0.33,shuffle = True, random_state=45)



In [2]:
def report(grid_scores, n_top=3):
    """Report top n_top parameters settings, default n_top=3.

    Args
    ----
    grid_scores -- output from grid or random search
    n_top -- how many to report, of top models

    Returns
    -------
    top_params -- [dict] top parameter settings found in
                  search
    """
    top_scores = sorted(grid_scores,
                        key=itemgetter(1),
                        reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print(("Mean validation score: "
               "{0:.3f} (std: {1:.3f})").format(
               score.mean_validation_score,
               np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

    return top_scores[0].parameters

In [3]:
def run_randomsearch(X, y, clf, para_dist, cv=5,
                     n_iter_search=100):
    """Run a random search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    cf -- scikit-learn Decision Tree
    param_dist -- [dict] list, distributions of parameters
                  to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.

    Returns
    -------
    top_params -- [dict] from report()
    """
    random_search = RandomizedSearchCV(clf,
                        param_distributions=param_dist,
                        n_iter=n_iter_search)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter "
           "settings.").format((time() - start),
                               n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return  top_params

In [4]:
print("-- Random Parameter Search via liblinear 10-fold CV")
#solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’},

    
C_range = np.logspace(-5, 5, 11)
penalty_range = ['l1', 'l2']

param_dist = { 'penalty': penalty_range,
               'C': C_range,}

pprint(param_dist)
lr_model = LogisticRegression()
lr_model_rs = run_randomsearch(X, y, lr_model, param_dist, cv=10, n_iter_search=22)

-- Random Parameter Search via liblinear 10-fold CV
{'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05]),
 'penalty': ['l1', 'l2']}

RandomizedSearchCV took 0.49 seconds for 22 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.796 (std: 0.013)
Parameters: {'penalty': 'l2', 'C': 0.01}

Model with rank: 2
Mean validation score: 0.786 (std: 0.018)
Parameters: {'penalty': 'l2', 'C': 0.1}

Model with rank: 3
Mean validation score: 0.778 (std: 0.021)
Parameters: {'penalty': 'l1', 'C': 0.1}



In [5]:
# test the retuned best parameters
print("\n\n-- Testing best parameters [Random, CV = 10]...")
lr_model_rs_final = LogisticRegression(**lr_model_rs)
scores = cross_val_score(lr_model_rs_final, X, y, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )



-- Testing best parameters [Random, CV = 10]...
mean: 0.798 (std: 0.075)



In [6]:
lr_model_rs_best = LogisticRegression(**lr_model_rs)
lr_model_rs_base = LogisticRegression() 



lr_model_rs_base.fit(train_X, train_y)
lr_model_rs_best.fit(train_X, train_y)
prediction_base=lr_model_rs_base.predict(test_X)
prediction_best=lr_model_rs_best.predict(test_X)





prediction_base_train=lr_model_rs_base.predict(train_X)
prediction_best_train=lr_model_rs_best.predict(train_X)

print('Test Accuracy for LogisticRegression base is ',metrics.accuracy_score(prediction_base,test_y))
print('Train Accuracy for LogisticRegression base is ',metrics.accuracy_score(train_y,prediction_base_train))
print('\n')
print('Test Accuracy for LogisticRegression best is ',metrics.accuracy_score(prediction_best,test_y))
print('Train Accuracy for LogisticRegression best is ',metrics.accuracy_score(train_y,prediction_best_train))





Test Accuracy for LogisticRegression base is  0.8355263157894737
Train Accuracy for LogisticRegression base is  0.7857142857142857


Test Accuracy for LogisticRegression best is  0.8486842105263158
Train Accuracy for LogisticRegression best is  0.7792207792207793


In [26]:
'''
default: ‘liblinear’ Algorithm to use in the optimization problem.
For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and
‘saga’ are faster for large ones.
For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’
handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas
‘liblinear’ and ‘saga’ handle L1 penalty.
'''

print("-- Random Parameter Search via liblinear 10-fold CV")
#solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’},

    
C_range = np.logspace(-5, 5, 11)
penalty_range = ['l2']
solvers = ['newton-cg', 'lbfgs', 'sag']

param_dist = { 'penalty': penalty_range,
               'C': C_range,
             'solver': solvers}

pprint(param_dist)
lr_model = LogisticRegression()
lr_model_rs = run_randomsearch(X, y, lr_model, param_dist, cv=10, n_iter_search=22)

# test the retuned best parameters
print("\n\n-- Testing best parameters [Random, CV = 10]...")
lr_model_rs_final = LogisticRegression(**lr_model_rs)
scores = cross_val_score(lr_model_rs_final, X, y, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()),
                                          end="\n\n" )


-- Random Parameter Search via liblinear 10-fold CV
{'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05]),
 'penalty': ['l2'],
 'solver': ['newton-cg', 'lbfgs', 'sag']}





RandomizedSearchCV took 0.89 seconds for 22 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.787 (std: 0.027)
Parameters: {'solver': 'sag', 'penalty': 'l2', 'C': 0.01}

Model with rank: 2
Mean validation score: 0.787 (std: 0.027)
Parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.01}

Model with rank: 3
Mean validation score: 0.787 (std: 0.027)
Parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.01}



-- Testing best parameters [Random, CV = 10]...
mean: 0.793 (std: 0.063)

