In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

In [24]:
# Load the breast-cancer bunch and assemble features plus label into one frame.
data = load_breast_cancer()

data_df = pd.DataFrame(data=data.data, columns=data.feature_names)
# The label array has one entry per row, so it can be attached positionally.
data_df['target'] = data.target
dataset = data_df

# Preview the first rows.
dataset.head()

In [102]:
# Features or attributes: every column except the label.

features = dataset.columns.drop('target')
print("feature labels : ")
print(list(features))


# Function to get the hold-out test set

from random import randrange, seed

def get_hold_out(dataset, train_size):
    """Randomly split ``dataset`` into a training frame and a hold-out frame.

    Rows are drawn one at a time, without replacement, using
    ``random.randrange`` on the rows still available, so the result is
    reproducible after ``random.seed``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to split. It is not modified.
    train_size : int
        Number of rows to place in the training frame.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` holds the drawn rows in draw order; ``test`` holds the
        remaining rows in their original order. Both get a fresh 0..n-1 index
        and keep the column dtypes of ``dataset``.

    Raises
    ------
    ValueError
        If ``train_size`` exceeds the number of rows in ``dataset``.
    """
    if train_size > len(dataset):
        raise ValueError(
            "train_size (%s) exceeds the number of rows (%s)"
            % (train_size, len(dataset))
        )

    # Track row positions instead of moving rows one by one: same random
    # draws, but the frames are built once at the end.
    remaining = list(range(len(dataset)))
    picked = []
    while len(picked) < train_size:
        picked.append(remaining.pop(randrange(len(remaining))))

    train = dataset.iloc[picked].reset_index(drop=True)
    test = dataset.iloc[remaining].reset_index(drop=True)
    return train, test

seed(1)
train_df, test_df = get_hold_out(dataset, 400)


# Replace missing fields by the mean of the attribute.
# This has to run after the split: the means come from the training rows only
# and are kept in `means` so the hold-out set can be filled with the same values.

means = train_df[features].mean().to_dict()
train_df = train_df.fillna({**means, 'target': 0})

print()
print("train data size : ", len(train_df))
print("test data size : ", len(test_df))

train_df.isnull().values.any()


In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate


# Training labels and feature matrix shared by the model-comparison cells.
y = train_df['target'].copy()
X = train_df.drop(columns='target')

# LogisticRegression applies an L2 penalty with C=1.0 by default, and the
# liblinear solver cannot switch the penalty off. A very large C (inverse
# regularization strength) makes the penalty negligible, so this baseline
# really is (effectively) unregularized. More iterations are allowed because
# the weakly constrained problem converges more slowly.
lr = LogisticRegression(C=1e9, solver='liblinear', max_iter=1000)
results = cross_validate(lr, X, y, scoring='neg_log_loss', cv=5)

# The scorer returns negated log loss; flip the sign to report the loss itself.
validation_scores = -1 * results['test_score']
print("Logistic Regression without regularizations")
print('Loss for each fold: ',validation_scores)
print('Mean Loss: ',validation_scores.mean())

lr_score = validation_scores.mean()

In [104]:
# L1-penalised logistic regression, evaluated with 5-fold cross-validation.
l1 = LogisticRegression(solver='liblinear', penalty='l1')
results = cross_validate(l1, X, y, cv=5, scoring='neg_log_loss')

# Scores come back as negated log loss; negate them to get per-fold loss.
validation_scores = -results['test_score']
l1_score = validation_scores.mean()

print("Logistic Regression with L1 regularization")
print('Loss for each fold: ',validation_scores)
print('Mean Loss: ',l1_score)

In [105]:
# L2-penalised logistic regression, evaluated with 5-fold cross-validation.
l2 = LogisticRegression(solver='liblinear', penalty='l2')
results = cross_validate(estimator=l2, X=X, y=y, scoring='neg_log_loss', cv=5)

# Convert the negated log-loss scores back into positive losses.
validation_scores = np.negative(results['test_score'])
l2_score = validation_scores.mean()

print("Logistic Regression with L2 regularization")
print('Loss for each fold: ',validation_scores)
print('Mean Loss: ',l2_score)

Logistic regression with the L1 penalty has the lowest mean cross-validated loss, so it is selected for further tuning.


In [106]:

# Fill missing values in the hold-out set first, reusing the per-feature
# values stored in `means` rather than recomputing them from these rows.
test_df = test_df.fillna({feature: means[feature] for feature in features})

# Separate the labels (missing labels become 0) from the feature columns.
y_test = test_df['target'].fillna(0)
test_df = test_df.drop(columns=['target'])



In [107]:
from sklearn.metrics import roc_curve, roc_auc_score

# Flatten the label containers into 1-D arrays matching the row counts of the
# training and hold-out feature frames.
y = np.array(y)
y = np.reshape(y, X.shape[0])
y_test = np.array(y_test)
y_test = np.reshape(y_test, (test_df.shape[0],))

# Refit the model that was chosen in the comparison step: the notebook selects
# the L1-penalised model, so that is the estimator evaluated on the hold-out set.
l1.fit(X, y)

# Baseline: the same constant score for every sample.
base_probs = [0 for _ in range(len(y_test))]
# Predicted probability of the positive class (second column).
preds = l1.predict_proba(test_df)[:, 1]

In [110]:
# ROC points for the baseline scores and for the model's scores.
base_fpr, base_tpr, thresh1 = roc_curve(y_test, base_probs)
log_fpr, log_tpr, thresh2 = roc_curve(y_test, preds)

# Area under each ROC curve.
base_auc = roc_auc_score(y_test, base_probs)
log_auc = roc_auc_score(y_test, preds)

# Draw both curves on one set of axes.
fig, ax = plt.subplots()
ax.plot(base_fpr, base_tpr, linestyle='--', label='Random predictions')
ax.plot(log_fpr, log_tpr, marker='.', label='Model')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.set_title('ROC CURVE')
plt.show()

print(f"AUC value for random predictions = {base_auc!s}")
print(f"AUC value for model's predictions = {log_auc!s}")

In [114]:
# Tuning the threshold: pick the ROC point where the true positive rate
# exceeds the false positive rate by the largest margin.

tpr_minus_fpr = log_tpr - log_fpr
optimal_idx = tpr_minus_fpr.argmax()
optimal_thresh = thresh2[optimal_idx]
print("optimal value of threshold based on ROC curve =",optimal_thresh)