# Logistic Regression - AUC

In [24]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd

In [8]:
ds = datasets.load_breast_cancer()
print(ds.keys())
filename = ds.filename
cols = ds.feature_names
cols = [i.replace(' ','_') for i in cols]
cols.append('breast_cancer')
print(cols)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'mean_compactness', 'mean_concavity', 'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension', 'radius_error', 'texture_error', 'perimeter_error', 'area_error', 'smoothness_error', 'compactness_error', 'concavity_error', 'concave_points_error', 'symmetry_error', 'fractal_dimension_error', 'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area', 'worst_smoothness', 'worst_compactness', 'worst_concavity', 'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension', 'breast_cancer']


In [9]:
breast = pd.read_csv(filename, header=None, names=cols)
breast.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,breast_cancer
0,569.0,30.0,malignant,benign,,,,,,,...,,,,,,,,,,
1,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
2,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
3,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
4,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0


In [10]:
breast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Data columns (total 31 columns):
mean_radius                570 non-null float64
mean_texture               570 non-null float64
mean_perimeter             570 non-null object
mean_area                  570 non-null object
mean_smoothness            569 non-null float64
mean_compactness           569 non-null float64
mean_concavity             569 non-null float64
mean_concave_points        569 non-null float64
mean_symmetry              569 non-null float64
mean_fractal_dimension     569 non-null float64
radius_error               569 non-null float64
texture_error              569 non-null float64
perimeter_error            569 non-null float64
area_error                 569 non-null float64
smoothness_error           569 non-null float64
compactness_error          569 non-null float64
concavity_error            569 non-null float64
concave_points_error       569 non-null float64
symmetry_error             569 no

In [11]:
breast.dropna(subset=['mean_smoothness'], inplace=True)
breast.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569 entries, 1 to 569
Data columns (total 31 columns):
mean_radius                569 non-null float64
mean_texture               569 non-null float64
mean_perimeter             569 non-null object
mean_area                  569 non-null object
mean_smoothness            569 non-null float64
mean_compactness           569 non-null float64
mean_concavity             569 non-null float64
mean_concave_points        569 non-null float64
mean_symmetry              569 non-null float64
mean_fractal_dimension     569 non-null float64
radius_error               569 non-null float64
texture_error              569 non-null float64
perimeter_error            569 non-null float64
area_error                 569 non-null float64
smoothness_error           569 non-null float64
compactness_error          569 non-null float64
concavity_error            569 non-null float64
concave_points_error       569 non-null float64
symmetry_error             569 no

In [12]:
breast['mean_perimeter'] = breast['mean_perimeter'].astype('float')
breast['mean_area'] = breast['mean_area'].astype('float')
breast.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569 entries, 1 to 569
Data columns (total 31 columns):
mean_radius                569 non-null float64
mean_texture               569 non-null float64
mean_perimeter             569 non-null float64
mean_area                  569 non-null float64
mean_smoothness            569 non-null float64
mean_compactness           569 non-null float64
mean_concavity             569 non-null float64
mean_concave_points        569 non-null float64
mean_symmetry              569 non-null float64
mean_fractal_dimension     569 non-null float64
radius_error               569 non-null float64
texture_error              569 non-null float64
perimeter_error            569 non-null float64
area_error                 569 non-null float64
smoothness_error           569 non-null float64
compactness_error          569 non-null float64
concavity_error            569 non-null float64
concave_points_error       569 non-null float64
symmetry_error             569 

In [13]:
X = breast.drop('breast_cancer', axis=1).values
y = breast['breast_cancer'].values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [21]:
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)
y_pred_prob = logreg.predict_proba(X_test)[:,1]
print(y_pred_prob)

[8.04811562e-01 1.54784500e-08 3.65440566e-03 9.95045833e-01
 9.98695878e-01 1.00925940e-09 3.68882198e-12 1.27443291e-02
 9.95149144e-01 9.82128356e-01 9.17070449e-01 8.44825497e-04
 9.86929424e-01 1.68854429e-01 9.95199352e-01 2.35702869e-03
 9.95889856e-01 9.99545317e-01 9.96892029e-01 2.23528126e-07
 8.29864022e-01 9.76898438e-01 3.41867083e-09 9.90667558e-01
 9.82427878e-01 9.98956295e-01 9.94780615e-01 9.86207313e-01
 9.89701240e-01 3.90032119e-08 9.89116363e-01 9.97814842e-01
 9.68391169e-01 9.77547382e-01 9.96464568e-01 9.91816846e-01
 5.17752770e-03 9.91201969e-01 1.16455588e-05 7.56921081e-01
 9.95686734e-01 1.29641252e-03 9.96728743e-01 9.79930753e-01
 9.97569221e-01 9.09752037e-01 9.97170180e-01 9.87559130e-01
 8.60908923e-01 9.94454262e-01 1.78124339e-04 2.34695542e-09
 8.29220020e-01 9.99205306e-01 9.96848272e-01 9.67810260e-01
 9.94987001e-01 5.57942171e-14 6.15466808e-01 9.99022826e-01
 9.72657977e-01 2.82802143e-07 1.41246444e-11 9.04739465e-01
 9.91774843e-01 8.180583



In [22]:
print('AUC roc_auc_score: {}'.format(roc_auc_score(y_test, y_pred_prob)))

AUC roc_auc_score: 0.9975506756756757


In [27]:
logreg_cv = LogisticRegression()
y_pred_prob_cv = cross_val_score(logreg_cv, X, y, cv=5, scoring='roc_auc')
print(y_pred_prob_cv)

[0.99450904 0.99192506 0.99731724 0.98256204 0.99664655]


