# Breast Cancer Analysis with Logistic Regression

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplolib inline

UsageError: Line magic function `%matplolib` not found.


## Importing the dataset

In [3]:
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
print(breast_cancer['DESCR'])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [4]:
dataset = pd.DataFrame(breast_cancer['data'], columns=breast_cancer['feature_names'])
dataset['target'] = breast_cancer['target']
dataset

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [5]:
dataset.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

No null values

In [6]:
X = dataset[breast_cancer.feature_names].values
y = dataset['target'].values

## Split the dataset into training and test set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Fitting Logistic Regression to train set

In [8]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
model.score(X_train, y_train)

0.9626373626373627

## Predicting a new result

In [10]:
model.predict([X_train[0]])

array([1])

## Predicting Test set results

In [11]:
y_pred = model.predict(X_test)
y_pred

array([0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1])

In [12]:
model.score(X_test, y_test)

0.9473684210526315

## Making Confusion Matrix

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
cm

array([[31,  5],
       [ 1, 77]])

In [14]:
print(
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n')

0.9473684210526315
0.9390243902439024
0.9871794871794872
0.9625


## Computing accuracy with k-fold cross validation

In [15]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.95 
Standard Deviation: 2.69 


## Fitting the Random forest model

In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
y_pred = model.predict(X_test)

In [18]:
confusion_matrix(y_test, y_pred)

array([[31,  5],
       [ 2, 76]])

In [19]:
print(
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n')

0.9385964912280702
0.9382716049382716
0.9743589743589743
0.9559748427672956


In [20]:
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.96 
Standard Deviation: 2.06 


## Fitting the Catboost model

In [21]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier()
cat_model.fit(X_train, y_train)

6s
613:	learn: 0.0202248	total: 16.9s	remaining: 10.6s
614:	learn: 0.0201539	total: 16.9s	remaining: 10.6s
615:	learn: 0.0201193	total: 16.9s	remaining: 10.5s
616:	learn: 0.0200723	total: 16.9s	remaining: 10.5s
617:	learn: 0.0200233	total: 17s	remaining: 10.5s
618:	learn: 0.0199741	total: 17s	remaining: 10.5s
619:	learn: 0.0199449	total: 17s	remaining: 10.4s
620:	learn: 0.0199037	total: 17.1s	remaining: 10.4s
621:	learn: 0.0198570	total: 17.1s	remaining: 10.4s
622:	learn: 0.0197960	total: 17.1s	remaining: 10.4s
623:	learn: 0.0197526	total: 17.1s	remaining: 10.3s
624:	learn: 0.0196898	total: 17.2s	remaining: 10.3s
625:	learn: 0.0196539	total: 17.2s	remaining: 10.3s
626:	learn: 0.0196230	total: 17.2s	remaining: 10.3s
627:	learn: 0.0195984	total: 17.3s	remaining: 10.2s
628:	learn: 0.0195704	total: 17.3s	remaining: 10.2s
629:	learn: 0.0195530	total: 17.3s	remaining: 10.2s
630:	learn: 0.0195095	total: 17.4s	remaining: 10.2s
631:	learn: 0.0194692	total: 17.4s	remaining: 10.1s
632:	learn: 0.0

<catboost.core.CatBoostClassifier at 0x7f1967fc1d10>

In [22]:
y_pred = cat_model.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[34,  2],
       [ 1, 77]])

In [24]:
print(
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n')

0.9736842105263158
0.9746835443037974
0.9871794871794872
0.980891719745223


In [25]:
accuracies = cross_val_score(estimator=cat_model, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.97 
Standard Deviation: 1.82 
