## Cancer detection with breast-cancer-data.csv data

In [1]:
# Importing libraries necessary for this project
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

#### Read data from breast-cancer-data.csv

In [2]:
df = pd.read_csv('breast-cancer-data.csv')

In [3]:
df = pd.get_dummies(df, columns = ['diagnosis'],drop_first=True)

#### Convert all non categorical data

In [4]:
df_y = df['diagnosis_M']
# df_y = np.array(df_y)
#df_y.shape
df_y.head()

0    1
1    1
2    1
3    1
4    1
Name: diagnosis_M, dtype: uint8

In [5]:
df.head()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_M
0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


#### Remove all the uneccessary data columns

In [6]:
df.drop('id', axis=1, inplace=True)
df.drop('diagnosis_M', axis=1, inplace=True)
df.head()
#df_x = np.array(df_x)
#df_x

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
x = StandardScaler().fit_transform(df)

In [8]:
x_train, x_test, y_train, y_test =train_test_split(df,df_y, test_size = 0.25, random_state =5)

#### Perform PCA analysis with the given data

In [9]:
pca = PCA(2)  # project from 64 to 2 dimensions
pca2 = pca.fit_transform(df)
print(df.shape)
print(pca2.shape)

(569, 30)
(569, 2)


In [10]:
pca.explained_variance_ratio_

array([0.98204467, 0.01617649])

In [11]:
abs(pca.components_)

array([[5.08623202e-03, 2.19657026e-03, 3.50763298e-02, 5.16826469e-01,
        4.23694535e-06, 4.05260047e-05, 8.19399539e-05, 4.77807775e-05,
        7.07804332e-06, 2.62155251e-06, 3.13742507e-04, 6.50984008e-05,
        2.23634150e-03, 5.57271669e-02, 8.05646029e-07, 5.51918197e-06,
        8.87094462e-06, 3.27915009e-06, 1.24101836e-06, 8.54530832e-08,
        7.15473257e-03, 3.06736622e-03, 4.94576447e-02, 8.52063392e-01,
        6.42005481e-06, 1.01275937e-04, 1.68928625e-04, 7.36658178e-05,
        1.78986262e-05, 1.61356159e-06],
       [9.28705650e-03, 2.88160658e-03, 6.27480827e-02, 8.51823720e-01,
        1.48194356e-05, 2.68862249e-06, 7.51419574e-05, 4.63501038e-05,
        2.52430431e-05, 1.61197148e-05, 5.38692831e-05, 3.48370414e-04,
        8.19640791e-04, 7.51112451e-03, 1.49438131e-06, 1.27357957e-05,
        2.86921009e-05, 9.36007477e-06, 1.22647432e-05, 2.89683790e-07,
        5.68673345e-04, 1.32152605e-02, 1.85961117e-04, 5.19742358e-01,
        7.68565692e-05,

#### Choose only the neccessary data column after PCA transform

In [12]:
import operator

r = [5.08623202e-03, 2.19657026e-03, 3.50763298e-02, 5.16826469e-01,
        4.23694535e-06, 4.05260047e-05, 8.19399539e-05, 4.77807775e-05,
        7.07804332e-06, 2.62155251e-06, 3.13742507e-04, 6.50984008e-05,
        2.23634150e-03, 5.57271669e-02, 8.05646029e-07, 5.51918197e-06,
        8.87094462e-06, 3.27915009e-06, 1.24101836e-06, 8.54530832e-08,
        7.15473257e-03, 3.06736622e-03, 4.94576447e-02, 8.52063392e-01,
        6.42005481e-06, 1.01275937e-04, 1.68928625e-04, 7.36658178e-05,
        1.78986262e-05, 1.61356159e-06]
c = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']

d = dict(zip(c, r))
sorted_d = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))

print(sorted_d )

{'area_worst': 0.852063392, 'area_mean': 0.516826469, 'area_se': 0.0557271669, 'perimeter_worst': 0.0494576447, 'perimeter_mean': 0.0350763298, 'radius_worst': 0.00715473257, 'radius_mean': 0.00508623202, 'texture_worst': 0.00306736622, 'perimeter_se': 0.0022363415, 'texture_mean': 0.00219657026, 'radius_se': 0.000313742507, 'concavity_worst': 0.000168928625, 'compactness_worst': 0.000101275937, 'concavity_mean': 8.19399539e-05, 'concave points_worst': 7.36658178e-05, 'texture_se': 6.50984008e-05, 'concave points_mean': 4.77807775e-05, 'compactness_mean': 4.05260047e-05, 'symmetry_worst': 1.78986262e-05, 'concavity_se': 8.87094462e-06, 'symmetry_mean': 7.07804332e-06, 'smoothness_worst': 6.42005481e-06, 'compactness_se': 5.51918197e-06, 'smoothness_mean': 4.23694535e-06, 'concave points_se': 3.27915009e-06, 'fractal_dimension_mean': 2.62155251e-06, 'fractal_dimension_worst': 1.61356159e-06, 'symmetry_se': 1.24101836e-06, 'smoothness_se': 8.05646029e-07, 'fractal_dimension_se': 8.545308

In [13]:
var = ['area_worst', 'area_mean', 'area_se', 'perimeter_worst']
df_x = df[var]
df_x

Unnamed: 0,area_worst,area_mean,area_se,perimeter_worst
0,2019.0,1001.0,153.40,184.60
1,1956.0,1326.0,74.08,158.80
2,1709.0,1203.0,94.03,152.50
3,567.7,386.1,27.23,98.87
4,1575.0,1297.0,94.44,152.20
...,...,...,...,...
564,2027.0,1479.0,158.70,166.10
565,1731.0,1261.0,99.04,155.00
566,1124.0,858.1,48.55,126.70
567,1821.0,1265.0,86.22,184.60


#### Fit Logistic Regression model to perform cancer detection

In [14]:
x_train, x_test, y_train, y_test =train_test_split(df_x,df_y, test_size = 0.25, random_state =5)

In [15]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train) # train
temp=lr.predict(x_test)

In [16]:
lr.score(x_test, y_test)

0.965034965034965

In [17]:
metrics.accuracy_score(y_test,temp)

0.965034965034965

#### Evaluate confusion matrix for the predicted outcome

In [19]:
labels = pd.Series(lr.classes_)

In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, temp)
print(pd.DataFrame(cm,columns=labels, index=labels)) # this is get the label names in CM
print('Accuracy: ' + str(abs(accuracy_score(y_test, temp))))

    0   1
0  87   1
1   4  51
Accuracy: 0.965034965034965
