# Detecting Prostate Cancer

In [26]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split

In [27]:
df = pd.read_csv("Prostate_Cancer.csv")

In [28]:
df.head(5)

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [29]:
df = df.drop(columns=['id'])

In [30]:
df.head(5)

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,M,23,12,151,954,0.143,0.278,0.242,0.079
1,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,M,14,16,78,386,0.07,0.284,0.26,0.097
4,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [31]:
df['diagnosis_result'].value_counts()

M    62
B    38
Name: diagnosis_result, dtype: int64

In [32]:
cols_to_norm = ['radius','texture','perimeter','area','smoothness','compactness','symmetry','fractal_dimension']

In [33]:
df[cols_to_norm] = df[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [34]:
df.head(5)

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,M,0.875,0.0625,0.825,0.448687,1.0,0.781759,0.633136,0.590909
1,B,0.0,0.125,0.675,0.670644,1.0,0.13355,0.272189,0.090909
2,M,0.75,1.0,0.65,0.597255,0.753425,0.397394,0.426036,0.159091
3,M,0.3125,0.3125,0.216667,0.109785,0.0,0.801303,0.739645,1.0
4,M,0.0,0.5,0.691667,0.653341,0.972603,0.309446,0.272189,0.136364


In [35]:
df.describe()

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.490625,0.451875,0.373167,0.298854,0.448356,0.288925,0.344201,0.265682
std,0.304943,0.32456,0.197301,0.190758,0.200572,0.199165,0.18216,0.185249
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.1875,0.1875,0.254167,0.163932,0.321918,0.138436,0.218935,0.136364
50%,0.5,0.40625,0.35,0.263723,0.438356,0.262215,0.325444,0.227273
75%,0.75,0.703125,0.51875,0.426611,0.575342,0.387622,0.43787,0.363636
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
train, test = train_test_split(df, test_size=0.35)

In [37]:
train.describe()

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
count,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0
mean,0.491346,0.485577,0.367564,0.288544,0.469968,0.317565,0.369504,0.295804
std,0.292291,0.32528,0.194215,0.18117,0.202951,0.213841,0.19362,0.193369
min,0.0,0.0,0.016667,0.011933,0.0,0.0,0.0,0.0
25%,0.3125,0.1875,0.258333,0.163484,0.342466,0.140065,0.254438,0.159091
50%,0.5,0.4375,0.35,0.25716,0.465753,0.29316,0.337278,0.272727
75%,0.6875,0.8125,0.516667,0.424224,0.60274,0.429967,0.455621,0.363636
max,1.0,1.0,0.825,0.717184,1.0,1.0,1.0,1.0


In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [42]:
X_train = train.drop(columns=['diagnosis_result'])
y_train = train['diagnosis_result']
X_test = test.drop(columns=['diagnosis_result'])
y_test = test['diagnosis_result']

In [45]:
y_train = y_train.reshape(-1,1)
print(y_train.shape)
y_test = y_test.reshape(-1,1)
print(y_test.shape)

(65, 1)
(35, 1)


  This is separate from the ipykernel package so we can avoid doing imports until


In [47]:
for K in range(15):
    K_value = K+1
    neigh = KNeighborsClassifier(n_neighbors = K_value, weights='uniform', algorithm='auto')
    neigh.fit(X_train, y_train) 
    y_pred = neigh.predict(X_test)
    print("Accuracy is ", accuracy_score(y_test,y_pred)*100,"% for K-Value:",K_value)

Accuracy is  65.71428571428571 % for K-Value: 1
Accuracy is  74.28571428571429 % for K-Value: 2
Accuracy is  82.85714285714286 % for K-Value: 3
Accuracy is  85.71428571428571 % for K-Value: 4
Accuracy is  80.0 % for K-Value: 5
Accuracy is  88.57142857142857 % for K-Value: 6
Accuracy is  82.85714285714286 % for K-Value: 7
Accuracy is  82.85714285714286 % for K-Value: 8
Accuracy is  82.85714285714286 % for K-Value: 9
Accuracy is  88.57142857142857 % for K-Value: 10
Accuracy is  80.0 % for K-Value: 11
Accuracy is  82.85714285714286 % for K-Value: 12
Accuracy is  80.0 % for K-Value: 13
Accuracy is  82.85714285714286 % for K-Value: 14
Accuracy is  80.0 % for K-Value: 15


  after removing the cwd from sys.path.


#### Optimum comes out to be at K=10

In [49]:
neigh = KNeighborsClassifier(n_neighbors = 10, weights='uniform', algorithm='auto')
neigh.fit(X_train, y_train) 
y_pred = neigh.predict(X_test)
print("Accuracy is ", accuracy_score(y_test,y_pred)*100,"% for K-Value:",10)

Accuracy is  88.57142857142857 % for K-Value: 10


  


In [53]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred,target_names=['M', 'B']))

             precision    recall  f1-score   support

          M       0.92      0.80      0.86        15
          B       0.86      0.95      0.90        20

avg / total       0.89      0.89      0.88        35

