In [35]:
#importing all required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.metrics import accuracy_score

In [36]:
#loading the dataset
df = pd.read_csv('/content/cancer_data.csv')

In [37]:
#printing first 5 rows of our data
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [38]:
#printing last 5 rows of our data
df.tail()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4
698,897471,4,8,8,5,4,5,10,4,1,4


In [39]:
#checking number of null/empty values
df.isnull().sum()

 id                       0
clump_thickness           0
unif_cell_size            0
unif_cell_shape           0
marg_adhesion             0
single_epith_cell_size    0
bare_nuclei               0
bland_chromation          0
normal_nucleoli           0
mitoses                   0
class                     0
dtype: int64

In [40]:
#getting information of non-null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0    id                     699 non-null    int64 
 1   clump_thickness         699 non-null    int64 
 2   unif_cell_size          699 non-null    int64 
 3   unif_cell_shape         699 non-null    int64 
 4   marg_adhesion           699 non-null    int64 
 5   single_epith_cell_size  699 non-null    int64 
 6   bare_nuclei             699 non-null    object
 7   bland_chromation        699 non-null    int64 
 8   normal_nucleoli         699 non-null    int64 
 9   mitoses                 699 non-null    int64 
 10  class                   699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [41]:
#checking number of rows and columns in our dataset
df.shape

(699, 11)

In [42]:
#converts non-numeric values to numeric values else converts them to NaN values
df['bare_nuclei'] = pd.to_numeric(df.bare_nuclei, errors="coerce")

In [43]:
#getting information of non-null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0    id                     699 non-null    int64  
 1   clump_thickness         699 non-null    int64  
 2   unif_cell_size          699 non-null    int64  
 3   unif_cell_shape         699 non-null    int64  
 4   marg_adhesion           699 non-null    int64  
 5   single_epith_cell_size  699 non-null    int64  
 6   bare_nuclei             683 non-null    float64
 7   bland_chromation        699 non-null    int64  
 8   normal_nucleoli         699 non-null    int64  
 9   mitoses                 699 non-null    int64  
 10  class                   699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.2 KB


In [44]:
#checking number of null/empty values
df.isna().sum()

 id                        0
clump_thickness            0
unif_cell_size             0
unif_cell_shape            0
marg_adhesion              0
single_epith_cell_size     0
bare_nuclei               16
bland_chromation           0
normal_nucleoli            0
mitoses                    0
class                      0
dtype: int64

In [45]:
#drops all the rows having null values and updates the original table
df.dropna(inplace=True)

In [46]:
#checking number of rows and columns in our dataset
df.shape

(683, 11)

In [47]:
#drops the non-required column of id
df.drop(columns=' id',axis=1)

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,2
695,2,1,1,1,2,1.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


In [48]:
#separating the data and outcome
x = df.drop(columns=[' id','class'],axis=1)
y = df['class']
print(x)
print(y)

     clump_thickness  unif_cell_size  unif_cell_shape  marg_adhesion  \
0                  5               1                1              1   
1                  5               4                4              5   
2                  3               1                1              1   
3                  6               8                8              1   
4                  4               1                1              3   
..               ...             ...              ...            ...   
694                3               1                1              1   
695                2               1                1              1   
696                5              10               10              3   
697                4               8                6              4   
698                4               8                8              5   

     single_epith_cell_size  bare_nuclei  bland_chromation  normal_nucleoli  \
0                         2          1.0                

In [49]:
#Train-Test splitting of data
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [50]:
#selection of model
model = neighbors.KNeighborsClassifier()
#Training our KNN
model.fit(x_train,y_train)

In [51]:
#Model Evaluation -> Finding Accuracy Score
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction,y_train)
print("Training data accuracy :",training_data_accuracy)
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_prediction,y_test)
print("Testing data accuracy :",training_data_accuracy)

Training data accuracy : 0.9835164835164835
Testing data accuracy : 0.9835164835164835


In [52]:
#checking the best sutied value of K from 1 to 14 using loop
no_neighbors = range(1,15)
training_accuracy = []
test_accuracy = []

for n in no_neighbors:
  model = neighbors.KNeighborsClassifier(n_neighbors=n)
  model.fit(x_train,y_train)
  training_accuracy.append(model.score(x_train,y_train))
  test_accuracy.append(model.score(x_test,y_test))

In [53]:
#plotting a graph
#plt.plot(no_neighbors,training_accuracy,label="Training Accuracy")
#plt.plot(no_neighbors,test_accuracy,label="Test Accuracy")
#plt.legend()
#plt.plot()

MAKING A BREAST CANCER PREDICTIVE SYSTEM

In [54]:
input_data = (5,1,1,1,2,1,3,1,1)

#Converting the input data array into a numpy array
np_data = np.asarray(input_data)

#reshaping the data
np_reshape = np_data.reshape(1,-1)


prediction = model.predict(np_reshape)

if(prediction[0]==4):
  print("This person has Breast Cancer")
else:
  print("This person does not have Breast Cancer")

This person does not have Breast Cancer




In [55]:
import pickle

#creating the model
filename = 'knn_cancer.sav'
pickle.dump(model, open(filename, 'wb'))

#loading the saved model
loaded_model = pickle.load(open('knn_cancer.sav','rb'))

In [56]:
input_data = (5,1,1,1,2,1,3,1,1)

#Converting the input data array into a numpy array
np_data = np.asarray(input_data)

#reshaping the data
np_reshape = np_data.reshape(1,-1)


prediction = loaded_model.predict(np_reshape)

if(prediction[0]==4):
  print("This person has Breast Cancer")
else:
  print("This person does not have Breast Cancer")

This person does not have Breast Cancer


