In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier


In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_names = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
                "Normal Nucleoli", "Mitoses", "Class"]
df = pd.read_csv(url, names=column_names)

df.replace('?', pd.NA, inplace=True)
df.dropna(inplace=True)

df.head(50)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [3]:
df.shape

(683, 11)

In [4]:
df['Class'] = df['Class'].apply(lambda x : 1 if x == 4 else 0)
df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,0
695,841769,2,1,1,1,2,1,1,1,1,0
696,888820,5,10,10,3,7,3,8,10,2,1
697,897471,4,8,6,4,3,4,10,6,1,1


In [5]:
# Check missing values
df.isnull().sum()

Unnamed: 0,0
Sample code number,0
Clump Thickness,0
Uniformity of Cell Size,0
Uniformity of Cell Shape,0
Marginal Adhesion,0
Single Epithelial Cell Size,0
Bare Nuclei,0
Bland Chromatin,0
Normal Nucleoli,0
Mitoses,0


In [6]:
X = df.iloc[:,0:10]
y = df.iloc[:,10]

In [7]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=11)

In [8]:
# Scalling
s = StandardScaler()
print
X_train = s.fit_transform(X_train)
X_test = s.fit_transform(X_test)

# **MODELING**

In [9]:
# KKN
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)

In [10]:
y_predknn = knn.predict(X_test)

In [11]:
print('Accuracy score KNN: ', knn.score(X_test, y_test))

Accuracy score KNN:  0.9854014598540146


In [12]:
print(classification_report(y_test, y_predknn))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99        86
           1       0.98      0.98      0.98        51

    accuracy                           0.99       137
   macro avg       0.98      0.98      0.98       137
weighted avg       0.99      0.99      0.99       137



In [13]:
cm = confusion_matrix(y_test, y_predknn)
print(cm)

[[85  1]
 [ 1 50]]


In [14]:
# RandomForest
ran = RandomForestClassifier(n_estimators = 300 , random_state=100)
ran.fit(X_train ,y_train)

In [15]:
print(f'train score of RandomForestClassifier : {ran.score(X_train , y_train)}')
print(f'test score of RandomForestClassifier : {ran.score(X_test , y_test)}')

train score of RandomForestClassifier : 1.0
test score of RandomForestClassifier : 0.9927007299270073


In [16]:
y_predran = ran.predict(X_test)

In [17]:
print('Accuracy score random forest:', ran.score(X_test, y_test))

Accuracy score random forest: 0.9927007299270073


In [18]:
cm = confusion_matrix(y_test, y_predran)
print(cm)

[[85  1]
 [ 0 51]]


In [19]:
print(classification_report(y_test, y_predran))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        86
           1       0.98      1.00      0.99        51

    accuracy                           0.99       137
   macro avg       0.99      0.99      0.99       137
weighted avg       0.99      0.99      0.99       137

