In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score

# Fraction of the samples held out as the test set; passed as `test_size`
# to train_test_split in the evaluation loop.
test_size = 0.3
# Value handed to the Perceptron constructor as its `eta0` argument
# (the constant that scales each weight update).
eta0 = 0.1

In [23]:
# Column headers supplied explicitly via `names=` when reading the data file:
# an identifier column, nine cytology attributes, and the target `class`.
column_names = [
    "sample_code_number",
    "clump_thickness", "uniformity_of_cell_size", "uniformity_of_cell_shape",
    "marginal_adhesion", "single_epithelial_cell_size", "bare_nuclei",
    "bland_chromatin", "normal_nucleoli", "mitoses",
    "class",
]

# "?" is treated as a missing value; any row containing a missing value
# is discarded before further processing.
raw_df = (
    pd.read_csv(
        "breast-cancer-wisconsin.data",
        names=column_names,
        na_values=["?"],
    )
    .dropna(how="any")
)
raw_df

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,2
695,841769,2,1,1,1,2,1.0,1,1,1,2
696,888820,5,10,10,3,7,3.0,8,10,2,4
697,897471,4,8,6,4,3,4.0,10,6,1,4


In [24]:
# Coerce every column to a numeric dtype. With errors="raise" any value that
# cannot be parsed aborts the cell instead of being silently turned into NaN.
cleaned_df = raw_df.apply(lambda column: pd.to_numeric(column, errors="raise"))
cleaned_df

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,2
695,841769,2,1,1,1,2,1.0,1,1,1,2
696,888820,5,10,10,3,7,3.0,8,10,2,4
697,897471,4,8,6,4,3,4.0,10,6,1,4


In [25]:
# Label encoding of the target column (this prepares the target; it is not
# feature engineering): original code 4 becomes 0, every other code becomes 1.
#
# The encoding is derived from the untouched `raw_df["class"]` rather than from
# `cleaned_df["class"]` itself, so re-running this cell always yields the same
# labels. Reading the already-encoded column back would map every value
# (0 or 1, never 4) to 1 and silently destroy the target.
#
# NOTE(review): in the UCI documentation for this dataset 2 = benign and
# 4 = malignant, so the positive label (1) here is *benign*. Metrics that
# default to pos_label=1 (e.g. precision_score) therefore describe the benign
# class -- confirm that this is the intended reading.
cleaned_df["class"] = raw_df["class"].ne(4).astype("int64")
cleaned_df


Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1.0,3,1,1,1
1,1002945,5,4,4,5,7,10.0,3,2,1,1
2,1015425,3,1,1,1,2,2.0,3,1,1,1
3,1016277,6,8,8,1,3,4.0,3,7,1,1
4,1017023,4,1,1,3,2,1.0,3,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,1
695,841769,2,1,1,1,2,1.0,1,1,1,1
696,888820,5,10,10,3,7,3.0,8,10,2,0
697,897471,4,8,6,4,3,4.0,10,6,1,0


In [26]:
# Separate the target from the predictors WITHOUT mutating `cleaned_df`.
# (`DataFrame.pop` removes the column in place, so running the cell a second
# time raised KeyError and left the frame displayed earlier out of date.)
y = cleaned_df["class"]
X = cleaned_df.drop(columns="class")

# NOTE(review): `X` still contains `sample_code_number`, which looks like a
# record identifier rather than a measurement -- consider excluding it. It is
# kept here because the prediction example passes the same ten columns to
# `sc.transform`.
# NOTE(review): the scaler is fitted on the full data set before it is split
# into train/test, so test-set statistics influence the scaling. Fitting the
# scaler inside each split (e.g. with a Pipeline) would avoid that.
sc = StandardScaler()
X_std = sc.fit_transform(X)

In [27]:
# Repeated hold-out evaluation: train and score the Perceptron on 20 different
# train/test partitions (one per seed), then report how much the scores vary.
random_seed_array = range(1, 21)

accuracy_array = []
precision_array = []

# A single estimator object is reused. warm_start is left at its default, so
# each fit() call starts from scratch. After the loop `model` holds the fit
# from the last seed, which later cells use for prediction.
model = Perceptron(eta0=eta0)
for random_state in random_seed_array:
    X_train, X_test, y_train, y_test = train_test_split(
        X_std,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Scores are stored as percentages rounded to two decimals. They are
    # converted to built-in floats first so both lists print as plain numbers
    # instead of `np.float64(...)` reprs.
    accuracy_array.append(round(float(accuracy_score(y_test, y_pred)) * 100, 2))
    precision_array.append(round(float(precision_score(y_test, y_pred)) * 100, 2))

print(f'Acurácias obtidas: {accuracy_array}')
print(f'Precisões obtidas: {precision_array}')
print(f'Desvio padrão da acurácia: {np.std(accuracy_array)}')
print(f'Desvio padrão da precisão: {np.std(precision_array)}')

Acurácias obtidas: [95.12, 92.68, 96.59, 94.63, 95.61, 97.07, 97.07, 98.05, 97.07, 95.61, 97.07, 94.63, 94.63, 95.12, 94.15, 95.12, 95.61, 94.63, 97.56, 93.17]
Precisões obtidas: [np.float64(96.95), np.float64(97.41), np.float64(97.64), np.float64(97.64), np.float64(96.27), np.float64(99.21), np.float64(98.45), np.float64(99.17), np.float64(96.18), np.float64(96.21), np.float64(98.4), np.float64(94.62), np.float64(97.78), np.float64(94.07), np.float64(93.18), np.float64(96.4), np.float64(96.99), np.float64(95.31), np.float64(98.5), np.float64(94.29)]
Desvio padrão da acurácia: 1.4216872898074295
Desvio padrão da precisão: 1.6928475270974634


In [28]:
# Prediction example: push one hand-written record through the fitted scaler
# and the trained model.

# Predictor names, in the same order as the values in `data` below.
columns = [
    'sample_code_number',
    'clump_thickness',
    'uniformity_of_cell_size',
    'uniformity_of_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
]

# A single row: identifier first, then the nine attribute values.
data = [[999999, 5, 10, 10, 3, 7, 3, 8, 10, 2]]

# Wrap the row in a DataFrame with named columns before scaling it.
data_std = sc.transform(pd.DataFrame(data, columns=columns))
pred = model.predict(data_std)
print(f"Predicted: {pred}")

Predicted: [0]
