In [1]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

In [2]:
# Read the breast-cancer table from its CSV file and preview the first rows.
csv_path = "datasets/breast-cancer.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events


In [3]:
# Discard every row that contains at least one missing value.
# The result is rebound to `data` instead of using inplace=True, so the step
# stays chainable and does not mutate the frame object created by the loading cell.
data = data.dropna()

In [4]:
def find_distinct_values(dataset, columns):
    """Collect the distinct values of each requested column.

    Parameters
    ----------
    dataset : object indexable by column name whose columns expose ``.unique()``
        (a pandas DataFrame in this notebook).
    columns : iterable of column names to inspect.

    Returns
    -------
    dict
        Maps each column name, in the order given, to the result of calling
        ``.unique()`` on that column.
    """
    return {name: dataset[name].unique() for name in columns}

In [5]:
# Columns that hold string categories, including the target column 'class'.
categorical_columns = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'breast', 'breast-quad', 'irradiat', 'class',
]

distinct_values = find_distinct_values(data, categorical_columns)

# For each column print a header line, its distinct values, then a blank line.
for column in categorical_columns:
    print(f"Distinct values in {column}:")
    print(distinct_values[column])
    print()

Distinct values in age:
['40-49' '50-59' '60-69' '30-39' '70-79' '20-29']

Distinct values in menopause:
['premeno' 'ge40' 'lt40']

Distinct values in tumor-size:
['15-19' '35-39' '30-34' '25-29' '40-44' '10-14' '0-4' '20-24' '45-49'
 '50-54' '5-9']

Distinct values in inv-nodes:
['0-2' '3-5' '15-17' '6-8' '9-11' '24-26' '12-14']

Distinct values in node-caps:
['yes' 'no']

Distinct values in breast:
['right' 'left']

Distinct values in breast-quad:
['left_up' 'central' 'left_low' 'right_up' 'right_low']

Distinct values in irradiat:
['no' 'yes']

Distinct values in class:
['recurrence-events' 'no-recurrence-events']



In [6]:
# Ordered bins for each ordinal column, generated in ascending order so the
# encoder gives code 0 to the lowest bin.
ages = [f"{low}-{low + 9}" for low in range(20, 80, 10)]       # '20-29' ... '70-79'
tumor_sizes = [f"{low}-{low + 4}" for low in range(0, 55, 5)]  # '0-4' ... '50-54'
# The full '0-2' ... '24-26' range is listed, including the bins '18-20' and
# '21-23' that are absent from the distinct values printed for this column.
inv_nodes = [f"{low}-{low + 2}" for low in range(0, 27, 3)]

# Keep each column next to its category order so the two cannot drift apart.
ordinal_categories = {
    "age": ages,
    "tumor-size": tumor_sizes,
    "inv-nodes": inv_nodes,
}
ord_columns = list(ordinal_categories)
ord_encoder = OrdinalEncoder(categories=list(ordinal_categories.values()))

# Overwrites the string bins with their ordinal codes; this cell is therefore
# not re-runnable without reloading `data`.
data[ord_columns] = ord_encoder.fit_transform(data[ord_columns])
data.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,class
0,2.0,premeno,3.0,0.0,yes,3,right,left_up,no,recurrence-events
1,3.0,ge40,3.0,0.0,no,1,right,central,no,no-recurrence-events
2,3.0,ge40,7.0,0.0,no,2,left,left_low,no,recurrence-events
3,2.0,premeno,7.0,0.0,yes,3,right,left_low,yes,no-recurrence-events
4,2.0,premeno,6.0,1.0,yes,2,left,right_up,no,recurrence-events


In [7]:
# Nominal (unordered) columns to expand into indicator columns.
oh_columns = ["menopause","node-caps", "breast", "breast-quad", "irradiat"]

# Dense output returned as a pandas DataFrame; the first category of every
# column is dropped, so it is represented by all-zero indicators.
# NOTE(review): with drop='first', handle_unknown="ignore" would presumably make
# an unseen category look identical to the dropped one at transform time —
# confirm against the scikit-learn docs before reusing this encoder on new data.
oh_encoder = OneHotEncoder(
    sparse_output=False, drop='first', handle_unknown="ignore"
).set_output(transform="pandas")

result = oh_encoder.fit_transform(data[oh_columns])

# Replace the original string columns with their indicator columns.
data = data.drop(columns=oh_columns).join(result)
data.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig,class,menopause_lt40,menopause_premeno,node-caps_yes,breast_right,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_yes
0,2.0,3.0,0.0,3,recurrence-events,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,3.0,3.0,0.0,1,no-recurrence-events,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3.0,7.0,0.0,2,recurrence-events,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.0,7.0,0.0,3,no-recurrence-events,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4,2.0,6.0,1.0,2,recurrence-events,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
target_column = 'class'

# Learn the class labels, then replace the target strings with integer codes.
# In the displayed result 'no-recurrence-events' becomes 0 and
# 'recurrence-events' becomes 1.
lbl_encoder = LabelEncoder()
lbl_encoder.fit(data[target_column])
data[target_column] = lbl_encoder.transform(data[target_column])

data.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig,class,menopause_lt40,menopause_premeno,node-caps_yes,breast_right,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_yes
0,2.0,3.0,0.0,3,1,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,3.0,3.0,0.0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3.0,7.0,0.0,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2.0,7.0,0.0,3,0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4,2.0,6.0,1.0,2,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Split the encoded frame into the target vector and the predictor matrix.
y = data[target_column]
X = data.drop(columns=[target_column])

# k-nearest-neighbours classifier voting over the 3 closest samples.
model = KNeighborsClassifier(n_neighbors=3)

# Cross-validate with scikit-learn's default splitting and scoring settings.
cv_results = cross_validate(model, X, y)
scores = cv_results["test_score"]

# Report the mean and standard deviation of the per-fold test scores.
print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")

The accuracy is: 0.686 ± 0.033
