In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Configure how pandas renders DataFrames when printed:
# show every column, cap long frames at 10 rows (5 when truncated),
# and keep each row on a single wide line instead of wrapping.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 10,
    'display.min_rows': 5,
    'display.width': 1000,
    'display.expand_frame_repr': False,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Step 1: Load dataset
# The CSV is read from the current working directory.
df = pd.read_csv("breast cancer.csv")

# Step 2: Dataset overview
# Report the frame's dimensions and column names, then print the frame
# itself (pandas truncates it according to the active display options).
print(f" Dataset Shape: {df.shape}")
print(" Column Names:")
print(list(df.columns), "\n")

print(" Preview of Data (all columns, limited rows):")
print(df)

# Step 3: Check and clean missing values
# Per-column count of missing cells before any rows are removed.
print("\n Missing Values (Before Cleaning):")
print(df.isna().sum())

# Discard every row that contains at least one missing value.
df = df.dropna()

# Per-column count again, to confirm nothing missing remains.
print("\n Missing Values (After Cleaning):")
print(df.isna().sum(), "\n")

# Step 4: Define features (X) and label (y)
# 'id' is a record identifier, not a measurement. Leaving it in X would let
# an arbitrary number contribute to the distances KNN relies on, so it is
# dropped together with the target column.
X = df.drop(columns=['id', 'diagnosis'])
y = df['diagnosis'].map({'M': 1, 'B': 0})  # Encode 'M' as 1, 'B' as 0

# .map() yields NaN for any label outside the mapping; fail loudly instead of
# passing NaN targets into the split and the classifier.
if y.isnull().any():
    raise ValueError("Unexpected value in 'diagnosis'; expected only 'M' or 'B'.")

# Step 5: Split data into training and test sets
# The split happens BEFORE scaling so that the test rows cannot influence
# the scaler's mean/variance (avoids train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Step 6: Standardize feature values
# Fit the scaler on the training portion only, then apply the same
# transformation to the held-out test portion.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later code that expects the full scaled matrix; it uses the
# training-set statistics, so it must not be used to fit a model.
X_scaled = scaler.transform(X)

# Step 7: Find optimal k using GridSearchCV
# Try k = 1..20 with 5-fold cross-validation on the training set, ranking
# candidates by F1 score on the positive class (label 1).
param_grid = {'n_neighbors': list(range(1, 21))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='f1')
grid.fit(X_train, y_train)

best_k = grid.best_params_['n_neighbors']
print(f" Best k value from GridSearchCV: {best_k}\n")

# Step 8: Use the KNN classifier with best k and predict on the test set
# GridSearchCV defaults to refit=True, so after fit() it has already
# retrained a classifier with the best k on all of X_train. Reusing that
# estimator avoids building and fitting an identical second model.
knn = grid.best_estimator_
y_pred = knn.predict(X_test)

# Step 9: Model Evaluation
# Print the per-class report first, then the four headline metrics, each
# computed from the test labels and the predictions and shown to 4 decimals.
print(" Evaluation on Test Data:")
print(classification_report(y_test, y_pred))

# Labels are padded so the colons line up in the printed output.
metric_table = (
    (" Accuracy ", accuracy_score),
    (" Precision", precision_score),
    (" Recall   ", recall_score),
    (" F1 Score ", f1_score),
)
for metric_label, metric_fn in metric_table:
    print(f"{metric_label}: {metric_fn(y_test, y_pred):.4f}")


 Dataset Shape: (569, 32)
 Column Names:
['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'] 

 Preview of Data (all columns, limited rows):
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  compactness_mean  concavity_mean  concave_points_mean  symmetry_mean  fractal_dimension_mean  radius_se  texture_se  perimeter_se  area_se  smoothness_se  compactness_se  concavity_se  concave_points_se  symmetry_se  fractal_dimension_se  radius_worst 