# K-Nearest Neighbors (KNN) Model for Cancer Classification

## Step 1: Load and Explore the Dataset

In [1]:

import pandas as pd

# Location of the raw CSV, relative to the notebook's working directory
file_path = "data/cancer_dataset.csv"  # Update path as needed

# Read the whole file into a DataFrame; later cells work from `df`
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell's rich output
df.head(5)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Step 2: Data Preprocessing

In [2]:

from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop the 'id' column as it's not useful for prediction.
# Rebinding `df` (instead of dropping in place) together with errors='ignore'
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because 'id' has already been removed.
df = df.drop(columns=['id'], errors='ignore')

# Encode the 'diagnosis' column (M -> 1, B -> 0).
# LabelEncoder assigns codes in sorted order of the labels, so 'B' < 'M'
# yields B -> 0 and M -> 1.
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

# Separate features and target variable
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

# Standardize the feature data (zero mean, unit variance per column).
# NOTE(review): this fit sees every row, including rows that are later held
# out for testing. Fit the scaler on the training split only when an unbiased
# test-set estimate is required.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


## Step 3: Split Data into Training and Testing Sets

In [3]:

from sklearn.model_selection import train_test_split

# Split the *unscaled* features first (80/20, stratified on the label so both
# splits keep the same class balance; fixed seed for reproducibility).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to
# both splits. Fitting on the full dataset would let test-set means/variances
# leak into training and make the reported test metrics optimistic.
# `scaler` is rebound here so that the object persisted later in the notebook
# is the train-only fit that the model was actually trained with.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


## Step 4: Train the KNN Model with Hyperparameter Tuning

In [4]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes: every k from 1 through 20
param_grid = dict(n_neighbors=range(1, 21))
knn = KNeighborsClassifier()

# Exhaustive search over k, scored by accuracy with 5-fold cross-validation
# on the training split
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and report which k it uses
best_knn = grid_search.best_estimator_
best_k = grid_search.best_params_['n_neighbors']

print(f"Best K value: {best_k}")


Best K value: 3


## Step 5: Evaluate the Model

In [5]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out test split
y_pred = best_knn.predict(X_test)

# Summary metrics computed against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Report: each heading is followed by its value on the next line
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


Accuracy: 0.9385964912280702
Confusion Matrix:
[[71  1]
 [ 6 36]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95        72
           1       0.97      0.86      0.91        42

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



## Step 6: Save the Model and Scaler for Deployment

In [6]:

import pickle

# Destination files for the tuned classifier and the feature scaler
model_filename = "knn_cancer_model.pkl"
scaler_filename = "scaler.pkl"

# Pickle each artifact into its own file, model first and scaler second
for artifact, destination in ((best_knn, model_filename), (scaler, scaler_filename)):
    with open(destination, 'wb') as out_file:
        pickle.dump(artifact, out_file)

# Confirm where each artifact was written
print(f"Model saved as {model_filename}")
print(f"Scaler saved as {scaler_filename}")


Model saved as knn_cancer_model.pkl
Scaler saved as scaler.pkl


## Step 7: Load and Predict Using the Saved Model

In [7]:
import pickle
import numpy as np

def load_and_predict(input_data, model_path="knn_cancer_model.pkl", scaler_path="scaler.pkl"):
    """
    Load the pickled classifier and scaler, scale one sample, and describe the prediction.

    Parameters:
    input_data (array-like): One sample as a flat sequence of numeric feature
        values, in the same column order the scaler was fitted with.
    model_path (str): Path of the pickled classifier. Defaults to the file
        name used when the model was saved.
    scaler_path (str): Path of the pickled scaler. Defaults to the file name
        used when the scaler was saved.

    Returns:
    str: Prediction statement with the probability of the predicted class.

    Raises:
    ValueError: If input_data is not a non-empty 1-D numeric sequence, or its
        length differs from the number of features the scaler reports.
    FileNotFoundError: If either pickle file does not exist.
    """
    # Coerce up front so malformed input fails with a clear message instead of
    # an error from deep inside the scaler.
    features = np.asarray(input_data, dtype=float)
    if features.ndim != 1 or features.size == 0:
        raise ValueError(
            "input_data must be a non-empty 1-D sequence of feature values "
            f"for a single sample; got an array of shape {features.shape}."
        )

    # SECURITY: pickle.load can execute arbitrary code. Only point these paths
    # at artifacts produced by a trusted training run.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open(scaler_path, 'rb') as scaler_file:
        loaded_scaler = pickle.load(scaler_file)

    # Check the feature count when the scaler exposes it (fitted scikit-learn
    # transformers do, via n_features_in_).
    expected_features = getattr(loaded_scaler, "n_features_in_", None)
    if expected_features is not None and features.size != expected_features:
        raise ValueError(
            f"input_data has {features.size} features, but the scaler expects "
            f"{expected_features}."
        )

    # Shape the sample as a single-row 2-D batch.
    features = features.reshape(1, -1)

    # If the scaler was fitted on a DataFrame it remembers the column names;
    # passing a bare array then triggers scikit-learn's "X does not have valid
    # feature names" warning. Re-attach the names to keep the call clean.
    feature_names = getattr(loaded_scaler, "feature_names_in_", None)
    if feature_names is not None:
        import pandas as pd
        features = pd.DataFrame(features, columns=feature_names)

    # Scale input data; force a plain array for the model
    input_scaled = np.asarray(loaded_scaler.transform(features))

    # Predict using the loaded model
    prediction = loaded_model.predict(input_scaled)[0]
    probabilities = loaded_model.predict_proba(input_scaled)[0]

    # Assign labels
    labels = {0: "Benign", 1: "Malignant"}

    # predict_proba columns follow the order of classes_, so look up the
    # column for the predicted label rather than using the label as an index.
    class_position = list(loaded_model.classes_).index(prediction)
    confidence = probabilities[class_position] * 100

    # Return formatted result
    return f"The person has '{labels[prediction]}' cancer with a probability of {confidence:.2f}%."

# Example usage: a single 30-value sample. Its leading and trailing values
# match the first row shown in the Step 1 preview (diagnosis 'M').
sample_data = [
    17.99, 10.38, 122.8, 1001, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
    1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
    25.38, 17.33, 184.6, 2019, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189,
]

print(load_and_predict(sample_data))




The person has 'Malignant' cancer with a probability of 100.00%.
