In [18]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Load the Iris dataset as pandas objects; the combined frame holds the four
# measurement columns plus the integer 'target' label column.
data = load_iris(as_frame=True)
df = data['frame']
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [19]:
# Separate the four measurement columns (X) from the class label (y), then hold
# out 20% of the rows for testing. The fixed seed keeps the split reproducible.
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
X = df[feature_columns]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Chain scaling and the classifier into one estimator. KNN compares samples by
# distance, so features are standardized first; the pipeline fits the scaler and
# the model together. The step names ('scaler', 'knn') are what the
# '<step>__<param>' keys in the search spaces below refer to.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('knn', KNeighborsClassifier())
pipe = Pipeline(steps=[scaling_step, classifier_step])

### Hyperparameter Tuning
Hyperparameter tuning is the process of finding the best settings (hyperparameters) for a model so that it performs best on unseen data.
Hyperparameters are not learned from the data — you set them yourself before training.

Examples:
- Random Forest → number of trees, max depth
- KNN → number of neighbors
- SVM → C, gamma
- XGBoost → learning rate, max depth
- Neural Networks → learning rate, batch size, number of layers

Why Hyperparameter Tuning Is Needed:

Two models using the same algorithm but different hyperparameters can have completely different performance.

Example:
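- KNN with K=1 → Overfitting (each prediction follows a single, possibly noisy, neighbor)
- KNN with K=20 → Stable, smooth boundary (but a K that is too large can underfit)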

Good tuning helps the model generalize as well as possible.
Below are two common hyperparameter tuning methods:

#### Grid Search
Exhaustively tests every combination.

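Pros:
- Finds the best combination within the grid
- Simple to understand

Cons:
- Very slow when the grid is large (the number of combinations multiplies with every added hyperparameter)
- Not ideal for large search spaces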

In [21]:
# Search space for Grid Search. The 'knn__' prefix routes each setting to the
# pipeline's 'knn' step. 17 neighbor counts (3..19) x 2 weightings x 2 metrics
# gives 68 combinations, each scored with 5-fold cross-validation.
param_grid = dict(
    knn__n_neighbors=np.arange(3, 20),
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan'],
)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train, y_train)

# Report the winning combination, its mean cross-validated accuracy, and the
# accuracy on the held-out test split.
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)
print("Test Accuracy:", grid.score(X_test, y_test))

Best Params: {'knn__metric': 'euclidean', 'knn__n_neighbors': np.int64(9), 'knn__weights': 'distance'}
Best Score: 0.9583333333333334
Test Accuracy: 1.0


#### Random Search
Randomly samples a fixed number of combinations from the given ranges instead of trying them all.

Pros:
- Much faster
- Often finds near-optimal values

Cons:
- Might skip the exact best combination

In [22]:
# Search space for Random Search: the same 68 candidate combinations
# (17 neighbor counts x 2 weightings x 2 metrics), addressed through the
# pipeline's 'knn' step.
param_dist = dict(
    knn__n_neighbors=np.arange(3, 20),
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan'],
)

# Only 20 randomly drawn combinations are evaluated, each with 5-fold
# cross-validation. The fixed seed makes the draw repeatable.
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the best sampled combination, its mean cross-validated accuracy, and
# the accuracy on the held-out test split.
print("Best Params:", random_search.best_params_)
print("Best Score:", random_search.best_score_)
print("Test Accuracy:", random_search.score(X_test, y_test))

Best Params: {'knn__weights': 'uniform', 'knn__n_neighbors': np.int64(9), 'knn__metric': 'manhattan'}
Best Score: 0.9583333333333334
Test Accuracy: 1.0
