# Grid search with Cross-validation (not using GridSearchCV)

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset 

In [4]:
# Load the diabetes toy dataset; later cells read its .data and .target attributes.
diabetes = datasets.load_diabetes()

- We will use a Lasso regression model.
- We will use $R^2$ as the evaluation metric.
- Conduct a grid search over the hyperparameter `alpha` with 5-fold cross-validation.
- Finally, check the generalization performance on the test set with the chosen hyperparameter.

In [5]:
# Hold out 20% of the samples as the final test set; the remaining
# train+validation portion is what the cross-validated grid search works on.
features, targets = diabetes.data, diabetes.target
X_trainval, X_test, y_trainval, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=0
)

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [6]:
scaler = StandardScaler()

# R^2 can be negative, so the running best must start at -inf rather than 0.
# With 0 as the sentinel, a grid whose candidates all score <= 0 would never
# bind best_parameters, and the prints below (and the later refit) would fail
# with a NameError.
best_score = -np.inf
best_parameters = None

for alpha in np.logspace(-4, 1, 30):  # 30 candidates, log-spaced between 1e-4 and 1e1

    scores_val = []
    for train_idx, val_idx in kfold.split(X_trainval, y_trainval):  # inner loop: one pass per cross-validation fold

        # get X_train, y_train, X_valid, y_valid for this fold
        X_train = X_trainval[train_idx]
        y_train = y_trainval[train_idx]
        X_valid = X_trainval[val_idx]
        y_valid = y_trainval[val_idx]

        # scale X_train, X_valid: the scaler is fitted on the training fold only,
        # so no statistics from the validation fold are used during training
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_valid_scaled = scaler.transform(X_valid)

        # training is performed with the Lasso set to the current alpha.
        lasso = Lasso(alpha=alpha, random_state=0, max_iter=10000)
        lasso.fit(X_train_scaled, y_train)

        # get y_valid_hat with the trained model & store r2 score in scores_val
        y_valid_hat = lasso.predict(X_valid_scaled)
        scores_val.append(r2_score(y_valid, y_valid_hat))

    mean_score = np.mean(scores_val)  # cross-validation score = mean R^2 over the folds

    # When mean_score is higher than the current best score, best_score is updated
    # and the hyperparameter at that time is saved (ties keep the earlier alpha).
    if mean_score > best_score:
        best_score = mean_score
        best_parameters = {'alpha': alpha}

print("Best score on validation set: {:.7f}".format(best_score))
print("Best hyperparameters: {}".format(best_parameters))

Best score on validation set: 0.5074650
Best hyperparameters: {'alpha': 0.6210169418915616}


In [7]:
# Refit the scaler on the whole train+validation set, then apply that same
# scaling to the test set.
X_trainval_scaled = scaler.fit_transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

# Train Lasso again on all train+validation data with the best hyperparameter.
lasso = Lasso(**best_parameters, random_state=0, max_iter=10000).fit(
    X_trainval_scaled, y_trainval
)

# Final performance: R^2 of the refitted model on the held-out test set.
y_test_hat = lasso.predict(X_test_scaled)
test_score = r2_score(y_test, y_test_hat)
print("Test set score with best hyperparameters: {:.7f}".format(test_score))

Test set score with best hyperparameters: 0.3317134


# Evaluation Metric

- Get the probability of belonging to each class.
- Draw the confusion matrix and calculate the recall and precision.
- Change the threshold and check how the metric values change.

In [8]:
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset; later cells read its .data and .target attributes.
cancer = load_breast_cancer()
# Binary classification task: each sample is labelled malignant or benign.

In [9]:
from sklearn.model_selection import train_test_split

# Stratify on the labels so the training and validation sets keep the class
# proportions of the full dataset; random_state fixes the shuffle.
cancer_X, cancer_y = cancer.data, cancer.target
X_train, X_val, y_train, y_val = train_test_split(
    cancer_X, cancer_y, stratify=cancer_y, random_state=42
)

In [10]:
# train KNN model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# KNN relies on distances between samples, so the features are standardized
# first. The scaler is fitted on the training split only and then reused,
# unchanged, for the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# k = 30 neighbours vote on each prediction
clf = KNeighborsClassifier(n_neighbors=30)
clf.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=30)

We can get the predicted class with **.predict**.

In [11]:
# Hard class predictions (0 or 1) for every instance in the scaled validation set.
y_val_hat = clf.predict(X_val_scaled)
y_val_hat

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

To calculate the probability of belonging to each class, **.predict_proba** can be used.

In [12]:
# One row per validation instance, one column per class.
y_val_prob = clf.predict_proba(X_val_scaled)
y_val_prob # probability of belonging to the negative (column 0) and positive (column 1) class for each instance

array([[0.03333333, 0.96666667],
       [1.        , 0.        ],
       [0.3       , 0.7       ],
       [0.16666667, 0.83333333],
       [0.36666667, 0.63333333],
       [1.        , 0.        ],
       [0.93333333, 0.06666667],
       [0.96666667, 0.03333333],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.2       , 0.8       ],
       [0.03333333, 0.96666667],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.03333333, 0.96666667],
       [0.        , 1.        ],
       [0.96666667, 0.03333333],
       [0.93333333, 0.06666667],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.93333333, 0.06666667],
       [0.03333333, 0.96666667],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.03333333, 0.96666667],
       [0.06666667, 0.93333333],
       [1.        , 0.        ],
       [0.

In [13]:
# To get the probability of the positive class, we just need the second column.
# NOTE: the variable name is a misspelling of "positive"; it is kept as-is
# because later cells reference it under this name.
prob_postivie = y_val_prob[:,1]

In [14]:
# Thresholding the positive-class probability at 0.5 gives the same result as ".predict"
# (True corresponds to class 1, False to class 0).
prob_postivie > 0.5

array([ True, False,  True,  True,  True, False, False, False, False,
        True,  True,  True, False, False,  True,  True, False, False,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False, False, False,  True, False, False,
       False,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False, False,  True, False,
        True, False, False, False,  True,  True, False,  True,  True,
       False,  True, False, False,  True,  True,  True, False,  True,
        True,  True, False,  True,  True, False,  True, False,  True,
        True, False,

Now, let's change the threshold and check how the evaluation metrics change.

In [15]:
# Lower the decision threshold to 0.3: an instance is predicted positive when its
# positive-class probability exceeds 0.3. This rebinds y_val_hat (previously the
# output of .predict) to a boolean array.
y_val_hat = prob_postivie > 0.3

In [17]:
from sklearn.metrics import confusion_matrix
# get confusion matrix using the confusion_matrix function
# (rows are the true labels, columns are the predicted labels)
cm = confusion_matrix(y_val, y_val_hat)
cm

array([[44,  9],
       [ 0, 90]], dtype=int64)

Let's say 1 (True) is the positive class.  
Calculate the recall and precision.

In [18]:
# Recall = true positives / all actual positives (row 1 of the confusion matrix).
true_pos = cm[1, 1]
actual_pos = cm[1, :].sum()
recall = true_pos / actual_pos
recall

1.0

In [19]:
# Precision = true positives / all predicted positives (column 1 of the confusion matrix).
true_pos = cm[1, 1]
predicted_pos = cm[:, 1].sum()
precision = true_pos / predicted_pos
precision

0.9090909090909091

**By changing the threshold yourself, check how the precision and recall values change.**