# Grid Search for optimal parameters
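This notebook runs a grid search over K-nearest-neighbors (KNN) hyperparameters for each training dataset, where each dataset was generated with a different number of adjacent wells, and compares the cross-validated accuracy to find the optimal KNN parameters along with the optimal number of adjacent wells.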

In [None]:
import glob
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so library deprecation notices and
# other diagnostics are hidden as well.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Collect the path of every generated training CSV.
# Edit the pattern so it points at the directory that holds the datasets
# built with the different numbers of adjacent wells.
training_files = list(glob.iglob(r"F:\Geology\WSGS\Projects\jupyter\*.csv"))

In [None]:
# For every training file: grid-search the KNN hyper-parameters on an 80 %
# training split, then score the winning configuration with 10-fold
# cross-validation. Results are collected in two parallel lists that the
# plotting cells below read.
accuracy_measured = []  # one array of 10 fold accuracies per training file
num_neighbors = []  # number of adjacent wells (int) for the matching entry above
grid_params = {
    "n_neighbors": [5, 10, 20, 40, 80],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 30],
}


def _adjacent_well_count(path):
    """Return the integer that prefixes the file name of ``path``.

    The training files are expected to be named ``<count><suffix>.csv``, where
    ``<count>`` is the number of adjacent wells used to build the dataset.
    Working on the base name keeps the parsing independent of how long the
    directory part of the path is.

    Raises:
        ValueError: if the file name does not start with a digit.
    """
    file_name = os.path.basename(path)
    leading_digits = re.match(r"\d+", file_name)
    if leading_digits is None:
        raise ValueError(
            f"cannot read the number of adjacent wells from {file_name!r}"
        )
    return int(leading_digits.group())


# glob returns paths in arbitrary order; iterate by ascending well count so the
# collected results (and the plots built from them) are ordered along the x-axis.
for no_of_neighbors, file in sorted(
    (_adjacent_well_count(path), path) for path in training_files
):
    print(f'reading {os.path.basename(file)}')
    dataset = pd.read_csv(file, index_col=[0])
    features = dataset.drop('class', axis=1)
    labels = dataset['class']

    # Only the training part of the split is used: it is the data the grid
    # search tunes on. The fixed random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=86,
    )
    gs = GridSearchCV(
        KNeighborsClassifier(), grid_params, verbose=8, cv=5, n_jobs=7
    )
    gs.fit(X_train, y_train)

    # Fresh, unfitted classifier configured with the best parameters found.
    neigh = KNeighborsClassifier(**gs.best_params_)

    # NOTE(review): this cross-validation runs on the full dataset, including
    # the rows the hyper-parameters were tuned on, and the held-out test split
    # is never scored -- the reported accuracy may be slightly optimistic.
    cved = cross_val_score(neigh, features, labels, cv=10, scoring='accuracy')
    accuracy_measured.append(cved)
    num_neighbors.append(no_of_neighbors)

In [None]:
# Draw one accuracy-per-fold curve for every training file and collect the
# mean accuracy of each file for the summary plot in the next cell.
fig, ax = plt.subplots()
mean_accuracy = []
for well_count, fold_scores in zip(num_neighbors, accuracy_measured):
    ax.plot(fold_scores, label=str(well_count) + ' Neighbors')
    mean_accuracy.append(fold_scores.mean().round(4))
# Build the legend once, after all curves exist (and only if there are any).
if mean_accuracy:
    ax.legend()
ax.set_xlabel('Fold Number')
ax.set_ylabel('Accuracy');

In [None]:
# Summary figure: mean cross-validated accuracy for each number of adjacent wells.
fig, ax = plt.subplots()
ax.plot(num_neighbors, mean_accuracy)
ax.set_xlabel('number of adjacent wells')
ax.set_ylabel('mean cross-validated accuracy')