In [1]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
import os

# Move to the project root so that the relative paths used by the following
# cells ('config/...', 'data/...', 'models/...') resolve.
# The move is guarded by a root marker: an unconditional os.chdir('..') climbs
# one more level every time this cell is re-run, which breaks those paths.
# NOTE(review): assumes the kernel starts exactly one level below the project
# root and that 'config/config.yaml' only exists at the root -- confirm.
if not os.path.isfile('config/config.yaml'):
    os.chdir('..')

# Show the resulting working directory
os.getcwd()

'/home/juan/proyectos/docker/learning-docker/basic-ml-proj'

In [3]:
# Turn the raw iris file into the processed feature file
from src.data.preprocessing import preprocess_data

# Input and output locations, relative to the current working directory
raw_iris_path = 'data/raw/iris.parquet'
processed_iris_path = 'data/features/processed_iris.parquet'

# NOTE(review): n_components is presumably the number of derived feature
# columns the helper keeps -- confirm in src/data/preprocessing.py
preprocess_data(raw_iris_path, processed_iris_path, n_components=2)

In [6]:
import yaml

# Load the project configuration.
# The encoding is stated explicitly because the file holds non-ASCII text
# (the author name); without it open() falls back to the platform's default
# encoding, which is not UTF-8 everywhere.
with open('config/config.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Display the parsed configuration
config

{'project': {'name': 'ml-container_project',
  'version': '0.1.0',
  'description': 'Basic ML model to train containerization',
  'author': 'Juan Felipe Agudelo Vélez'},
 'paths': {'data': {'absulute_path': None,
   'raw': './data/raw/',
   'features': './data/features/'},
  'models': './models/trained/',
  'logs': './logs/'},
 'model': {'trainParams': {'param_grid': {'n_neighbors': [2, 3, 4],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean']},
   'n_folds': 4,
   'eval_metrics': 'f1_weighted',
   'refit': True}}}

In [13]:
from sklearn.model_selection import train_test_split

# Seed for the shuffled hold-out split. Without it every run draws a different
# partition, so the CV scores and the saved model are not reproducible.
RANDOM_STATE = 42

# Load the processed feature table
features = pd.read_parquet('data/features/processed_iris.parquet')

# Separate the predictors (first two columns) from the target (last column).
# NOTE(review): the 2 presumably mirrors n_components=2 used during
# preprocessing -- keep both in sync.
X, y = features.iloc[:, :2], features.iloc[:, -1]

# Hold out 10% for testing; the remaining 90% is used for cross-validation
# (train_size defaults to the complement of test_size). The split is
# stratified on the target so both parts keep the class proportions.
X_cv, x_test, y_cv, y_test = train_test_split(
    X, y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Training settings come from the 'model' -> 'trainParams' section of the config
train_params = config['model']['trainParams']

# Base estimator whose hyper-parameters are searched
model = KNeighborsClassifier()

# Exhaustive cross-validated search over the configured parameter grid
grid_search = GridSearchCV(
    model,
    param_grid=train_params['param_grid'],
    scoring=train_params['eval_metrics'],
    refit=train_params['refit'],
    cv=train_params['n_folds'],
)

grid_search.fit(X_cv, y_cv)

# Best parameter combination found by the search
best_params = grid_search.best_params_

# Estimator refitted with the best parameters (available because refit is enabled)
best_estimator = grid_search.best_estimator_

In [16]:
# Collect the per-candidate cross-validation results in a table
cv_results_df = pd.DataFrame(grid_search.cv_results_)

# Save the table. The target folder is created first so the write does not
# fail when the directory is missing (e.g. on a fresh checkout).
cv_results_path = 'models/hyperparameters/cv_model_results.csv'
os.makedirs(os.path.dirname(cv_results_path), exist_ok=True)
cv_results_df.to_csv(cv_results_path, index=False)

# Display the results
cv_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002098,0.00059,0.004708,0.000858,minkowski,2,uniform,"{'metric': 'minkowski', 'n_neighbors': 2, 'wei...",0.940498,0.970588,0.940686,0.969634,0.955352,0.014764,9
1,0.001337,0.000247,0.002451,0.000132,minkowski,2,distance,"{'metric': 'minkowski', 'n_neighbors': 2, 'wei...",0.970476,0.970588,1.0,0.907368,0.962108,0.033816,7
2,0.001273,0.000142,0.003695,0.000472,minkowski,3,uniform,"{'metric': 'minkowski', 'n_neighbors': 3, 'wei...",0.970476,0.941176,0.970527,0.969634,0.962954,0.012578,1
3,0.001107,9.2e-05,0.002213,0.000205,minkowski,3,distance,"{'metric': 'minkowski', 'n_neighbors': 3, 'wei...",0.940498,0.941176,0.970527,0.938889,0.947773,0.013164,11
4,0.001212,0.000197,0.003648,0.000388,minkowski,4,uniform,"{'metric': 'minkowski', 'n_neighbors': 4, 'wei...",0.970476,0.941176,0.970527,0.969634,0.962954,0.012578,1
5,0.001376,0.000288,0.002372,0.000201,minkowski,4,distance,"{'metric': 'minkowski', 'n_neighbors': 4, 'wei...",0.970476,0.941176,1.0,0.938889,0.962635,0.02491,5
6,0.001206,0.000121,0.003586,0.000699,euclidean,2,uniform,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",0.940498,0.970588,0.940686,0.969634,0.955352,0.014764,9
7,0.001539,0.000673,0.002456,0.000495,euclidean,2,distance,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",0.970476,0.970588,1.0,0.907368,0.962108,0.033816,7
8,0.001149,0.000154,0.003593,0.000499,euclidean,3,uniform,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.970476,0.941176,0.970527,0.969634,0.962954,0.012578,1
9,0.001371,0.000474,0.002292,0.000436,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",0.940498,0.941176,0.970527,0.938889,0.947773,0.013164,11


In [21]:
import json

from joblib import dump, load

# Write the winning hyper-parameters as indented JSON
with open('models/hyperparameters/best_params.json', 'w') as params_file:
    params_file.write(json.dumps(best_params, indent=4))

# Persist the best estimator; dump() returns the list of written file names,
# which is shown as the cell output
dump(best_estimator, 'models/trained/knn_best_estimator.z')

['models/trained/knn_best_estimator.z']