In [1]:
import pandas as pd

# Cargar dataset

In [2]:
# Read the train and test splits from the parent directory
train, test = (pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv'))

In [14]:
# Drop the 'smoke', 'tue' and 'bad_habits' columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
train = train.drop(columns=['smoke', 'tue', 'bad_habits'], errors='ignore')
test = test.drop(columns=['smoke', 'tue', 'bad_habits'], errors='ignore')

# Encodear variable objetivo

In [15]:
from sklearn.preprocessing import OrdinalEncoder

# Target classes listed from lowest to highest weight category; the position
# in this list becomes the encoded value (0.0 .. 6.0).
nobeyesdad_categories = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]

# Initialise the ordinal encoder with the explicit category order
ordinal_encoder = OrdinalEncoder(categories=[nobeyesdad_categories])

# Fit on the training target only, then reuse the fitted encoder for the test
# target so the test split never takes part in fitting.
train[['nobeyesdad']] = ordinal_encoder.fit_transform(train[['nobeyesdad']])
test[['nobeyesdad']] = ordinal_encoder.transform(test[['nobeyesdad']])

In [26]:
# Only the last expression of a cell is rendered automatically, so the encoded
# label values have to be displayed explicitly to show up in the output.
display(test['nobeyesdad'].unique())
test

Unnamed: 0,gender,age,height,weight,family_history_with_overweight,favc,fcvc,ncp,caec,ch2o,scc,faf,calc,mtrans,nobeyesdad
0,1,21,177.0,116.0,1,1,2.00,3.00,1,2.00,0,1.39,1,2,5.0
1,0,18,167.0,91.0,1,1,1.00,3.00,2,1.00,0,0.00,1,2,4.0
2,0,19,154.0,42.0,0,1,3.00,1.00,1,2.00,0,0.00,0,2,0.0
3,0,19,165.0,82.0,1,1,3.00,3.00,1,1.00,0,0.00,1,2,4.0
4,0,26,162.0,110.0,1,1,3.00,3.00,1,2.69,0,0.00,1,2,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,1,25,181.0,85.6,0,1,1.45,3.98,1,2.14,0,0.04,0,0,2.0
424,1,37,174.0,83.2,1,1,2.74,1.05,1,2.26,0,0.88,0,0,3.0
425,1,20,170.0,65.0,0,1,2.00,3.00,1,2.00,0,0.00,0,2,1.0
426,1,22,179.0,89.9,1,1,1.77,1.89,1,2.00,0,0.00,1,2,3.0


# KNN
Reference: https://www.kaggle.com/code/sashatarakanova/knn-with-hyperparameter-tuning-using-optuna

In [27]:
# Separate the encoded target from the feature matrix
y_train = train['nobeyesdad']
X_train = train.drop(columns=['nobeyesdad'])

In [28]:
import optuna
from optuna.samplers import TPESampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
# Which hyperparameters to tune: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

def objective(trial):
    """Evaluate one Optuna trial of a scaler + KNN pipeline.

    The scaler type and the KNN hyperparameters are drawn from ``trial``; the
    resulting pipeline is scored with 10-fold stratified cross-validation on
    the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        Mean accuracy over the 10 folds.
    """
    # -- Choose the scaler; any name other than minmax/standard means RobustScaler
    scaler_classes = {"minmax": MinMaxScaler, "standard": StandardScaler}
    scaler_name = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    scaler = scaler_classes.get(scaler_name, RobustScaler)()

    # -- Sample the KNN hyperparameters (same sampling order as the search space is declared)
    # NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' should be
    # equivalent to 'euclidean' -- confirm whether both choices are intended.
    knn_params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 30),
        "weights": trial.suggest_categorical("weights", ['uniform', 'distance']),
        "metric": trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski']),
    }

    # -- Scaling lives inside the pipeline, so it is re-fitted on every CV training fold
    pipeline = make_pipeline(scaler, KNeighborsClassifier(**knn_params))

    # -- Cross-validate the pipeline and report the mean fold accuracy
    fold_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring='accuracy',
        cv=StratifiedKFold(n_splits=10),
    )
    return fold_scores.mean()



In [29]:
# Seeded TPE sampler so the hyperparameter search is reproducible
sampler = TPESampler(seed=42)

# Maximise the cross-validated accuracy returned by `objective` over 300 trials
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
)
study.optimize(func=objective, n_trials=300)

[I 2024-05-08 11:14:36,204] A new study created in memory with name: no-name-eaa28398-8c37-4478-bc72-3dc3209fffac
[I 2024-05-08 11:14:36,395] Trial 0 finished with value: 0.7991881664946681 and parameters: {'scalers': 'standard', 'n_neighbors': 18, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.7991881664946681.
[I 2024-05-08 11:14:36,587] Trial 1 finished with value: 0.7084279325765395 and parameters: {'scalers': 'robust', 'n_neighbors': 25, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 0 with value: 0.7991881664946681.
[I 2024-05-08 11:14:36,710] Trial 2 finished with value: 0.889938080495356 and parameters: {'scalers': 'robust', 'n_neighbors': 5, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 2 with value: 0.889938080495356.
[I 2024-05-08 11:14:36,880] Trial 3 finished with value: 0.7945063639490884 and parameters: {'scalers': 'standard', 'n_neighbors': 19, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 2 with val

In [30]:
# -- Have a look at the best trial
# The trial count is read from the study itself so the message stays correct
# if the number of trials passed to study.optimize() is changed.
print(f"Best trial out of {len(study.trials)} is:")
study.best_trial

Best trial out of 300 is:


FrozenTrial(number=22, state=TrialState.COMPLETE, values=[0.8911042311661508], datetime_start=datetime.datetime(2024, 5, 8, 11, 14, 39, 213846), datetime_complete=datetime.datetime(2024, 5, 8, 11, 14, 39, 338874), params={'scalers': 'robust', 'n_neighbors': 4, 'weights': 'distance', 'metric': 'manhattan'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scalers': CategoricalDistribution(choices=('minmax', 'standard', 'robust')), 'n_neighbors': IntDistribution(high=30, log=False, low=1, step=1), 'weights': CategoricalDistribution(choices=('uniform', 'distance')), 'metric': CategoricalDistribution(choices=('euclidean', 'manhattan', 'minkowski'))}, trial_id=22, value=None)

In [31]:
import plotly

# Optuna draws this chart with the optional plotly package, which must be
# installed in the kernel's environment (e.g. `%pip install plotly`).
optimization_history_fig = optuna.visualization.plot_optimization_history(study)
optimization_history_fig

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [None]:
# Relative importance of each tuned hyperparameter for the study's objective.
# NOTE(review): like the other optuna.visualization plots this presumably needs
# plotly installed -- confirm in the target environment.
param_importances_fig = optuna.visualization.plot_param_importances(study)
param_importances_fig