In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score

import pandas as pd

In [2]:
# Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target


# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=13)),  # feature selection
    ('svc', SVC())
])


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a parameter grid for the SVC and SelectKBest
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'linear'],
    'feature_selection__k': [10, 11, 12, 13],
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()]
}

# Apply k-fold cross-validation using GridSearchCV on the pipeline
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1)


# Fit the GridSearchCV object on the training data
grid_search.fit(X_train, y_train)

In [3]:

# Convert cv_results_ into a DataFrame
cv_results_df = pd.DataFrame(grid_search.cv_results_)

# Select the columns related to the parameters and the mean test score (accuracy)
results_df = cv_results_df.loc[:, ['params', 'mean_test_score']]

# Convert the 'params' column into separate columns
params_df = pd.json_normalize(results_df['params'])

# Concatenate the parameters DataFrame and the accuracy column
results_df = pd.concat([params_df, results_df['mean_test_score']], axis=1)

# Rename the 'mean_test_score' column to 'Accuracy'
results_df.rename(columns={'mean_test_score': 'Accuracy'}, inplace=True)

# Print the DataFrame
results_df.sort_values(by="Accuracy", ascending=False)

Unnamed: 0,feature_selection__k,scaler,svc__C,svc__gamma,svc__kernel,Accuracy
353,13,RobustScaler(),0.1,1.000,linear,0.986207
355,13,RobustScaler(),0.1,0.100,linear,0.986207
357,13,RobustScaler(),0.1,0.010,linear,0.986207
359,13,RobustScaler(),0.1,0.001,linear,0.986207
310,13,StandardScaler(),10.0,0.001,rbf,0.986207
...,...,...,...,...,...,...
228,12,MinMaxScaler(),0.1,0.010,rbf,0.401724
226,12,MinMaxScaler(),0.1,0.100,rbf,0.401724
198,12,StandardScaler(),0.1,0.001,rbf,0.401724
174,11,RobustScaler(),1.0,0.001,rbf,0.401724


In [4]:
# Print the best parameters found by GridSearchCV
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'feature_selection__k': 13, 'scaler': StandardScaler(), 'svc__C': 10, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define two pipelines
pipe_svc = Pipeline([
    ('scaler', StandardScaler()),  # Default scaler
    ('feature_selection', SelectKBest(score_func=f_classif, k=13)),
    ('svc', SVC())  # SVC classifier
])

pipe_rf = Pipeline([
    ('scaler', StandardScaler()),  # Default scaler
    ('feature_selection', SelectKBest(score_func=f_classif, k=13)),
    ('rf', RandomForestClassifier())  # Random Forest classifier
])

# Define the parameter grids
param_grid_svc = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'linear'],
    'feature_selection__k': [10, 11, 12, 13],
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()]
}

param_grid_rf = {
    'rf__n_estimators': [10, 50, 100, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'feature_selection__k': [10, 11, 12, 13],
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()]
}

# Create GridSearchCV objects
grid_svc = GridSearchCV(pipe_svc, param_grid_svc)
grid_rf = GridSearchCV(pipe_rf, param_grid_rf)

# Now, you can fit and score each GridSearchCV object separately and compare the results

In [6]:
grid_svc.fit(X_train, y_train)
# Convert cv_results_ into a DataFrame
cv_results_df = pd.DataFrame(grid_svc.cv_results_)

# Select the columns related to the parameters and the mean test score (accuracy)
results_df = cv_results_df.loc[:, ['params', 'mean_test_score']]

# Convert the 'params' column into separate columns
params_df = pd.json_normalize(results_df['params'])

# Concatenate the parameters DataFrame and the accuracy column
results_df = pd.concat([params_df, results_df['mean_test_score']], axis=1)

# Rename the 'mean_test_score' column to 'Accuracy'
results_df.rename(columns={'mean_test_score': 'Accuracy'}, inplace=True)

# Print the DataFrame
results_df.sort_values(by="Accuracy", ascending=False).head(6)

Unnamed: 0,feature_selection__k,scaler,svc__C,svc__gamma,svc__kernel,Accuracy
328,13,MinMaxScaler(),1.0,1.0,rbf,0.985961
136,11,MinMaxScaler(),1.0,1.0,rbf,0.985961
232,12,MinMaxScaler(),1.0,1.0,rbf,0.985961
40,10,MinMaxScaler(),1.0,1.0,rbf,0.985961
374,13,RobustScaler(),10.0,0.001,rbf,0.985714
364,13,RobustScaler(),1.0,0.01,rbf,0.985714
