# Training multiple classifiers on wine dataset

## Importing dataset

In [21]:
from sklearn.datasets import load_wine
df = load_wine(as_frame=True)
df = df.frame

In [22]:
# For MNIST Dataset
# from sklearn.datasets import fetch_openml

# # Fetch the dataset
# mnist = fetch_openml('mnist_784', version=1)

# # Split data and labels
# X, y = mnist["data"], mnist["target"]

# # Convert labels to integers
# y = y.astype(int)

# import pandas as pd
# df_X = pd.DataFrame(X)
# df_y = pd.DataFrame(y)
# df = pd.concat([df_X, df_y], axis=1)

# df = df[:1000]
# df

# import matplotlib.pyplot as plt
# plt.imshow(df.iloc[942,:-1].values.reshape(28,28))

In [23]:
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [24]:
# print(df.isnull().sum())
df.shape

(178, 14)

In [25]:
df.sample()


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
162,12.85,3.27,2.58,22.0,106.0,1.65,0.6,0.6,0.96,5.58,0.87,2.11,570.0,2


In [26]:
# Assuming df is your DataFrame
missing_values = df.isnull().sum()

print(missing_values[missing_values > 0])

Series([], dtype: int64)


## Preprocessing using SKLEARN

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [28]:
trf1 = ColumnTransformer([
    ('impute', SimpleImputer(), slice(0, 13))
], remainder = 'passthrough')

In [29]:
trf2 = ColumnTransformer([
    ('impute2', SimpleImputer(strategy='most_frequent'), [13])
], remainder = 'passthrough')

In [30]:
trf1.fit_transform(df)


array([[  14.23,    1.71,    2.43, ...,    3.92, 1065.  ,    0.  ],
       [  13.2 ,    1.78,    2.14, ...,    3.4 , 1050.  ,    0.  ],
       [  13.16,    2.36,    2.67, ...,    3.17, 1185.  ,    0.  ],
       ...,
       [  13.27,    4.28,    2.26, ...,    1.56,  835.  ,    2.  ],
       [  13.17,    2.59,    2.37, ...,    1.62,  840.  ,    2.  ],
       [  14.13,    4.1 ,    2.74, ...,    1.6 ,  560.  ,    2.  ]])

In [31]:
trf2.fit_transform(df)

array([[0.000e+00, 1.423e+01, 1.710e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [0.000e+00, 1.320e+01, 1.780e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [0.000e+00, 1.316e+01, 2.360e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [2.000e+00, 1.327e+01, 4.280e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [2.000e+00, 1.317e+01, 2.590e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [2.000e+00, 1.413e+01, 4.100e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [32]:
# from sklearn.preprocessing import OneHotEncoder
# trf2 = ColumnTransformer([
#     ('ohe', OneHotEncoder(sparse=False, handle_unknown="ignore"),[1,6])
# ])

## Splitting X & Y and train_test_split

In [33]:
X = df.iloc[:,0:13]
y = df.iloc[:,13]


In [34]:
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [35]:
y.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [37]:
X_train.shape

(142, 13)

## Scaling Data and PCA

In [38]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [40]:
X_train.shape

(142, 3)

In [43]:
import plotly.express as px
y_train_plot = y_train.astype(str)
fig = px.scatter_3d(df, x = X_train[:,0],
                 y = X_train[:,1],
                 z = X_train[:,2],
                 color = y_train_plot)
fig.update_layout(
    margin=dict(l=20, r=20, b=20, t=20)
)

fig.show()

## Model Training and GridSearchCV

In [44]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier
from matplotlib import pyplot
import plotly.express as px
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.




In [45]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('XGB', XGBClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('GBM', GradientBoostingClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('ET', ExtraTreesClassifier()))
models.append(('BAG', BaggingClassifier()))
models.append(('LGBM', LGBMClassifier()))
models.append(('MLP', MLPClassifier()))

In [46]:
param_grids = {
    'LR': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs'],
        'penalty': ['l2'],
        'multi_class': ['ovr', 'multinomial']
    },
    'LDA': {
        'solver': ['svd', 'lsqr', 'eigen'],
        'shrinkage': [None, 'auto', 0.1, 0.5, 1.0]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },
    'CART': {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'NB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto'],
        'max_iter': [1000, 5000]
    },
    'XGB': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0]
    },
    'RF': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GBM': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10]
    },
    'ADA': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    },
    'ET': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'BAG': {
        'n_estimators': [10, 50, 100],
        'max_samples': [0.5, 0.7, 1.0],
        'max_features': [0.5, 0.7, 1.0]
    },
    'CAT': {
        'iterations': [100, 200, 500],
        'depth': [4, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'l2_leaf_reg': [3, 5, 7]
    },
    'LGBM': {
        'n_estimators': [50, 100, 200],
        'max_depth': [-1, 10, 20],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100]
    },
    'H2O': {
        'ntrees': [50, 100, 200],
        'max_depth': [5, 10, 15],
        'learn_rate': [0.01, 0.1, 0.2],
        'col_sample_rate': [0.6, 0.8, 1.0]
    },
    'MLP': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [1000, 2000]
    }
}


In [47]:
from sklearn.model_selection import GridSearchCV

# Initialize the results dictionary
results_dict = {}

# Iterate through each model and its parameter grid
for model_name, model in models:
    if model_name in param_grids:
        print(f"Running GridSearchCV for {model_name}...")
        param_grid = param_grids[model_name]

        # Specify multiple metrics for scoring
        scoring = ['precision_macro', 'recall_macro', 'accuracy', 'f1_macro']

        grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                                   scoring=scoring, refit='accuracy',
                                   return_train_score=True, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # for multiple metric evaluation, the attributes best_index_, best_score_ and best_params_
        # will only be available if refit is set and all of them will be determined w.r.t this specific scorer.
        # see https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.GridSearchCV.html

        # Get the best model and best parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # Access other metrics for the best parameters
        # Find the index of the best score for accuracy
        best_index = grid_search.best_index_

        # Get the other metrics from the cv_results_
        best_precision = grid_search.cv_results_['mean_test_precision_macro'][best_index]
        best_recall = grid_search.cv_results_['mean_test_recall_macro'][best_index]
        best_f1 = grid_search.cv_results_['mean_test_f1_macro'][best_index]

        # Store the results in the dictionary
        results_dict[model_name] = {
            'Best Params': best_params,
            'Best Accuracy': best_score,
            'Best Precision': best_precision,
            'Best Recall': best_recall,
            'Best F1 Score': best_f1
        }
    else:
        print(f"No parameter grid defined for {model_name}. Skipping...")

# Display the results
for model_name, metrics in results_dict.items():
    print(f"{model_name}: {metrics}")


Running GridSearchCV for LR...




25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1267, in fit
    multi_class = _check_multi_class(multi_class, solver, len(self.classes_))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 99, in _c

Running GridSearchCV for LDA...




20 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/discriminant_analysis.py", line 629, in fit
    raise NotImplementedError("shrinkage not supported with 'svd' solver.")
NotImplementedError: shrinkage not supported with 'svd' solver.



One or more of the test scores are n

Running GridSearchCV for KNN...
Running GridSearchCV for CART...
Running GridSearchCV for NB...
Running GridSearchCV for SVM...
Running GridSearchCV for XGB...
Running GridSearchCV for RF...
Running GridSearchCV for GBM...
Running GridSearchCV for ADA...






Running GridSearchCV for ET...
Running GridSearchCV for BAG...
Running GridSearchCV for LGBM...



invalid value encountered in cast



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 147
[LightGBM] [Info] Number of data points in the train set: 142, number of used features: 3
[LightGBM] [Info] Start training from score -1.242255
[LightGBM] [Info] Start training from score -0.828693
[LightGBM] [Info] Start training from score -1.292265
Running GridSearchCV for MLP...



invalid value encountered in cast



LR: {'Best Params': {'C': 0.1, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'lbfgs'}, 'Best Accuracy': 0.9645320197044335, 'Best Precision': 0.9656630406630408, 'Best Recall': 0.968019943019943, 'Best F1 Score': 0.9658729752770674}
LDA: {'Best Params': {'shrinkage': None, 'solver': 'svd'}, 'Best Accuracy': 0.9573891625615765, 'Best Precision': 0.9570370370370369, 'Best Recall': 0.9670940170940172, 'Best F1 Score': 0.9592198885702722}
KNN: {'Best Params': {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}, 'Best Accuracy': 0.9857142857142858, 'Best Precision': 0.9851851851851852, 'Best Recall': 0.9893162393162394, 'Best F1 Score': 0.9865916453537936}
CART: {'Best Params': {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}, 'Best Accuracy': 0.9714285714285713, 'Best Precision': 0.9707912457912459, 'Best Recall': 0.9754273504273504, 'Best F1 Score': 0.9724612105711851}
NB: {'Best Params': {'var_smoothing': 1e-09

In [None]:
# # Initialize the dictionary to store results
# results_dict = {}
# scoring = ['precision_macro', 'recall_macro', 'accuracy', 'f1_macro']

# for name, model in models:
#     kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
#     cv_result = cross_validate(model, X, y, scoring=scoring)
#     # Calculate the mean score for each metric
#     precision = cv_result['test_precision_macro'].mean()
#     recall = cv_result['test_recall_macro'].mean()
#     accuracy = cv_result['test_accuracy'].mean()
#     f1 = cv_result['test_f1_macro'].mean()
#     # Store the metrics in the dictionary as a comma-separated string
#     results_dict[name] = f"{precision}, {recall}, {accuracy}, {f1}"

# # Display the results
# results_dict


## Plotting results

In [48]:
import plotly.express as px
import pandas as pd

# Create a DataFrame from the results dictionary
data = {
    'Model': list(results_dict.keys()),
    'Precision': [metrics['Best Precision'] for metrics in results_dict.values()],
    'Recall': [metrics['Best Recall'] for metrics in results_dict.values()],
    'Accuracy': [metrics['Best Accuracy'] for metrics in results_dict.values()],
    'F1 Score': [metrics['Best F1 Score'] for metrics in results_dict.values()]
}

plot_df = pd.DataFrame(data)

# Melt the DataFrame to have a long format suitable for a grouped bar chart
df_melted = plot_df.melt(id_vars='Model', var_name='Metric', value_name='Score')

# Create a grouped bar chart
fig = px.bar(
    df_melted,
    x='Model',
    y='Score',
    color='Metric',
    barmode='group',
    labels={'Score': 'Score', 'Model': 'Model'},
    title='Comparison of Metrics for Different Models',
    log_y=True  # Use log scale for better visibility if scores vary widely
)

# Show the plot
fig.show()


In [None]:
# import plotly.express as px
# import pandas as pd

# # Create a DataFrame from the results dictionary
# data = {
#     'Model': list(results_dict.keys()),
#     'Precision': [float(value.split(',')[0]) for value in results_dict.values()],
#     'Recall': [float(value.split(',')[1]) for value in results_dict.values()],
#     'Accuracy': [float(value.split(',')[2]) for value in results_dict.values()],
#     'F1 Score': [float(value.split(',')[3]) for value in results_dict.values()]
# }

# df = pd.DataFrame(data)

# # Map each model to a numeric ID for coloring purposes
# model_ids = {name: idx for idx, name in enumerate(df['Model'])}
# df['Model_ID'] = df['Model'].map(model_ids)


# # Display the mapping of Model IDs to Model Names
# print("Model ID to Model Name mapping:")
# for model, model_id in model_ids.items():
#     print(f"Model ID {model_id}: {model}")

# # Create a parallel coordinates plot
# fig = px.parallel_coordinates(
#     df,
#     dimensions=['Precision', 'Recall', 'Accuracy', 'F1 Score'],
#     color='Model_ID',
#     color_continuous_scale=px.colors.diverging.Tealrose,
#     labels={'Precision': 'Precision', 'Recall': 'Recall', 'Accuracy': 'Accuracy', 'F1 Score': 'F1 Score'}
# )

# # Show the plot
# fig.show()


Model ID to Model Name mapping:
Model ID 0: LR
Model ID 1: LDA
Model ID 2: NB
Model ID 3: XGB


In [50]:

# # Create a DataFrame for the parallel coordinates plot
# metrics_df = pd.DataFrame({
#     'Model': names,
#     'Accuracy': scoring['test_accuracy'],
#     'Precision': scoring['test_precision_macro'],
#     'Recall': scoring['test_recall_macro'],
#     'F1 Score': scoring['test_f1_macro']

# })

# # Create parallel coordinates plot using Plotly
# fig = px.parallel_coordinates(metrics_df,
#                               dimensions=['Mean Accuracy', 'Std Dev'],
#                               color='Mean Accuracy',
#                               labels={'Mean Accuracy': 'Mean Accuracy',
#                                       'Std Dev': 'Standard Deviation'},
#                               color_continuous_scale=px.colors.sequential.Viridis)

# fig.update_layout(title="Parallel Coordinates Plot of Model Metrics")
# fig.show()


## Pipeline for best classifier

In [None]:
# a dataframe object
# seperate x and y
# train_test_split
# put data in pipeline
# pipeline steps - impute, scale, PCA, model

In [53]:
from sklearn.datasets import load_wine
df = load_wine(as_frame=True)
df = df.frame

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [55]:
X = df.iloc[:,0:13]
y = df.iloc[:,13]

In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [65]:
X_train.shape

(142, 13)

In [66]:
trf_pipe1 = ColumnTransformer([
    ('impute1', SimpleImputer(), slice(0, 13))],
    remainder = 'passthrough')

scale_pipe2 = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, 13))
])

pca_pipe3 = ColumnTransformer([
    ('pca', PCA(n_components=3), slice(0, 13))
])

# in ColumnTransformer, we can also use Machine learning Models for imputations like below
# model_pipe4 = ColumnTransformer([
#     ('model', LogisticRegression(), slice(0, 13))
# ])

In [67]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
clf2 = ExtraTreesClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50)
pipe = Pipeline([
    ('step1', trf_pipe1),
    ('step2', scale_pipe2),
    ('step3', pca_pipe3),
    ('step4', clf2)
])

#'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50

In [68]:
pipe.fit(X_train, y_train)

In [69]:
y_pred = pipe.predict(X_test)

In [70]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9444444444444444

In [None]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb')


In [None]:
pickle.load(open('pipe.pkl', 'rb'))