Objective: to compare 3 models (SVM, RandomForest, MLP) systematically and determine the best one to predict the target in the wine dataset.
- Author: Jai Fadia
- Python 3.7.7 64-bit Anaconda

In [1]:
# import libraries
import pandas as pd
import scipy.stats
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
import sklearn.datasets

### Step 1: Import dataset

In [2]:
# download dataset
data = sklearn.datasets.load_wine(as_frame = True)['frame']

In [3]:
# view data
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [4]:
# check for null values
data.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

### Step 2: Split dependent and independent variables, split into training and testing sets

In [5]:
# split target and predictor variables
x = data.drop(labels = 'target', axis = 1)
y = data['target']

In [6]:
# split into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

### Step 3: Create pipeline for all 3 models

In [7]:
# create a dictionary in which to store inputs to pipeline for each model
pipe = {}

In [8]:
# store SVM inputs
pipe['svm'] = {}

# use the standardscaler to scale variables from the dataframe and then apply the SVC classifier
pipe['svm']['steps'] = [('scaler', StandardScaler()), ('SVM', SVC())]
pipe['svm']['pipe'] = Pipeline(pipe['svm']['steps'])

# using RandomSearchCV to determine the optimial parameters, so specify the distributions of the parameters
pipe['svm']['parameters'] = {
    'SVM__C' : scipy.stats.expon(scale = 0.1),
    'SVM__gamma' : scipy.stats.expon(scale = 0.1)
    }

# create the model object and specify the number of iterations to test
pipe['svm']['model'] = RandomizedSearchCV(pipe['svm']['pipe'],
    param_distributions = pipe['svm']['parameters'],
    n_iter = 10000)

In [9]:
# store RandomForest inputs
pipe['rf'] = {}

# use the standardscaler to scale variables from the dataframe and then apply the RandomForestClassifier
pipe['rf']['steps'] = [('scaler', StandardScaler()), ('RF', RandomForestClassifier())]
pipe['rf']['pipe'] = Pipeline(pipe['rf']['steps'])

# using RandomSearchCV to determine the optimial parameters, so specify the distributions of the parameters
pipe['rf']['parameters'] = {
    'RF__max_depth' : scipy.stats.randint.rvs(low = 1, high = 1000, size = 1),
    'RF__n_estimators' : scipy.stats.randint.rvs(low = 1, high = 1000, size = 1),
    'RF__min_samples_split' : scipy.stats.randint.rvs(low = 1, high = 100, size = 1),
    'RF__min_samples_leaf' : scipy.stats.randint.rvs(low = 1, high = 100, size = 1),
    'RF__bootstrap' : scipy.stats.randint.rvs(low = 0, high = 1, size = 1)
}

# create the model object and specify the number of iterations to test
pipe['rf']['model'] = RandomizedSearchCV(pipe['rf']['pipe'],
    param_distributions = pipe['rf']['parameters'],
    n_iter = 10000)

In [10]:
# store MLP inputs
pipe['mlp'] = {}

# use the standardscaler to scale variables from the dataframe and then apply the MLPClassifier
pipe['mlp']['steps'] = [('scaler', StandardScaler()), ('MLP', MLPClassifier())]
pipe['mlp']['pipe'] = Pipeline(pipe['mlp']['steps'])

# using RandomSearchCV to determine the optimial parameters, so specify the distributions of the parameters
# randomizing the architecture of the neural network using varying depths and number of neurons per layer
pipe['mlp']['parameters'] = {
    'MLP__activation' : ['relu', 'logistic', 'tanh'],
    'MLP__alpha' : scipy.stats.expon(scale = 0.0001),
    'MLP__hidden_layer_sizes' : scipy.stats.randint.rvs(low = 1, high = 10, size = scipy.stats.randint.rvs(low = 1, high = 10, size = 1))
}

# create the model object and specify the number of iterations to test
pipe['mlp']['model'] = RandomizedSearchCV(pipe['mlp']['pipe'],
    param_distributions = pipe['mlp']['parameters'],
    n_iter = 10000)

### Step 4: Train models and evaluate

In [11]:
# train SVM, evaluate its performance on the testing set, and output the score and optimal parameters
pipe['svm']['model'].fit(x_train, y_train)

print('Score:\t\t{}'.format(pipe['svm']['model'].score(x_test, y_test)))
print(pipe['svm']['model'].best_params_)

Score:		0.9722222222222222
{'SVM__C': 0.2706563239598662, 'SVM__gamma': 0.03130441834741778}


In [12]:
# train RF, evaluate its performance on the testing set, and output the score and optimal parameters
pipe['rf']['model'].fit(x_train, y_train)

print('Score:\t\t{}'.format(pipe['rf']['model'].score(x_test, y_test)))
print(pipe['rf']['model'].best_params_)

Score:		0.9166666666666666
{'RF__n_estimators': 403, 'RF__min_samples_split': 43, 'RF__min_samples_leaf': 12, 'RF__max_depth': 505, 'RF__bootstrap': 0}


In [13]:
# train MLP, evaluate its performance on the testing set, and output the score and optimal parameters
pipe['mlp']['model'].fit(x_train, y_train)

print('Score:\t\t{}'.format(pipe['mlp']['model'].score(x_test, y_test)))
print(pipe['mlp']['model'].best_params_)

Score:		0.75
{'MLP__activation': 'relu', 'MLP__alpha': 6.106042670091599e-05, 'MLP__hidden_layer_sizes': 7}


### Conclusion
The SVM model with C = ~0.27 and gamma = ~0.03 is the optimal model for this use case.