Wine dataset

In [24]:
import numpy as np
from sklearn.datasets import load_wine
wine=load_wine()

In [3]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(wine['data'],wine['target'],random_state=3009)

In [4]:
from sklearn.svm import SVC

In [5]:
svc=SVC()
svc.fit(x_train,y_train)

SVC()

In [6]:
from sklearn.model_selection import cross_val_score
accuracy=np.mean(cross_val_score(svc,x_train,y_train))
print("Accuracy on training set: ",accuracy)

Accuracy on training set:  0.713960113960114


In [7]:
y_pred=svc.predict(x_test)
test_err_rate=np.mean(y_pred!=y_test)
print("Test error rate on the test set",test_err_rate)
print("Expected test error rate: ",1-accuracy)

Test error rate on the test set 0.35555555555555557
Expected test error rate:  0.28603988603988595


Observations: The test error rate calculated is slightly higher than the expected one.

Create Pipeline:

MinMaxScaler

In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(MinMaxScaler(),SVC())
pipe.fit(x_train,y_train)
print("Pipeline accuracy: ",pipe.score(x_test,y_test))
from sklearn.model_selection import GridSearchCV
param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(x_train,y_train)
print("Best cross-validation accuracy: ",grid.best_score_)
print("Test set score: ",grid.score(x_test,y_test))
print("Best parameters: ",grid.best_params_)

y_pred=pipe.predict(x_test)
print("Test Error Rate: ",np.mean(y_pred!=y_test))

Pipeline accuracy:  1.0
Best cross-validation accuracy:  1.0
Test set score:  0.9777777777777777
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.1}
Test Error Rate:  0.0


StandardScaler

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(StandardScaler(),SVC())
pipe.fit(x_train,y_train)
print("Pipeline accuracy: ",pipe.score(x_test,y_test))
from sklearn.model_selection import GridSearchCV
param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(x_train,y_train)
print("Best cross-validation accuracy: ",grid.best_score_)
print("Test set score: ",grid.score(x_test,y_test))
print("Best parameters: ",grid.best_params_)

y_pred=pipe.predict(x_test)
print("Test Error Rate: ",np.mean(y_pred!=y_test))

Pipeline accuracy:  1.0
Best cross-validation accuracy:  0.9925925925925926
Test set score:  1.0
Best parameters:  {'svc__C': 1, 'svc__gamma': 0.01}
Test Error Rate:  0.0


RobustScaler

In [10]:
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(RobustScaler(),SVC())
pipe.fit(x_train,y_train)
print("Pipeline accuracy: ",pipe.score(x_test,y_test))
from sklearn.model_selection import GridSearchCV
param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(x_train,y_train)
print("Best cross-validation accuracy: ",grid.best_score_)
print("Test set score: ",grid.score(x_test,y_test))
print("Best parameters: ",grid.best_params_)

y_pred=pipe.predict(x_test)
print("Test Error Rate: ",np.mean(y_pred!=y_test))

Pipeline accuracy:  1.0
Best cross-validation accuracy:  1.0
Test set score:  0.9777777777777777
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.01}
Test Error Rate:  0.0


Normalizer

In [11]:
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(Normalizer(),SVC())
pipe.fit(x_train,y_train)
print("Pipeline accuracy: ",pipe.score(x_test,y_test))
from sklearn.model_selection import GridSearchCV
param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(x_train,y_train)
print("Best cross-validation accuracy: ",grid.best_score_)
print("Test set score: ",grid.score(x_test,y_test))
print("Best parameters: ",grid.best_params_)

y_pred=pipe.predict(x_test)
print("Test Error Rate: ",np.mean(y_pred!=y_test))

Pipeline accuracy:  0.5111111111111111
Best cross-validation accuracy:  0.9544159544159545
Test set score:  0.8666666666666667
Best parameters:  {'svc__C': 100, 'svc__gamma': 100}
Test Error Rate:  0.4888888888888889


Conclusion: Normalizer works the best because it is less likely to overfit.

USPS dataset

In [16]:
usps_train_data=np.genfromtxt("zip.train",delimiter=" ",usecols=np.arange(1,257))
usps_train_target=np.genfromtxt("zip.train",delimiter=" ",usecols=0,dtype='int')
usps_test_data=np.genfromtxt("zip.test",delimiter=" ",usecols=np.arange(1,257))
usps_test_target=np.genfromtxt("zip.test",delimiter=" ",usecols=0,dtype='int')
print(usps_train_data.shape,usps_train_target.shape,usps_test_data.shape,usps_test_target.shape)

(7291, 256) (7291,) (2007, 256) (2007,)


In [17]:
from sklearn.svm import SVC

In [18]:
svc=SVC()
svc.fit(usps_train_data,usps_train_target)

SVC()

In [19]:
from sklearn.model_selection import cross_val_score
accuracy=np.mean(cross_val_score(svc,usps_train_data,usps_train_target))
print("Accuracy on training set: ",accuracy)

Accuracy on training set:  0.9758604414583903


In [20]:
y_pred=svc.predict(usps_test_data)
test_err_rate=np.mean(y_pred!=usps_test_target)
print("Test error rate on the test set",test_err_rate)
print("Expected test error rate: ",1-accuracy)

Test error rate on the test set 0.05281514698555057
Expected test error rate:  0.02413955854160965


Observations: The test error rate calculated is slightly higher than the expected one.

Create Pipeline:

MinMaxScaler

In [29]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
usps_pipe=make_pipeline(MinMaxScaler(),SVC())
usps_pipe.fit(usps_train_data,usps_train_target)
print("Pipeline accuracy: ",usps_pipe.score(usps_test_data,usps_test_target))
from sklearn.model_selection import GridSearchCV
usps_param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
usps_grid=GridSearchCV(usps_pipe,param_grid=usps_param_grid,cv=5)
usps_grid.fit(usps_train_data,usps_train_target)
print("Best cross-validation accuracy: ",usps_grid.best_score_)
print("Test set score: ",usps_grid.score(usps_test_data,usps_test_target))
print("Best parameters: ",usps_grid.best_params_)

usps_y_pred=usps_pipe.predict(usps_test_data)
print("Test Error Rate: ",np.mean(usps_y_pred!=usps_test_target))

Pipeline accuracy:  0.9471848530144494
Best cross-validation accuracy:  0.9744885113072353
Test set score:  0.9481813652217239
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.01}
Test Error Rate:  0.05281514698555057


StandardScaler

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
usps_pipe=make_pipeline(StandardScaler(),SVC())
usps_pipe.fit(usps_train_data,usps_train_target)
print("Pipeline accuracy: ",usps_pipe.score(usps_test_data,usps_test_target))
from sklearn.model_selection import GridSearchCV
usps_param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
usps_grid=GridSearchCV(usps_pipe,param_grid=usps_param_grid,cv=5)
usps_grid.fit(usps_train_data,usps_train_target)
print("Best cross-validation accuracy: ",usps_grid.best_score_)
print("Test set score: ",usps_grid.score(usps_test_data,usps_test_target))
print("Best parameters: ",usps_grid.best_params_)

usps_y_pred=usps_pipe.predict(usps_test_data)
print("Test Error Rate: ",np.mean(usps_y_pred!=usps_test_target))

Pipeline accuracy:  0.9377179870453413
Best cross-validation accuracy:  0.9714710547371173
Test set score:  0.9446935724962631
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.001}
Test Error Rate:  0.0622820129546587


RobustScaler

In [33]:
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
usps_pipe=make_pipeline(RobustScaler(),SVC())
usps_pipe.fit(usps_train_data,usps_train_target)
print("Pipeline accuracy: ",usps_pipe.score(usps_test_data,usps_test_target))
from sklearn.model_selection import GridSearchCV
usps_param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
usps_grid=GridSearchCV(usps_pipe,param_grid=usps_param_grid,cv=5)
usps_grid.fit(usps_train_data,usps_train_target)
print("Best cross-validation accuracy: ",usps_grid.best_score_)
print("Test set score: ",usps_grid.score(usps_test_data,usps_test_target))
print("Best parameters: ",usps_grid.best_params_)

usps_y_pred=usps_pipe.predict(usps_test_data)
print("Test Error Rate: ",np.mean(usps_y_pred!=usps_test_target))

Pipeline accuracy:  0.8191330343796711
Best cross-validation accuracy:  0.8964433425378264
Test set score:  0.8729446935724963
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}
Test Error Rate:  0.18086696562032886


Normalizer

In [35]:
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
usps_pipe=make_pipeline(Normalizer(),SVC())
usps_pipe.fit(usps_train_data,usps_train_target)
print("Pipeline accuracy: ",usps_pipe.score(usps_test_data,usps_test_target))
from sklearn.model_selection import GridSearchCV
usps_param_grid={'svc__C':[0.01,0.1,1,10,100],
           'svc__gamma':[0.001,0.01,0.1,1,10,100]}
usps_grid=GridSearchCV(usps_pipe,param_grid=usps_param_grid,cv=5)
usps_grid.fit(usps_train_data,usps_train_target)
print("Best cross-validation accuracy: ",usps_grid.best_score_)
print("Test set score: ",usps_grid.score(usps_test_data,usps_test_target))
print("Best parameters: ",usps_grid.best_params_)

usps_y_pred=usps_pipe.predict(usps_test_data)
print("Test Error Rate: ",np.mean(usps_y_pred!=usps_test_target))

Pipeline accuracy:  0.9466865969108121
Best cross-validation accuracy:  0.9784659993174196
Test set score:  0.9491778774289985
Best parameters:  {'svc__C': 10, 'svc__gamma': 1}
Test Error Rate:  0.05331340308918784


Conclusion: MinMaxScaler works the best for this dataset because it has higher accuracy.