### Import Libraries

In [237]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### Import Dataset

In [238]:
# Read the dataset from the CSV file in the working directory
df = pd.read_csv("juice.csv")

# Preview every 40th row
df.iloc[::40]

Unnamed: 0,class_label,class_name,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1,Barolo,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
40,1,Barolo,13.56,1.71,2.31,16.2,117,3.15,3.29,0.34,2.34,6.13,0.95,3.38,795
80,2,Grignolino,12.0,0.92,2.0,19.0,86,2.42,2.26,0.3,1.43,2.5,1.38,3.12,278
120,2,Grignolino,11.45,2.4,2.42,20.0,96,2.9,2.79,0.32,1.83,3.25,0.8,3.39,625
160,3,Barbera,12.36,3.83,2.38,21.0,88,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520


In [239]:
def label(x):
    """Re-encode a class label as 0, 1 or 2.

    Returns 0 when ``x == 1``, 1 when ``x == 2``, and 2 for every other value.
    """
    if x == 1:
        return 0
    return 1 if x == 2 else 2
    
# Re-encode the class labels element-wise with label().
# NOTE: this overwrites the column in place, so re-running the cell would
# remap the already-remapped labels (label(0) -> 2, label(1) -> 0, ...).
df["class_label"] = df["class_label"].map(label)

# Preview every 40th row to check the new encoding
df.iloc[::40]

Unnamed: 0,class_label,class_name,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,0,Barolo,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
40,0,Barolo,13.56,1.71,2.31,16.2,117,3.15,3.29,0.34,2.34,6.13,0.95,3.38,795
80,1,Grignolino,12.0,0.92,2.0,19.0,86,2.42,2.26,0.3,1.43,2.5,1.38,3.12,278
120,1,Grignolino,11.45,2.4,2.42,20.0,96,2.9,2.79,0.32,1.83,3.25,0.8,3.39,625
160,2,Barbera,12.36,3.83,2.38,21.0,88,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520


In [240]:
# Features: the two columns used as model inputs
X = df.loc[:, ["alcohol", "malic_acid"]]

# Target: the re-encoded class label
y = df.loc[:, "class_label"]

**Linear SVM**

In [241]:
# Hold out 30% of the rows for testing (fixed seed so the split is repeatable)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a linear-kernel SVM on the training portion
svm = SVC(kernel="linear")
svm.fit(x_train, y_train)

# Predict the held-out rows and show the first 20 side by side with the truth
yhat = svm.predict(x_test)
pd.DataFrame({"Actual": y_test, "Predicted": yhat}).head(20)

Unnamed: 0,Actual,Predicted
19,0,2
45,0,2
140,2,2
30,0,0
67,1,1
16,0,0
119,1,1
174,2,2
109,1,1
141,2,0


**Score**

In [242]:
# Fraction of held-out rows whose predicted class equals the actual class
score = accuracy_score(y_test, yhat)

# Use the built-in round() rather than score.round(): depending on the
# scikit-learn version accuracy_score may return a plain Python float, which
# has no .round() method. round() works for Python floats and NumPy scalars.
print("Accuracy Score of Model is =", round(score, 2))

Accuracy Score of Model is = 0.74


In [243]:
# Classify one new sample. The model was fitted on a DataFrame, so the sample
# is passed as a DataFrame with the same column names; a bare list makes
# scikit-learn emit a feature-name mismatch warning, which the global
# warnings filter in this notebook would hide.
svm.predict(pd.DataFrame([[11.46, 1.46]], columns=["alcohol", "malic_acid"]))

array([1], dtype=int64)

**Make a new data set keeping only the support vectors**

In [244]:
# Rebuild the feature matrix and target from the full dataset
X = df.loc[:, ["alcohol", "malic_acid"]]
y = df.loc[:, "class_label"]

In [245]:
# Fit a linear-kernel SVM on every example (no train/test split here)
svm = SVC(kernel="linear")
svm.fit(X, y)

# Compare the dataset size with the number of support vectors the fit selected
print("Number of original examples", X.shape[0], "\n")

print("Number of support vectors", svm.support_.shape[0])

Number of original examples 178 

Number of support vectors 81


In [246]:
# Keep only the rows the fitted SVM selected as support vectors
# (svm.support_ holds their positional indices)
X_small, y_small = X.iloc[svm.support_], y.iloc[svm.support_]

# Fit a fresh linear SVM on the support vectors alone
svm_small = SVC(kernel="linear")
svm_small.fit(X_small, y_small)

# Predict those same rows and preview every 10th one
yhat = svm_small.predict(X_small)
pd.DataFrame({"Actual": y_small, "Predicted": yhat}).iloc[::10]

Unnamed: 0,Actual,Predicted
1,0,0
35,0,0
50,0,1
79,1,2
110,1,1
132,2,1
144,2,2
161,2,2
177,2,2


In [247]:
# Classify the same new sample with the model trained on support vectors only.
# Passed as a DataFrame with the training column names so scikit-learn's
# feature-name check is satisfied instead of raising a (suppressed) warning.
svm_small.predict(pd.DataFrame([[11.46, 1.46]], columns=["alcohol", "malic_acid"]))

array([1], dtype=int64)

In [248]:
# Accuracy of the reduced model on the support vectors it was fitted on
score = accuracy_score(y_true=y_small, y_pred=yhat)
score

0.5185185185185185

### Kernel SVMs

In [249]:
# Feature matrix and target for the kernel SVM section
X = df.loc[:, ["alcohol", "malic_acid"]]
y = df.loc[:, "class_label"]

In [250]:
# Candidate gamma values to try, spanning five orders of magnitude
parameters = {"gamma": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}

# SVC with its default settings (intended as an RBF SVM)
svm = SVC()

# Cross-validated search over the gamma grid, fitted on the full dataset
searcher = GridSearchCV(svm, parameters)
searcher.fit(X, y)

# Show which gamma won the cross-validation
print("Best CV params", searcher.best_params_)

Best CV params {'gamma': 0.1}


In [251]:
# Predict on the same X the search was fitted on
yhat = searcher.predict(X)

# Actual vs. predicted labels, every 20th row
pd.DataFrame({"Actual": y, "Predicted": yhat}).iloc[::20]

Unnamed: 0,Actual,Predicted
0,0,0
20,0,0
40,0,0
60,1,1
80,1,1
100,1,1
120,1,1
140,2,2
160,2,2


In [252]:
# Accuracy on the data the search was fitted on (not a held-out estimate)
score = accuracy_score(y_true=y, y_pred=yhat)
score

0.8314606741573034

### Jointly tuning gamma and C with GridSearchCV

In [253]:
# Feature matrix and target for the joint C / gamma search
X = df.loc[:, ["alcohol", "malic_acid"]]
y = df.loc[:, "class_label"]

In [254]:
# Hold out 25% of the rows so the tuned model can be checked on unseen data
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Grid of C and gamma values; every combination is cross-validated
parameters = {
    "C": [0.1, 1, 10],
    "gamma": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}

# SVC with its default settings (intended as an RBF SVM)
svm = SVC()

# Run the search on the training portion only
searcher = GridSearchCV(svm, parameters)
searcher.fit(x_train, y_train)

# Winning combination and its mean cross-validated score
print("Best CV params", searcher.best_params_, "\n")

print("Best CV accuracy", searcher.best_score_)

Best CV params {'C': 10, 'gamma': 0.01} 

Best CV accuracy 0.8185185185185185


In [255]:
# Report the held-out test accuracy using the best parameters found.
# Built-in round() is used instead of .round(): depending on the scikit-learn
# version score() may return a plain Python float, which has no .round() method.
print("Test accuracy of best grid search hypers:", round(searcher.score(x_test, y_test), 2))

Test accuracy of best grid search hypers: 0.76


In [256]:
# Predict the held-out rows with the tuned model
yhat = searcher.predict(x_test)

# First 20 actual vs. predicted labels
pd.DataFrame({"Actual": y_test, "Predicted": yhat}).head(20)

Unnamed: 0,Actual,Predicted
19,0,2
45,0,2
140,2,2
30,0,0
67,1,1
16,0,0
119,1,1
174,2,2
109,1,1
141,2,0


### SGD Classifier

In [264]:
# Feature matrix and target for the SGD classifier section
X = df.loc[:, ["alcohol", "malic_acid"]]
y = df.loc[:, "class_label"]

In [267]:
# We set random_state=0 for reproducibility
linear_classifier = SGDClassifier(random_state = 0)

# Hyperparameter grid: regularisation strength, loss function and penalty type.
# "log_loss" is the logistic-regression loss. Its old alias "log" was
# deprecated in scikit-learn 1.1 and removed in 1.3; on newer versions every
# "log" candidate fails to fit, and because this notebook suppresses warnings
# the failure would go unnoticed. ("log_loss" requires scikit-learn >= 1.1.)
parameters = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
              "loss": ["hinge", "log_loss"],
              "penalty": ["l1", "l2"]}

# 3-fold cross-validated search over the grid
searcher = GridSearchCV(linear_classifier, parameters, cv = 3)

# Hold out 30% of the rows for the final test
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

searcher.fit(x_train, y_train)

# Report the best parameters, their mean CV score, and the held-out test accuracy
print("Best CV params", searcher.best_params_, "\n")
print("Best CV accuracy", searcher.best_score_, "\n")
print("Test Accuracy of Best Grid Search Hypers:", searcher.score(x_test, y_test))

Best CV params {'alpha': 1, 'loss': 'hinge', 'penalty': 'l2'} 

Best CV accuracy 0.59465737514518 

Test Accuracy of Best Grid Search Hypers: 0.5
