In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import datasets

In [None]:
# Seed NumPy's global random number generator so the notebook's random steps repeat from run to run.
np.random.seed(1)

Load the first dataframe

In [None]:
# Read the first dataset from a pickle file; the cells below use its 'f1', 'f2' and 'label' columns.
# NOTE(review): unpickling can execute arbitrary code — only load 'data1.pkl' from a trusted source.
data1 = pd.read_pickle('data1.pkl')

Plot the data points using `seaborn`

In [None]:
# Scatter the two features of the first dataset, colouring every point by its label.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
    x='f1',
    y='f2',
    hue='label',
    data=data1,
    ax=ax,
)

Split the data into features and labels:

In [None]:
# Feature matrix: one row per sample, columns taken from 'f1' and 'f2'.
X1 = data1.loc[:, ['f1', 'f2']].values
# Target vector: the values of the 'label' column.
y1 = data1.loc[:, 'label'].values

Train a linear-kernel SVM classifier to learn the separation:

In [None]:
# Fit a support vector classifier with a linear kernel on the first dataset.
svc_linear = SVC(kernel='linear').fit(X1, y1)

Plot decision boundary

In [None]:
# Show which side of the linear SVM's decision boundary each region of the feature plane falls on.
fig, ax = plt.subplots(figsize=(12, 10))

# Bounding box of the samples along the two feature columns of X1.
x_min, x_max = X1[:, 0].min(), X1[:, 0].max()
y_min, y_max = X1[:, 1].min(), X1[:, 1].max()

# 300 x 300 grid over the bounding box (a complex "step" in mgrid is a point count, endpoints included).
XX, YY = np.mgrid[x_min:x_max:300j, y_min:y_max:300j]

# Decision value at every grid node, folded back into the grid's shape.
Z = svc_linear.decision_function(
    np.column_stack((XX.ravel(), YY.ravel()))
).reshape(XX.shape)

# Shade the plane by the sign of the decision function, then overlay the samples.
ax.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
sns.scatterplot(x='f1', y='f2', hue='label', data=data1, ax=ax)

We do the same for the second dataset, this time using an RBF (Gaussian) kernel

In [None]:
# Load the second dataset and separate it into a feature matrix and a label vector.
# NOTE(review): unpickling can execute arbitrary code — only load 'data2.pkl' from a trusted source.
data2 = pd.read_pickle('data2.pkl')

X2 = data2.loc[:, ['f1', 'f2']].values
y2 = data2.loc[:, 'label'].values

In [None]:
# Scatter the two features of the second dataset, colouring every point by its label.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
    x='f1',
    y='f2',
    hue='label',
    data=data2,
    ax=ax,
)

In [None]:
# Fit a support vector classifier with an RBF (Gaussian) kernel on the second dataset.
# gamma='scale' lets scikit-learn derive the kernel coefficient from the training features.
svc_gaussian = SVC(kernel='rbf', gamma='scale').fit(X2, y2)

In [None]:
# Show which side of the RBF SVM's decision boundary each region of the feature plane falls on.
fig, ax = plt.subplots(figsize=(12, 10))

# Bounding box of the samples along the two feature columns of X2.
x_min, x_max = X2[:, 0].min(), X2[:, 0].max()
y_min, y_max = X2[:, 1].min(), X2[:, 1].max()

# 300 x 300 grid over the bounding box (a complex "step" in mgrid is a point count, endpoints included).
XX, YY = np.mgrid[x_min:x_max:300j, y_min:y_max:300j]

# Decision value at every grid node, folded back into the grid's shape.
Z = svc_gaussian.decision_function(
    np.column_stack((XX.ravel(), YY.ravel()))
).reshape(XX.shape)

# Shade the plane by the sign of the decision function, then overlay the samples.
ax.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
sns.scatterplot(x='f1', y='f2', hue='label', data=data2, ax=ax)

A walkthrough of grid search using the Iris dataset

In [None]:
# Load the Iris dataset bundled with scikit-learn as a feature matrix and a target vector.
iris = datasets.load_iris()
X = iris['data']
y = iris['target']

# Hold out a test set (scikit-learn's default split is 75% train / 25% test).
# random_state is pinned so that re-running this cell on its own yields the same
# split; without it the split depends on how much of NumPy's global RNG stream
# has already been consumed by earlier cell executions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Manual 4-fold cross-validation: for every kernel, train on three folds of the
# training set, score on the remaining fold, and print the mean of the fold scores.
kernels = ['linear', 'rbf', 'sigmoid']
n_folds = 4
n_samples = len(X_train)
# Derive the fold size from the data instead of hard-coding it. Any remainder
# (n_samples % n_folds) rows are never held out and always stay in training.
fold_size = n_samples // n_folds
all_idx = np.arange(n_samples)

for kernel in kernels:
    scores = []
    for fold in range(n_folds):
        # Contiguous slice of the training set held out as validation data
        held_out = (all_idx >= fold * fold_size) & (all_idx < (fold + 1) * fold_size)
        X_cv_train, y_cv_train = X_train[~held_out], y_train[~held_out]
        X_cv_test, y_cv_test = X_train[held_out], y_train[held_out]
        classifier = SVC(C=10, kernel=kernel, gamma='scale').fit(X_cv_train, y_cv_train)
        scores.append(classifier.score(X_cv_test, y_cv_test))
    print(kernel, np.mean(scores))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Let GridSearchCV run the same kernel comparison with 4-fold cross-validation.
# The former `iid=False` argument is omitted: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Unweighted averaging of
# the fold scores (what iid=False requested) is the only behaviour in those releases.
parameters = {'kernel': ('linear', 'rbf', 'sigmoid')}
svc = SVC(gamma="scale", C=10)
clf = GridSearchCV(svc, parameters, cv=4)
clf.fit(X_train, y_train)

In [None]:
# Show the estimator that the grid search refit using the best-scoring kernel.
print(clf.best_estimator_)

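We retrieve the same best kernel as in the manual cross-validation loop above (the individual fold scores can differ, because `GridSearchCV` uses stratified folds for classifiers)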

We can check the score on the held-out test data:

In [None]:
# Score the refit best estimator on the held-out split; X_test / y_test were not passed to clf.fit.
clf.score(X_test, y_test)