In [17]:
# 1. Import things

import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest



In [18]:
# 2. Load the data
#
# Fetch the "diamonds" example dataset through seaborn and end the cell with
# the per-column dtypes so the categorical columns are easy to spot.
data = sns.load_dataset("diamonds")
data.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [19]:
# Replace the three categorical columns with numeric codes, in place on `data`.
# NOTE(review): with its default settings OrdinalEncoder numbers the categories
# in sorted order of the observed values, not in the order declared on the
# pandas categorical dtype - confirm whether an explicit `categories=` argument
# is wanted for these columns.
to_change = ["cut", "color", "clarity"]
enc = OrdinalEncoder()
encoded = enc.fit_transform(data[to_change])
data[to_change] = encoded

In [20]:
# Feature columns used as model inputs: every column except the target "cut".
X_columns = [
    "carat",
    "color", "clarity",
    "depth", "table",
    "price",
    "x", "y", "z",
]

In [21]:
# Work on a random 2000-row subsample (presumably to keep model fitting fast).
# The fixed random_state makes the draw - and therefore every score computed
# from it - identical on each Restart & Run All; without it each run picks
# different rows and reports different metrics.
data = data.sample(n=2000, random_state=42)

# Target is the encoded "cut" column; features are the columns in X_columns.
y = data["cut"]
X = data[X_columns]

X

Unnamed: 0,carat,color,clarity,depth,table,price,x,y,z
29064,0.40,1.0,2.0,63.3,56.0,687,4.68,4.74,2.98
540,0.70,4.0,6.0,62.3,55.0,2827,5.66,5.70,3.54
3407,0.33,0.0,2.0,62.0,56.0,567,4.44,4.47,2.76
36267,0.30,1.0,4.0,61.6,57.0,936,4.33,4.30,2.66
50348,0.90,1.0,0.0,62.4,63.0,2245,6.13,6.01,3.79
...,...,...,...,...,...,...,...,...,...
45058,0.58,1.0,2.0,60.7,61.0,1641,5.47,5.27,3.28
8595,1.03,1.0,3.0,62.5,56.0,4441,6.45,6.48,4.04
48467,0.51,0.0,5.0,60.1,59.0,1977,5.19,5.16,3.11
40997,0.50,4.0,2.0,63.1,59.0,1181,5.02,4.96,3.15


In [22]:
# Evaluation settings shared by the cross-validation runs.
# The target is single-label multiclass, so the micro-averaged F1 / precision /
# recall are expected to coincide with plain accuracy.
cv = StratifiedKFold(n_splits=5)
scoring = [
    "accuracy",
    "f1_micro",
    "precision_micro",
    "recall_micro",
]

# Baseline model: standardise the features, then fit a linear-kernel SVM.
scaler = StandardScaler()
model = SVC(kernel="linear")
pipeline = make_pipeline(scaler, model)

In [23]:
# Cross-validate the baseline pipeline, keeping train-fold scores as well so
# train and test metrics can be compared side by side.
cv_output = cross_validate(
    pipeline,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
)

# One row per fold; show the mean of every column across folds.
results = pd.DataFrame(cv_output)
results.mean()

fit_time                 0.094368
score_time               0.015605
test_accuracy            0.651500
train_accuracy           0.660375
test_f1_micro            0.651500
train_f1_micro           0.660375
test_precision_micro     0.651500
train_precision_micro    0.660375
test_recall_micro        0.651500
train_recall_micro       0.660375
dtype: float64

In [25]:
# Nested cross-validation: GridSearchCV tunes the pipeline on inner folds,
# while cross_validate scores the tuned search object on the outer folds (`cv`).

# Pipeline steps. The `k` and `kernel` values set here are placeholders that
# the grid below overrides for every candidate. `k="all"` is used instead of
# the library default of 10 because only 9 feature columns are passed in, so
# k=10 would be invalid if this pipeline were ever fitted directly.
select = SelectKBest(k="all")
scaler = StandardScaler()
model = SVC(kernel="linear")
pipeline = make_pipeline(scaler, select, model)

# Keys follow make_pipeline's auto-generated step names
# ("<lowercased class name>__<parameter>").
param_grid = {
    "svc__C": [0.01, 0.1, 1, 10, 100],
    "svc__kernel": ["linear", "rbf"],
    "selectkbest__k": [3, 6, 9]
}

# Inner loop: exhaustive search over param_grid with 5-fold CV.
estimator = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5
)

# Outer loop: score the whole search procedure; return_estimator=True keeps
# the fitted search object of every outer fold in an "estimator" column.
results = cross_validate(
    estimator=estimator, X=X, y=y, scoring=scoring, cv=cv,
    return_train_score=True, return_estimator=True)
results = pd.DataFrame(results)

# The "estimator" column holds Python objects, so average the numeric columns
# only: pandas >= 2.0 no longer drops non-numeric columns silently and a bare
# .mean() would raise a TypeError on this frame.
results.mean(numeric_only=True)

  results.mean()


fit_time                 31.101545
score_time                0.035443
test_accuracy             0.737500
train_accuracy            0.805250
test_f1_micro             0.737500
train_f1_micro            0.805250
test_precision_micro      0.737500
train_precision_micro     0.805250
test_recall_micro         0.737500
train_recall_micro        0.805250
dtype: float64