Create data sets with MNIST

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import fetch_openml
mnist = fetch_openml(name='mnist_784', version=1)
X, y = mnist['data'], mnist['target']
y = y.astype(np.int8)
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((60000, 784), (60000,), (10000, 784), (10000,))

Create Pipeline and train linear SVM

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
y_train_even = (y_train % 2 == 0)
y_test_even = (y_test % 2 == 0)
svm_clf = Pipeline([
    ('Scaler', StandardScaler()),
    ('Linear_svc', LinearSVC(C=1, loss='hinge'))
])
svm_clf.fit(X=X_train, y=y_train_even)



Pipeline(steps=[('Scaler', StandardScaler()),
                ('Linear_svc', LinearSVC(C=1, loss='hinge'))])

Evaluating the model

In [3]:
y_pred = svm_clf.predict(X_train)
num_correct = sum(y_pred==y_train_even)
print('train accuracy: ', num_correct / len(y_pred))

train accuracy:  0.9042833333333333


Training polynomial SVM (I was unable to get the pipeline working for this one)

In [4]:
from sklearn.preprocessing import PolynomialFeatures
X, y = mnist['data'], mnist['target']
y = y.astype(np.int8)
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape
polynomial_svm_clf = PolynomialFeatures(degree=3)
polynomial_svm_clf.fit(X=X_train, y=y_train_even)

PolynomialFeatures(degree=3)

Creating pipeline for RBF kernel SVM. I did not train this model because whenever I tried to do so it would cause my JupyterLabs to crash. I'm not sure what I was doing wrong for this one.

In [5]:
from sklearn.svm import SVC
rbf_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))
])
#rbf_kernel_svm_clf.fit(X=X_train, y=y_train_even)

Importing and preparing student_scores data

In [8]:
from zlib import crc32
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

dataset = pd.read_csv('../HW5/hw2_student_scores.csv')
def test_set_check(identifier, test_ratio=0.2):
    total_size = 2**32
    hex_repr = crc32(np.int64(identifier)) & 0xffffffff
    in_test = hex_repr < (test_ratio * total_size)
    return in_test
dataset_with_id = dataset.reset_index()
def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]
train_set, test_set = split_train_test_by_id(data=dataset_with_id, test_ratio=0.2, id_column="index")
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
dataset['scores_cat'] = pd.cut(x=dataset['Scores'], bins=[0, 1.5, 3, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X=dataset, y=dataset['scores_cat']):
    strat_train_set = dataset.loc[train_index]
    strat_test_set = dataset.loc[test_index]
dataset = strat_train_set.drop("Scores", axis=1)
dataset_labels = strat_train_set['Scores'].copy()
for set_ in (strat_train_set, strat_test_set):
    set_.drop('scores_cat', axis=1, inplace=True)  
dataset_num = dataset.drop("Gender", axis=1)
imputer = SimpleImputer(strategy='median')
imputer.fit(dataset_num)
X = imputer.transform(dataset_num)
dataset_tr = pd.DataFrame(data=X, index=dataset_num.index, columns=dataset_num.columns)
dataset_cat = dataset[['Gender']]
ordinal_encoder = OrdinalEncoder()

dataset_cat_encoded = ordinal_encoder.fit_transform(dataset_cat.values)
one_hot_encoder = OneHotEncoder()
dataset_cat_1hot = one_hot_encoder.fit_transform(dataset_cat.values)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

num_attribs = dataset_num.columns.tolist()
cat_attribs = ["Gender"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])
dataset_prepared = full_pipeline.fit_transform(dataset)

Training the model with SVM regression

In [9]:
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(dataset_prepared, dataset_labels)
svm_poly_reg = SVR(kernel='poly', degree=2, C=100, epsilon=0.1, gamma='auto')
svm_poly_reg.fit(dataset_prepared, dataset_labels)

SVR(C=100, degree=2, gamma='auto', kernel='poly')

Predicting the model

In [10]:
some_data = dataset.iloc[:5]
some_labels = dataset_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions: ", svm_poly_reg.predict(some_data_prepared))

Predictions:  [50.71383991 28.91091859 29.67933298 88.007947   33.42321517]
