In [3]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import pandas as pd
import os
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(42)

In [2]:
# 8. Train a LinearSVC on a linearly separable dataset. Then train an SVC and a SGDClassifier on the same dataset.
#    See if you can get them to produce roughly the same model

In [3]:
# create a linearly separable dataset
# for this example, I will create a positive class (y = 1) centered around (1, 1)
# and a negative class (y = 0) centered around (-1, -1)
# each instance has random uniform error, on each axis, in the interval [-.5, .5)
# each class has 100 instances

In [4]:
X_positive = rng.random((100, 2)) + .5
y_positive = np.full((100), 1)

X_negative = -1 * rng.random((100, 2)) - .5
y_negative = np.full((100), 0)

In [5]:
# brief validation to show that there are no values in either column of X_positive that are negative
# and there are no values in either column of X_negative that are positive
# which guarantees that the dataset is linearly separable

In [6]:
X_positive[(X_positive[:, 0] < 0) | (X_positive[:, 1] < 0)]

array([], shape=(0, 2), dtype=float64)

In [7]:
X_negative[(X_negative[:, 0] > 0) | (X_negative[:, 1] > 0)]

array([], shape=(0, 2), dtype=float64)

In [8]:
# brief validation to show that X_positive and X_negative are centered around (1, 1) and (-1, -1), respectively

In [9]:
X_positive.mean(axis=0)

array([0.9700751 , 1.01766401])

In [10]:
X_negative.mean(axis=0)

array([-0.99559213, -1.00285423])

In [11]:
# glue the datasets together, then shuffle

In [12]:
X = np.append(X_positive, X_negative, axis=0)
y = np.append(y_positive, y_negative, axis=0)

X, y = shuffle(X, y, random_state=42)

In [13]:
# briefly verify that training instances and labels were shuffled correctly
# i.e. positive instances assigned to positive class and negative to negative

In [14]:
((np.apply_along_axis(sum, 1, X) >= 0).astype(int) == y)[False]

array([], shape=(0, 200), dtype=bool)

In [15]:
# normally I would create a test set and perform cross-validation but for this example I only care about
# the similarity of the trained models rather than their performance and generalizability.
# and besides, I know the exact logic of how the dataset was created because I created it,
# which is the ultimate form of data snooping

# so I will just train the models and see how similar I can make them

In [16]:
# standard scale features

In [17]:
ss = StandardScaler()
X_scaled = ss.fit_transform(X)

In [18]:
# train models

In [19]:
linear_svc = LinearSVC(loss='hinge')
linear_svc.fit(X_scaled, y)
print(f'intercept : {linear_svc.intercept_} \nweights : {linear_svc.coef_}')

intercept : [0.00287337] 
weights : [[0.86910295 0.97667909]]


In [20]:
svc = SVC(kernel='linear')
svc.fit(X_scaled, y)
print(f'intercept : {svc.intercept_} \nweights : {svc.coef_}')

intercept : [0.00287456] 
weights : [[0.86905621 0.97672094]]


In [21]:
sgd_classifier = SGDClassifier(alpha=.005, tol=.0001, max_iter=1_000_000, n_iter_no_change=100_000, random_state=42)
sgd_classifier.fit(X_scaled, y)
print(f'intercept : {sgd_classifier.intercept_} \nweights : {sgd_classifier.coef_}')

intercept : [0.00286905] 
weights : [[0.86912896 0.97665493]]


In [22]:
#################################################################################################################

In [23]:
# 9. Train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers,
#    you will need to use one-versus-the-rest to classify all 10 digits. You may want to tune the
#    hyperparameters using small validation sets speed up the process. What accuracy can you reach?

In [2]:
#################################################################################################################

In [4]:
# 10. Train an SVM regressor on the California housing dataset

In [5]:
# load data

In [6]:
raw_data = pd.read_csv('housing.csv')

In [7]:
raw_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
# add a column to raw_data that assigns which decile of median_house_value that instance belongs to

# note that this column is created to allow stratified sampling of the label in the creation of our test set.
# because the deciles are calculated on the entire dataset (train and test),
# it is important that we drop the column as soon as the train/test sets are formed.
# this column should NOT be treated as an input feature because, having been calculated on the full dataset,
# it is data snooping and will likely not generalize well to new data if used to train a model.
# the column exists purely to ensure a wide variety of labels exists in both the train and test sets

In [9]:
decile_1 = raw_data['median_house_value'].quantile(q=.1)
decile_2 = raw_data['median_house_value'].quantile(q=.2)
decile_3 = raw_data['median_house_value'].quantile(q=.3)
decile_4 = raw_data['median_house_value'].quantile(q=.4)
decile_5 = raw_data['median_house_value'].quantile(q=.5)
decile_6 = raw_data['median_house_value'].quantile(q=.6)
decile_7 = raw_data['median_house_value'].quantile(q=.7)
decile_8 = raw_data['median_house_value'].quantile(q=.8)
decile_9 = raw_data['median_house_value'].quantile(q=.9)

decile_bins = [-np.inf, decile_1, decile_2, decile_3, decile_4, decile_5, decile_6, decile_7, decile_8, decile_9, np.inf]

raw_data['decile'] = pd.cut(raw_data['median_house_value'], bins=decile_bins, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [10]:
# split into training and test sets, drop decile column, and separate into features and labels

In [11]:
stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
train_test_split = stratified_splitter.split(raw_data, raw_data['decile'])

train_set = None
test_set = None

for train_indices, test_indices in train_test_split:
    train_set = raw_data.loc[train_indices]
    test_set = raw_data.loc[test_indices]

train_labels = train_set['median_house_value'].copy()
test_labels = train_set['median_house_value'].copy()

train_features = train_set.drop(columns=['median_house_value', 'decile'])
test_features = test_set.drop(columns=['median_house_value', 'decile'])

In [12]:
# create variables for column names and identify indices for columns used in added numerical features

In [13]:
rooms_string = 'total_rooms'
bedrooms_string = 'total_bedrooms'
population_string = 'population'
households_string = 'households'
ocean_proximity_string = 'ocean_proximity'

train_features_column_names = list(train_features.columns)

rooms_index = train_features_column_names.index(rooms_string)
bedrooms_index = train_features_column_names.index(bedrooms_string)
population_index = train_features_column_names.index(population_string)
households_index = train_features_column_names.index(households_string)

bpr_string = 'bedrooms_per_room'
rph_string = 'rooms_per_household'
pph_string = 'population_per_household'

In [14]:
# create a transformer that adds the extra numerical features

In [15]:
class NumericalFeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_features, add_bedrooms_per_room=False, add_rooms_per_household=False, add_population_per_household=False):
        self.numerical_features = numerical_features
        
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.add_rooms_per_household = add_rooms_per_household
        self.add_population_per_household = add_population_per_household
        
        possible_features = (bpr_string, rph_string, pph_string)
        parameter_settings = (self.add_bedrooms_per_room, self.add_rooms_per_household, self.add_population_per_household)

        self.possible_features_and_parameter_settings = list(zip(possible_features, parameter_settings))
        
        self.added_features = []
        
        for possible_feature, parameter_setting in self.possible_features_and_parameter_settings:
            if parameter_setting:
                self.added_features.append(possible_feature)
        
        return
    
    def fit(self, X):
        return self
    
    def transform(self, X):
        new_columns = None
        
        for possible_feature, parameter_setting in self.possible_features_and_parameter_settings:
            new_column = None
            if parameter_setting:
                if possible_feature == bpr_string:
                    new_column = X[:, bedrooms_index] / X[:, rooms_index]
                elif possible_feature == rph_string:
                    new_column = X[:, rooms_index] / X[:, households_index]
                elif possible_feature == pph_string:
                    new_column = X[:, population_index] / X[:, households_index]
                if new_columns is None:
                    new_columns = new_column
                else:
                    new_columns = np.c_[new_columns, new_column]
        
        if new_columns is None:
            return X
        
        return np.c_[X, new_columns]

In [16]:
# create a column transformer that performs data preparation

In [17]:
class DataPreparer(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_features, categorical_features, add_bpr=False, add_rph=False, add_pph=False):
        self.add_bpr = add_bpr
        self.add_rph = add_rph
        self.add_pph = add_pph
        
        self.numerical_features = numerical_features        
        self.categorical_features = categorical_features
        self.numerical_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                                           ('feature_adder', NumericalFeatureAdder(self.numerical_features, add_bedrooms_per_room=self.add_bpr, add_rooms_per_household=self.add_rph, add_population_per_household=self.add_pph)),
                                           ('scaler', StandardScaler())])
        self.column_transformer = ColumnTransformer([('numerical', self.numerical_pipeline, self.numerical_features)
                                                       , ('categorical', OneHotEncoder(handle_unknown='ignore'), self.categorical_features)])
        self.added_numerical_features = self.numerical_pipeline.named_steps['feature_adder'].added_features
        return
    
    def fit(self, X):
        self.column_transformer.fit(X)
        one_hot_encoder = self.column_transformer.named_transformers_['categorical']
        self.features = self.numerical_features + self.added_numerical_features + list(np.concatenate(one_hot_encoder.categories_))
        return self
    
    def transform(self, X):
        return self.column_transformer.transform(X)

In [18]:
categorical_features = [ocean_proximity_string]
numerical_features = list(train_features.drop(columns=categorical_features))

data_preparer = DataPreparer(numerical_features, categorical_features, add_bpr=True)
prepped_train_features = data_preparer.fit_transform(train_features)

data_preparer.features

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'bedrooms_per_room',
 '<1H OCEAN',
 'INLAND',
 'ISLAND',
 'NEAR BAY',
 'NEAR OCEAN']

In [19]:
# brief validation of data preparer on first row of shuffled training features

In [20]:
train_features.shape

(16512, 9)

In [21]:
prepped_train_features.shape

(16512, 14)

In [22]:
train_features.iloc[0]

longitude             -116.5
latitude               33.82
housing_median_age      16.0
total_rooms            343.0
total_bedrooms          85.0
population              29.0
households              14.0
median_income         2.1042
ocean_proximity       INLAND
Name: 12344, dtype: object

In [23]:
prepped_train_features[0]

array([ 1.53652021, -0.8524502 , -1.00209463, -1.04711441, -1.0773835 ,
       -1.25320775, -1.26619991, -0.92674015,  0.54898116,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ])

In [24]:
(train_features.iloc[0, 0] - train_features['longitude'].mean())/train_features['longitude'].std()

1.5364736790267735

In [25]:
# at this point, you could create a data preparation and prediction pipeline
# if you want to perform a grid/random search including parameters of data preparation.
# however, I am just going to use the default DataPreparer (i.e. no added features), so this is unnecessary

In [26]:
# do a coarse grid/random search with cv=5 to find the best hyperparameter values

# make sure to return train scores so you can compare train and validation scores.
# if there is a large discrepancy between train and validation scores, you are overfitting and should regularize

# the GridSearchCV calls were taking a long time so instead of combining the parameter grids into one list,
# I separated them by kernel (linear, poly, etc.). For future reference, ASDF is the best kernel
# so don't bother running the other cells unless you want to wait a long time

In [27]:
coarse_C_list = [.01, 1, 100]
coarse_epsilon_list = [.001, .1, 10]
coarse_coef0_list = [0, 1, 10]

coarse_linear_parameter_grid = {'kernel' : ['linear'], 'C' : coarse_C_list, 'epsilon' : coarse_epsilon_list}
coarse_poly_parameter_grid = {'kernel' : ['poly'], 'degree' : [2, 3], 'gamma' : ['scale', 'auto'], 'coef0' : coarse_coef0_list, 'C' : coarse_C_list, 'epsilon' : coarse_epsilon_list}
coarse_rbf_parameter_grid = {'kernel' : ['rbf'], 'gamma' : ['scale', 'auto'], 'C' : coarse_C_list, 'epsilon' : coarse_epsilon_list}
coarse_sigmoid_parameter_grid = coarse_poly_parameter_grid = {'kernel' : ['sigmoid'], 'gamma' : ['scale', 'auto'], 'coef0' : coarse_coef0_list, 'C' : coarse_C_list, 'epsilon' : coarse_epsilon_list}

In [None]:
# linear

In [64]:
coarse_linear_grid_search = GridSearchCV(SVR(), coarse_linear_parameter_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
coarse_linear_grid_search.fit(prepped_train_features, train_labels)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.01, 1, 100], 'epsilon': [0.001, 0.1, 10],
                         'kernel': ['linear']},
             return_train_score=True, scoring='neg_mean_squared_error')

In [2]:
np.sqrt(-coarse_linear_grid_search.cv_results_['mean_test_score'])

NameError: name 'np' is not defined

In [None]:
# poly

In [71]:
coarse_poly_grid_search = GridSearchCV(SVR(), coarse_poly_parameter_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
coarse_poly_grid_search.fit(prepped_train_features, train_labels)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.01, 1, 100], 'coef0': [0, 1, 10],
                         'epsilon': [0.001, 0.1, 10],
                         'gamma': ['scale', 'auto'], 'kernel': ['sigmoid']},
             return_train_score=True, scoring='neg_mean_squared_error')

In [28]:
# rbf

In [None]:
coarse_rbf_grid_search = GridSearchCV(SVR(), coarse_rbf_parameter_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
coarse_rbf_grid_search.fit(prepped_train_features, train_labels)

In [None]:
# sigmoid

In [None]:
coarse_linear_grid_search = GridSearchCV(SVR(), coarse_linear_parameter_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
coarse_linear_grid_search.fit(prepped_train_features, train_labels)

In [None]:
# do a more refined grid/random search centered on the best hyperparameter values

In [None]:
# select model hyperparameters and train on full dataset

In [None]:
# evaluate model on test set