In [26]:
import numpy as np
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import zero_one_loss

# 1. Generate a training sample of 100 points

In [27]:
# Seed the global NumPy RNG first so every draw below is repeatable.
np.random.seed(42)

# Dimensionality of each observation and the class-1 mean (mu):
# the first four coordinates are shifted by 1, the remaining six are 0.
num_features = 10
mean_shift_vector = np.array([1] * 4 + [0] * 6)

# Sample sizes.
n_train = 100   # total training points; 50 from each class
n_test = 5000   # test points drawn from each class

# Number of Monte Carlo replications to average over.
reps = 100


In [28]:
def generate_data(n_per_class, mean_shift_vector):
    """Draw a balanced two-class Gaussian sample from the global NumPy RNG.

    Class 0 is drawn with a zero mean and class 1 with mean
    ``mean_shift_vector``; both use an identity covariance matrix. The
    dimensionality is taken from ``mean_shift_vector`` itself, so the
    function does not depend on any module-level feature count.

    Parameters
    ----------
    n_per_class : int
        Number of observations to draw for each of the two classes.
    mean_shift_vector : 1-D array-like
        Mean of class 1; its length fixes the number of features.

    Returns
    -------
    X : numpy.ndarray of shape (2 * n_per_class, len(mean_shift_vector))
        Class-0 rows first, followed by class-1 rows.
    y : numpy.ndarray
        Labels aligned with ``X``: 0 for the first half, 1 for the second.
    """
    # Derive the dimension from the argument rather than a global constant
    # so that the mean, covariance and shift vector can never disagree.
    dim = len(mean_shift_vector)
    identity_cov = np.identity(dim)

    # Draw order (class 0, then class 1) is kept fixed so that a given seed
    # always yields the same sample.
    X0 = np.random.multivariate_normal(mean=np.zeros(dim), cov=identity_cov, size=n_per_class)
    X1 = np.random.multivariate_normal(mean=mean_shift_vector, cov=identity_cov, size=n_per_class)

    X = np.vstack((X0, X1))
    y = np.array([0] * n_per_class + [1] * n_per_class)
    return X, y


# 2. Fit a classifier using the training sample & 3. Evaluate the number of mistakes on a large test set & 4. Average the test error over many replications


In [34]:
def monte_carlo_error(model, num_reps):
    """Estimate a model's average test error by Monte Carlo simulation.

    One test set is generated up front with ``n_test`` points per class.
    For each of ``num_reps`` replications a new training set with
    ``n_train // 2`` points per class is generated, ``model`` is refit on
    it, and its misclassification rate on the test set is recorded.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refit in place on every replication.
    num_reps : int
        Number of training sets to simulate.

    Returns
    -------
    The mean of the per-replication zero-one losses.
    """
    holdout_X, holdout_y = generate_data(n_test, mean_shift_vector)

    def _one_replication():
        # Fresh training draw -> refit -> score on the shared test set.
        fit_X, fit_y = generate_data(n_train // 2, mean_shift_vector)
        model.fit(fit_X, fit_y)
        return zero_one_loss(holdout_y, model.predict(holdout_X))

    return np.mean([_one_replication() for _ in range(num_reps)])

# Py.1

In [30]:
# Py.1: support vector classifier with an RBF kernel and C fixed at 10.
svm_rbf_model = SVC(kernel='rbf', C=10)

# Average test error over the Monte Carlo replications, to two decimals.
error_py1 = round(
    monte_carlo_error(model=svm_rbf_model, num_reps=reps),
    ndigits=2,
)
print("Expected test error rate using SVM RBF:", error_py1)

Expected test error rate using SVM RBF: 0.24


# Py.2

In [31]:
# Py.2: support vector classifier with a linear kernel and C fixed at 10.
svm_linear_model = SVC(kernel='linear', C=10)

# Average test error over the Monte Carlo replications, to two decimals.
error_py2 = round(
    monte_carlo_error(model=svm_linear_model, num_reps=reps),
    ndigits=2,
)
print("Expected test error rate using SVM Linear:", error_py2)

Expected test error rate using SVM Linear: 0.19


# Py.3

In [32]:
# Py.3: linear discriminant analysis with the estimator's default settings.
lda_model = LinearDiscriminantAnalysis()

# Average test error over the Monte Carlo replications, to two decimals.
error_py3 = round(
    monte_carlo_error(model=lda_model, num_reps=reps),
    ndigits=2,
)
print("Expected test error rate using LDA:", error_py3)

Expected test error rate using LDA: 0.19
