Imports

In [33]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import zero_one_loss

Read data

In [34]:
# Load the four comma-separated splits in one pass; the tuple order below
# matches the order of the names being assigned.
X_train, y_train, X_test, y_test = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'X_train_binary.csv',
        'y_train_binary.csv',
        'X_test_binary.csv',
        'y_test_binary.csv',
    )
)

Class frequencies

In [35]:
def print_class_frequencies(labels, heading):
    """Print the relative frequency of every distinct value in ``labels``.

    Parameters
    ----------
    labels : array-like
        Class labels; the denominator of each frequency is ``len(labels)``.
    heading : str
        Line printed before the per-class frequencies.

    Returns
    -------
    tuple
        ``(unique_classes, class_counts)`` exactly as returned by
        ``np.unique(labels, return_counts=True)``.
    """
    unique_classes, class_counts = np.unique(labels, return_counts=True)
    total_data_points = len(labels)

    print(heading)
    for class_label, class_count in zip(unique_classes, class_counts):
        class_frequency = class_count / total_data_points
        print(f"Class {int(class_label)}: {class_frequency:.2%}")
    return unique_classes, class_counts


# Class balance of the training split
unique_classes_train, class_counts_train = print_class_frequencies(
    y_train, "Class frequencies for training data:"
)
total_data_points_train = len(y_train)

# Class balance of the testing split (leading newline separates the two reports)
unique_classes_test, class_counts_test = print_class_frequencies(
    y_test, "\nClass frequencies for testing data:"
)
total_data_points_test = len(y_test)

Class frequencies for training data:
Class -1: 45.33%
Class 1: 54.67%

Class frequencies for testing data:
Class -1: 46.95%
Class 1: 53.05%


Normalization

In [36]:
# Normalize training data to zero mean and unit variance.
# Use the directly imported StandardScaler: the notebook only runs
# `from sklearn.preprocessing import StandardScaler`, so the bare name
# `sklearn` is not defined and `sklearn.preprocessing...` raises NameError
# on a fresh kernel.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on train, then transform it

# These are statistics over ALL entries of the scaled matrix (no axis given),
# i.e. a single global mean/variance rather than per-feature values.
print("\nMean and variance of training data after normalization:")
print(f"Mean: {np.mean(X_train_scaled):.2f}")
print(f"Variance: {np.var(X_train_scaled):.2f}")


Mean and variance of training data after normalization:
Mean: 0.00
Variance: 1.00


In [37]:
# Column-wise (per-feature) statistics of the scaled training matrix
mean_values = X_train_scaled.mean(axis=0)
variance_values = X_train_scaled.var(axis=0)

# Report one line per feature, numbering features from 1
per_feature_stats = zip(mean_values, variance_values)
for feature_number, (mean, variance) in enumerate(per_feature_stats, start=1):
    print(f"Feature {feature_number}: Mean = {mean:.4f}, Variance = {variance:.4f}")


Feature 1: Mean = 0.0000, Variance = 1.0000
Feature 2: Mean = 0.0000, Variance = 1.0000
Feature 3: Mean = -0.0000, Variance = 1.0000
Feature 4: Mean = 0.0000, Variance = 1.0000
Feature 5: Mean = 0.0000, Variance = 1.0000
Feature 6: Mean = 0.0000, Variance = 1.0000
Feature 7: Mean = -0.0000, Variance = 1.0000
Feature 8: Mean = 0.0000, Variance = 1.0000
Feature 9: Mean = 0.0000, Variance = 1.0000
Feature 10: Mean = -0.0000, Variance = 1.0000
Feature 11: Mean = -0.0000, Variance = 1.0000
Feature 12: Mean = -0.0000, Variance = 1.0000
Feature 13: Mean = -0.0000, Variance = 1.0000
Feature 14: Mean = -0.0000, Variance = 1.0000
Feature 15: Mean = -0.0000, Variance = 1.0000
Feature 16: Mean = 0.0000, Variance = 1.0000
Feature 17: Mean = 0.0000, Variance = 1.0000
Feature 18: Mean = 0.0000, Variance = 1.0000
Feature 19: Mean = 0.0000, Variance = 1.0000
Feature 20: Mean = 0.0000, Variance = 1.0000
Feature 21: Mean = -0.0000, Variance = 1.0000
Feature 22: Mean = 0.0000, Variance = 1.0000
Feature 23

Test data normalization: mean and variance of input features

In [38]:
# Apply the scaler that was fitted on the training data to the test data
# (no re-fitting, so test statistics are not expected to be exactly 0 / 1)
X_test_scaled = scaler.transform(X_test)

# Column-wise (per-feature) statistics of the transformed test matrix
mean_values_test = X_test_scaled.mean(axis=0)
variance_values_test = X_test_scaled.var(axis=0)

# Report one line per feature, numbering features from 1
per_feature_stats_test = zip(mean_values_test, variance_values_test)
for feature_number, (mean, variance) in enumerate(per_feature_stats_test, start=1):
    print(f"Feature {feature_number}: Mean = {mean:.4f}, Variance = {variance:.4f}")


Feature 1: Mean = 0.0904, Variance = 1.9290
Feature 2: Mean = 0.1658, Variance = 7.2771
Feature 3: Mean = -0.0632, Variance = 0.7855
Feature 4: Mean = -0.0802, Variance = 0.7411
Feature 5: Mean = -0.0379, Variance = 0.8555
Feature 6: Mean = -0.1084, Variance = 0.9803
Feature 7: Mean = -0.1047, Variance = 1.0681
Feature 8: Mean = -0.2116, Variance = 2.8828
Feature 9: Mean = 0.2688, Variance = 2.9717
Feature 10: Mean = 0.0805, Variance = 1.4777
Feature 11: Mean = 0.0141, Variance = 1.0939
Feature 12: Mean = 0.0575, Variance = 1.1381
Feature 13: Mean = 0.0139, Variance = 1.1177
Feature 14: Mean = 0.0035, Variance = 1.2449
Feature 15: Mean = 0.1325, Variance = 1.2656
Feature 16: Mean = 0.0234, Variance = 1.0073
Feature 17: Mean = 0.1302, Variance = 1.1311
Feature 18: Mean = 0.1306, Variance = 3.8939
Feature 19: Mean = 0.0252, Variance = 5.7140
Feature 20: Mean = 0.1006, Variance = 5.0024
Feature 21: Mean = 0.4756, Variance = 54.2813
Feature 22: Mean = 0.1081, Variance = 1.4396
Feature 23: 

In [41]:
# Candidate values for the RBF-SVM regularisation strength C and kernel width gamma
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# Create an SVM classifier with an RBF kernel
svm = SVC(kernel='rbf')

# Create a scaler for normalization and scale both splits once, using
# statistics estimated from the training data only
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Perform grid search with 5-fold cross-validation. Candidates are ranked by
# accuracy, which equals 1 - (0-1 loss), so maximising it selects the same
# hyperparameters as minimising the 0-1 loss.
# NOTE(review): the scaler is fitted on the full training set before the CV
# split, so every validation fold has contributed to the scaling statistics.
# Wrapping scaler + SVC in a Pipeline would remove this mild leakage, at the
# cost of changing the type of `best_svm` and the keys of `best_params_`.
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_std, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Get the best SVM model. With the default refit=True, GridSearchCV has
# already retrained it on the complete training set, so no extra fit is needed.
best_svm = grid_search.best_estimator_

# Evaluate the training set error (0-1 loss)
training_error = zero_one_loss(y_train, best_svm.predict(X_train_std))
print(f"Training 0-1 Loss: {training_error:.4f}")

# Predict labels on the test set using the trained model
y_test_pred = best_svm.predict(X_test_std)

# Calculate the zero-one loss on the test set
test_error = zero_one_loss(y_test, y_test_pred)
print(f"Test 0-1 Loss: {test_error:.4f}")

Best hyperparameters: {'C': 1, 'gamma': 0.01}
Training 0-1 Loss: 0.0467
Test 0-1 Loss: 0.2073


In [43]:
# List of different C values to experiment with
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Iterate over different C values
for C in C_values:
    # Create an SVM classifier with the specified C
    svm = SVC(kernel='rbf', C=C)
    
    # Fit the SVM model
    svm.fit(X_train_scaled, y_train)
    
    # Count the number of bounded and free support vectors
    bounded_support_vectors = np.sum(np.logical_and(0 < svm.dual_coef_.ravel(), svm.dual_coef_.ravel() < C))
    free_support_vectors = np.sum(svm.dual_coef_.ravel() == C)
    
    # Print the results
    print(f"C = {C}: Bounded Support Vectors = {bounded_support_vectors}, Free Support Vectors = {free_support_vectors}")

C = 0.001: Bounded Support Vectors = 0, Free Support Vectors = 68
C = 0.01: Bounded Support Vectors = 7, Free Support Vectors = 64
C = 0.1: Bounded Support Vectors = 9, Free Support Vectors = 62
C = 1: Bounded Support Vectors = 23, Free Support Vectors = 22
C = 10: Bounded Support Vectors = 40, Free Support Vectors = 0
C = 100: Bounded Support Vectors = 40, Free Support Vectors = 0
