# SVM from Ian

In [4]:
# Load basic libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stats
import time

# XGB libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR



# Import data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
sample_sub =  pd.read_csv("data/sample_submission.csv")

# Remove NA column from training data
train_df = train_df.drop(columns='Unnamed: 12')

# Fix column name error
test_df = test_df.rename(columns={'TA1':'TA1.x'})

In [2]:
# Assign features
X = train_df.drop(columns=['id', 'DIC'], axis=1)
y = train_df['DIC']

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=808) 

# Scale the data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X.columns)

In [5]:
param_grid = {
    'C':[125],
    'gamma': [0.001],
    'kernel': ['linear'],
    'tol': [0.002],
    'epsilon': [5]
}

# Create GridSearchCV object with SVR
grid_search_svm = GridSearchCV(SVR(), param_grid, cv=5)

start_time = time.time()
# Now fit should work with continuous target
grid_search_svm.fit(X_train_scaled, y_train)
end_time = time.time()

svm_time = end_time - start_time
print(svm_time)

# Retrieve the best SVM model
print(grid_search_svm.best_params_)


NameError: name 'X_train_scaled' is not defined

In [5]:
best_svm = grid_search_svm.best_estimator_

best_svm.fit(X_train_scaled, y_train)

best_svm_preds = best_svm.predict(X_val_scaled)

rmse = np.sqrt(mean_squared_error(best_svm_preds, y_val))
print(f"SVM RMSE: {rmse:.4f}")


SVM RMSE: 7.7657


In [None]:
# Prepare the test data
X_test = test_df.drop(columns=['id'], axis=1)  # Drop id, but no need to drop DIC as it's not in test_df

# Scale the test data using the same scaler
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Make predictions on the test data using your best SVM model
test_predictions = best_svm.predict(X_test_scaled)
print(f"Test Accuracy score: {svm_score}")

In [None]:
# Create a submission dataframe
submission = pd.DataFrame({
    'id': test_df['id'],
    'DIC': test_predictions
})

# Preview the submission file
submission.head()

# Save the submission to a CSV file
# submission.to_csv('svm_submission.csv', index=False)

Trying SVM with cross validation as opposed to `train_test_split`...

In [2]:
# Assign features
X = train_df.drop(columns=['id', 'DIC'], axis=1)
y = train_df['DIC']
X_test = test_df.drop(columns=['id'], axis=1) 

# Scale the data
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# For predictions later on...
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [20]:
# Define K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=808)

# Define parameter grid for GridSearchCV
# param_grid = {
#     'C': [125],
#     'gamma': [0.001],
#     'kernel': ['linear'],
#     'tol': [0.002],
#     'epsilon': [5]
# }

param_grid = {
    'C': [0.1, 1, 10, 50, 100, 200],  # Regularization strength
    'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1],  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'degree': [2],  # Only used for 'poly' kernel
    'epsilon': [0.01, 0.1, 0.5, 1, 5, 10],  # Defines margin of tolerance
    'tol': [0.0001, 0.001, 0.01],  # Tolerance for stopping criteria
}

# Create GridSearchCV object with SVR
grid_search_svm = GridSearchCV(SVR(), param_grid, cv=kf, scoring='neg_root_mean_squared_error', n_jobs=16, verbose=1)

# Run grid search with cross-validation
start_time = time.time()
grid_search_svm.fit(X_scaled, y)
end_time = time.time()

svm_time = end_time - start_time
print(f"Grid Search Time: {svm_time:.2f} seconds")

# Retrieve the best SVM model
print("Best Hyperparameters:", grid_search_svm.best_params_)


Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


Grid Search Time: 125.21 seconds
Best Hyperparameters: {'C': 100, 'degree': 2, 'epsilon': 5, 'gamma': 'scale', 'kernel': 'linear', 'tol': 0.0001}


In [22]:
grid_search_svm.best_score_

-5.658652379600355

In [23]:
best_svm = grid_search_svm.best_estimator_

# Train the best SVM model on full dataset
best_svm.fit(X_scaled, y)

# Predict on test set (make sure it's scaled using the same scaler)
svm_preds = best_svm.predict(X_test_scaled)

# Save predictions (optional)
# pd.DataFrame(svm_preds, columns=['Predicted_DIC']).to_csv('svm_predictions.csv', index=False)


In [24]:
# Add DIC to test dataset
test_df['DIC'] = svm_preds
submission = test_df[['id', 'DIC']]
submission.head()

Unnamed: 0,id,DIC
0,1455,2173.070942
1,1456,2195.057243
2,1457,2327.120345
3,1458,1993.791748
4,1459,2148.123678


In [None]:
# Export for submission
# submission.to_csv('submission.csv', index=False)