In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

In [41]:
# Load the dataset
VT = pd.read_csv("data/processed_VT_Data.csv")
pd.set_option('display.max_columns', None)

# Separate features and target variable
X = VT.drop(columns=["Training_Program", "surgeries", "chronic_conditions"])
y = VT["Training_Program"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12, stratify=y)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Generate synthetic data with SMOTE
smote = SMOTE(random_state=12)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Original Training Data Distribution:")
print(pd.Series(y_train).value_counts())
print("\nTraining Data Distribution After SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Define the SVM model
svm_model = SVC(probability=True, random_state=12)

# Define hyperparameters for optimization
param_dist = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [0.001, 0.01,'scale', 'auto'],
    'degree': [2, 3, 4, 5, 6],
    'class_weight': [None, 'balanced']
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='accuracy',
    cv=4,
    random_state=12,
    verbose=1,
    n_jobs=-1
)

# Fit the RandomizedSearchCV
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model performance
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model
best_svm = random_search.best_estimator_
y_pred = best_svm.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Original Training Data Distribution:
Training_Program
0    26
2    24
3    22
1    21
4    18
5    18
6    12
7    11
Name: count, dtype: int64

Training Data Distribution After SMOTE:
Training_Program
7    26
1    26
2    26
4    26
6    26
3    26
5    26
0    26
Name: count, dtype: int64
Fitting 4 folds for each of 30 candidates, totalling 120 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 0.01, 'degree': 6, 'class_weight': None, 'C': 1000}
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.43      0.55         7
           1       0.57      0.80      0.67         5
           2       0.38      0.50      0.43         6
           3       0.40      0.33      0.36         6
           4       0.25      0.20      0.22         5
           5       0.40      0.50      0.44         4
           6       0.67      0.67      0.67         3
           7       0.67      0.67      0.67         3

    accuracy                           