<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/SVM_Count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard-library helpers used below for import-path setup and module reloading.
import importlib
import sys

from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Add the experiments folder on Drive to the import path
# (presumably where utils / preprocessing / splitting live -- confirm).
sys.path.append('/content/drive/My Drive/AD Final Experiments')

# Import utils and reload it right away so that edits made to the module after
# the kernel first imported it are picked up, then pull its names into scope.
# NOTE(review): only utils is reloaded; preprocessing and splitting are not.
import utils
importlib.reload(utils)
from utils import *
from preprocessing import preprocess_data
from splitting import split_data

print("Imports loaded successfully!")


Mounted at /content/drive
Imports loaded successfully!


## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Location of the processed dataset on the mounted Drive.
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'

# Run the project's preprocessing step on the CSV.
data = preprocess_data(file_path)

# split_data returns six objects: train / validation / test features
# followed by the matching train / validation / test labels.
(
    X_train_final, X_val, X_test,
    y_train_final, y_val, y_test,
) = split_data(data)

# Quick sanity check on the size of each split.
print(f"Training data shape: {X_train_final.shape}, Validation data shape: {X_val.shape}, Test data shape: {X_test.shape}")


# Hyperparameter search space for the SVC stage of the pipeline.
param_grid = {
    'C': [0.01, 1, 10],
    'kernel': ['linear', 'rbf'],  # Removed 'poly'
    'gamma': ['scale', 'auto']   # Relevant for RBF kernel
}

best_score = -1
best_params = None
best_model = None

# The same seeded splitter is used for every configuration, so all
# configurations are scored on identical folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []


def _build_svm_pipeline(C, kernel, gamma):
    """Return an unfitted CountVectorizer -> StandardScaler -> SVC pipeline.

    Parameters
    ----------
    C, kernel, gamma
        Forwarded unchanged to ``SVC``.
    """
    return Pipeline([
        ('COUNT', CountVectorizer()),  # Bag-of-words counts (sparse output)
        # with_mean=False: centering is not supported on sparse input
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', SVC(C=C, kernel=kernel, gamma=gamma, random_state=42, class_weight='balanced'))
    ])


def _cross_val_accuracy(C, kernel, gamma):
    """Return the mean accuracy of one configuration over the ``skf`` folds.

    If any fold raises, the configuration is abandoned and NaN is returned,
    so that a mean over a partial set of folds is never compared against
    configurations that were evaluated on all folds.
    """
    fold_scores = []
    for train_idx, val_idx in skf.split(X_train_final, y_train_final):
        X_train_fold, X_val_fold = X_train_final.iloc[train_idx], X_train_final.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_final.iloc[train_idx], y_train_final.iloc[val_idx]

        try:
            pipeline = _build_svm_pipeline(C, kernel, gamma)
            pipeline.fit(X_train_fold, y_train_fold)
            y_val_fold_pred = pipeline.predict(X_val_fold)
            fold_scores.append(accuracy_score(y_val_fold, y_val_fold_pred))
        except Exception as e:
            print(f"Skipping configuration C={C}, kernel={kernel}, gamma={gamma} due to error: {e}")
            return np.nan

    return np.mean(fold_scores)


# SVC ignores gamma when kernel='linear', so linear configurations that differ
# only in gamma give identical scores on identical folds. Cache them instead of
# cross-validating the same model twice.
_score_cache = {}

# Iterate over all combinations of hyperparameters
for C, kernel, gamma in product(param_grid['C'], param_grid['kernel'], param_grid['gamma']):
    cache_key = (C, kernel) if kernel == 'linear' else (C, kernel, gamma)
    if cache_key not in _score_cache:
        _score_cache[cache_key] = _cross_val_accuracy(C, kernel, gamma)

    # One entry per grid point, in grid order; NaN marks a skipped configuration.
    avg_fold_score = _score_cache[cache_key]
    cv_scores.append(avg_fold_score)

    # Update best parameters if current score is better.
    # NaN compares False here, so skipped configurations can never win.
    if avg_fold_score > best_score:
        best_score = avg_fold_score
        best_params = {'C': C, 'kernel': kernel, 'gamma': gamma}

# Refit the winning configuration on the whole training split. Keeping the
# pipeline object left over from the CV loop would store a model trained on
# only part of the training data, and not necessarily with best_params.
if best_params is not None:
    best_model = _build_svm_pipeline(**best_params).fit(X_train_final, y_train_final)

print(f"Best parameters found with Stratified CV: {best_params}")

if best_model is None:
    # Nothing was selected by the search, so there is nothing to evaluate.
    print("No valid model found during grid search.")
else:
    # Accuracy of the selected model on the held-out validation split.
    y_val_pred = best_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    print(f"Validation set accuracy: {val_score}")

    # Per-class behaviour of the selected model on the test split.
    y_test_pred = best_model.predict(X_test)
    conf_matrix_test = confusion_matrix(y_test, y_test_pred)
    classification_rep_test = classification_report(y_test, y_test_pred)

    # sep="\n" puts each heading on its own line, directly above its report.
    print("\nTest Confusion Matrix:", conf_matrix_test, sep="\n")
    print("\nTest Classification Report:", classification_rep_test, sep="\n")


Training data shape: (24879,), Validation data shape: (6220,), Test data shape: (7775,)
Best parameters found with Stratified CV: {'C': 0.01, 'kernel': 'linear', 'gamma': 'scale'}
Validation set accuracy: 0.8485530546623794

Test Confusion Matrix:
[[  92    6    4   62    0    7    2   27]
 [   2   37    8   21    0    3    4   14]
 [   6    6   58   39    0    6    9   11]
 [  73   34   90 1682   18  130   28  151]
 [   1    2    0    8    9    3    0    0]
 [  13    8   11  102    2  231    8   12]
 [   1    3    6   13    0    3  106   11]
 [  60   25   20  106    4    5   13 4359]]

Test Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.37      0.46      0.41       200
         COMPATIBILITY       0.31      0.42      0.35        89
                DEFECT       0.29      0.43      0.35       135
                DESIGN       0.83      0.76      0.79      2206
         DOCUMENTATION       0.27      0.39      0.32    