<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/SVM_Hash.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Add the shared module folder to Python's path
import sys
sys.path.append('/content/drive/My Drive/AD Final Experiments')
import importlib
import utils
importlib.reload(utils)
from utils import *
from preprocessing import preprocess_data  # Import preprocess_data function
from splitting import split_data  # Import split_data function

print("Imports loaded successfully!")


Mounted at /content/drive
Imports loaded successfully!


##SVM

In [2]:
from sklearn.svm import SVC

In [3]:
# Preprocess the data
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
data = preprocess_data(file_path)  # Use preprocess_data to preprocess the dataset

# Split the data into training, validation, and test sets
X_train_final, X_val, X_test, y_train_final, y_val, y_test = split_data(data)  # Use split_data to split data

# Check if data splitting was successful
print(f"Training data shape: {X_train_final.shape}, Validation data shape: {X_val.shape}, Test data shape: {X_test.shape}")


param_grid = {
    'C': [0.01, 1, 10],
    'kernel': ['linear', 'rbf'],  # Removed 'poly'
    'gamma': ['scale', 'auto']   # Relevant for RBF kernel
}

# Initialise variables to track the best model
best_score = -1
best_params = None
best_model = None

# Perform stratified 5-fold cross-validation on the training set
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

# Iterate over all combinations of hyperparameters
for C, kernel, gamma in product(param_grid['C'], param_grid['kernel'], param_grid['gamma']):
    fold_scores = []
    for train_idx, val_idx in skf.split(X_train_final, y_train_final):
        X_train_fold, X_val_fold = X_train_final.iloc[train_idx], X_train_final.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_final.iloc[train_idx], y_train_final.iloc[val_idx]

        try:
            # Define the pipeline
            pipeline = Pipeline([
                ('Hash', HashingVectorizer()),  # CountVectorizer for feature extraction
                ('scaler', StandardScaler(with_mean=False)),  # StandardScaler for scaling
                ('clf', SVC(C=C, kernel=kernel, gamma=gamma, random_state=42, class_weight='balanced'))  # SVM model
            ])

            # Train on training fold
            pipeline.fit(X_train_fold, y_train_fold)

            # Validate on validation fold
            y_val_fold_pred = pipeline.predict(X_val_fold)
            fold_score = accuracy_score(y_val_fold, y_val_fold_pred)
            fold_scores.append(fold_score)

        except Exception as e:
            print(f"Skipping configuration C={C}, kernel={kernel}, gamma={gamma} due to error: {e}")

    # Calculate average score across all folds
    avg_fold_score = np.mean(fold_scores)
    cv_scores.append(avg_fold_score)

    # Update best parameters if current score is better
    if avg_fold_score > best_score:
        best_score = avg_fold_score
        best_params = {'C': C, 'kernel': kernel, 'gamma': gamma}
        best_model = pipeline

print(f"Best parameters found with Stratified CV: {best_params}")
#print(f"Best cross-validated accuracy on training set: {best_score}")

# Ensure best_model is not None
if best_model is not None:
    # Evaluate the best model on the validation set
    y_val_pred = best_model.predict(X_val)
    val_score = accuracy_score(y_val, y_val_pred)
    print(f"Validation set accuracy: {val_score}")

    # Evaluate the best model on the test set
    y_test_pred = best_model.predict(X_test)
    conf_matrix_test = confusion_matrix(y_test, y_test_pred)
    classification_rep_test = classification_report(y_test, y_test_pred)

    print("\nTest Confusion Matrix:")
    print(conf_matrix_test)
    print("\nTest Classification Report:")
    print(classification_rep_test)
else:
    print("No valid model found during grid search.")


Training data shape: (24879,), Validation data shape: (6220,), Test data shape: (7775,)
Best parameters found with Stratified CV: {'C': 0.01, 'kernel': 'linear', 'gamma': 'scale'}
Validation set accuracy: 0.8390675241157556

Test Confusion Matrix:
[[  75    0    2   72    0    5    0   46]
 [   1   31    7   30    0    4    1   15]
 [   2    1   50   55    0    6    3   18]
 [  39   11   46 1754    2  108   10  236]
 [   1    0    0    8    7    3    0    4]
 [   7    3    9  120    0  218    3   27]
 [   1    0    6   44    1    9   57   25]
 [  26    4   15  169    4   14    5 4355]]

Test Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.49      0.38      0.43       200
         COMPATIBILITY       0.62      0.35      0.45        89
                DEFECT       0.37      0.37      0.37       135
                DESIGN       0.78      0.80      0.79      2206
         DOCUMENTATION       0.50      0.30      0.38    