In [1]:
import pandas as pd

# Load your CSV file into a DataFrame
df_final = pd.read_csv('updated_csv_file.csv')

  df_final = pd.read_csv('updated_csv_file.csv')


In [2]:



# DATA PRE PROCESSING


import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Step 2: Check for missing values and handle them

# Separate numeric and non-numeric columns
numeric_cols = df_final.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols = df_final.select_dtypes(exclude=['float64', 'int64']).columns

# Fill missing values for numeric columns with the column mean
df_final[numeric_cols] = df_final[numeric_cols].fillna(df_final[numeric_cols].mean())

# Fill missing values for non-numeric columns with the most frequent value (mode)
for col in non_numeric_cols:
    df_final[col].fillna(df_final[col].mode()[0], inplace=True)

# Step 3: Scale only the numeric columns
scaler = StandardScaler()
df_final_scaled = df_final.copy()  # Create a copy of the DataFrame to hold scaled values
df_final_scaled[numeric_cols] = scaler.fit_transform(df_final[numeric_cols])  # Only scale the numeric columns

# Step 4: Separate features (X) and target (y)
# X = all columns except the last one (Subtype), retaining both numeric and non-numeric columns
X = df_final_scaled.iloc[:, :-1]  # Features
y = df_final_scaled.iloc[:, -1]   # Target (Subtype)

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shapes of the training and testing sets
print("Preproccessed Successfully")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final[col].fillna(df_final[col].mode()[0], inplace=True)


Preproccessed Successfully
X_train shape: (756, 16393)
X_test shape: (189, 16393)
y_train shape: (756,)
y_test shape: (189,)


In [8]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import ADASYN
import numpy as np

# Step 1: Ensure numeric columns are selected
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Extract numeric columns from X_train and X_test
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]

# Step 2: Dimensionality Reduction with PCA (retain 95% variance)
pca = PCA(n_components=0.95)  # Automatically selects components retaining 95% variance
X_train_pca = pca.fit_transform(X_train_numeric)
X_test_pca = pca.transform(X_test_numeric)

print("Reduced X_train shape after PCA:", X_train_pca.shape)

# Step 3: Handle class imbalance using ADASYN
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_pca, y_train)

# Step 4: Expanded Grid Search for Hyperparameter Tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10],          # Wider range of C values for regularization
    'gamma': [0.0001, 0.00001, 0.000001],  # Explore smaller gamma values
    'kernel': ['rbf']                   # Use only the RBF kernel
}

# Step 5: Define class weights
class_weights = {'BRCA_Basal': 1, 'BRCA_Her2': 10, 'BRCA_LumA': 1, 'BRCA_LumB': 3}

# Perform GridSearchCV with cross-validation
grid = GridSearchCV(SVC(probability=True, class_weight=class_weights, random_state=42),
                    param_grid, refit=True, verbose=2, cv=5, return_train_score=True)
grid.fit(X_train_resampled, y_train_resampled)

# Step 6: Get Best Parameters and Evaluate Model
print("Best Parameters from Grid Search:", grid.best_params_)

# Cross-validation scores
cv_results = grid.cv_results_
train_scores = cv_results['mean_train_score']
test_scores = cv_results['mean_test_score']

print("\nCross-Validation Results:")
print("Average Training Score:", np.mean(train_scores))
print("Average Validation Score:", np.mean(test_scores))

# Check for Overfitting
if np.mean(train_scores) > np.mean(test_scores) + 0.05:  # Allowing a small margin
    print("\nWarning: The model might be overfitting!")
else:
    print("\nThe model does not appear to be overfitting.")

y_pred_grid = grid.predict(X_test_pca)

print("\nSVM Results after Hyperparameter Tuning:")
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid, zero_division=0))

# Step 7: Evaluate Confusion Matrix and Additional Metrics
cm = confusion_matrix(y_test, y_pred_grid)
print("\nConfusion Matrix for SVM:\n", cm)

mcc = matthews_corrcoef(y_test, y_pred_grid)
y_proba_svm = grid.predict_proba(X_test_pca)
roc_auc = roc_auc_score(y_test, y_proba_svm, multi_class='ovr')

print("\nSVM MCC after Grid Search:", mcc)
print("SVM AUC after Grid Search:", roc_auc)


Reduced X_train shape after PCA: (756, 153)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...................C=0.01, gamma=0.0001, kernel=rbf; total time=   3.2s
[CV] END ...................C=0.01, gamma=0.0001, kernel=rbf; total time=   3.3s
[CV] END ...................C=0.01, gamma=0.0001, kernel=rbf; total time=   3.2s
[CV] END ...................C=0.01, gamma=0.0001, kernel=rbf; total time=   3.0s
[CV] END ...................C=0.01, gamma=0.0001, kernel=rbf; total time=   1.5s
[CV] END ....................C=0.01, gamma=1e-05, kernel=rbf; total time=   1.3s
[CV] END ....................C=0.01, gamma=1e-05, kernel=rbf; total time=   1.2s
[CV] END ....................C=0.01, gamma=1e-05, kernel=rbf; total time=   1.2s
[CV] END ....................C=0.01, gamma=1e-05, kernel=rbf; total time=   1.3s
[CV] END ....................C=0.01, gamma=1e-05, kernel=rbf; total time=   1.5s
[CV] END ....................C=0.01, gamma=1e-06, kernel=rbf; total time=   1.4s
[CV]

In [12]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
import numpy as np

# Step 1: Ensure numeric columns are selected
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Extract numeric columns from X_train and X_test
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]

# Step 2: Dimensionality Reduction with PCA (retain 95% variance)
pca = PCA(n_components=0.95)  # Automatically selects components retaining 95% variance
X_train_pca = pca.fit_transform(X_train_numeric)
X_test_pca = pca.transform(X_test_numeric)

print("Reduced X_train shape after PCA:", X_train_pca.shape)

# Step 3: Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca, y_train)

print(f"Shape after SMOTE Resampling: {X_train_resampled.shape}")

# Step 4: Expanded Grid Search for Hyperparameter Tuning
param_grid = {
    'C': [0.1, 1, 10, 100],          # Wider range of C values for regularization
    'gamma': [0.001, 0.0001, 0.00001],  # Explore smaller gamma values
    'degree': [2, 3],                 # Degrees for the polynomial kernel
    'kernel': ['poly']                # Use the polynomial kernel
}

# Step 5: Define class weights
class_weights = {'BRCA_Basal': 1, 'BRCA_Her2': 15, 'BRCA_LumA': 1, 'BRCA_LumB': 7}

# Perform GridSearchCV with return_train_score=True to capture training scores
grid = GridSearchCV(SVC(probability=True, class_weight=class_weights, random_state=42),
                    param_grid, refit=True, verbose=2, return_train_score=True)
grid.fit(X_train_resampled, y_train_resampled)

# Step 6: Get Best Parameters and Evaluate Model
print("Best Parameters from Grid Search:", grid.best_params_)

y_pred_grid = grid.predict(X_test_pca)

print("\nSVM Results after Hyperparameter Tuning:")
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid, zero_division=0))

# Step 7: Evaluate Confusion Matrix and Additional Metrics
cm = confusion_matrix(y_test, y_pred_grid)
print("\nConfusion Matrix for SVM:\n", cm)

mcc = matthews_corrcoef(y_test, y_pred_grid)
y_proba_svm = grid.predict_proba(X_test_pca)
roc_auc = roc_auc_score(y_test, y_proba_svm, multi_class='ovr')

print("\nSVM MCC after Grid Search:", mcc)
print("SVM AUC after Grid Search:", roc_auc)

# Step 8: Cross-Validation for Overfitting Check
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(grid.best_estimator_, X_train_resampled, y_train_resampled, cv=cv, scoring='accuracy')

print("\nCross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))
print("Standard Deviation of Cross-Validation Accuracy:", np.std(cv_scores))

# Step 9: Check for Overfitting
train_accuracy = grid.best_score_
mean_cv_accuracy = np.mean(cv_scores)

print("\nTraining Accuracy from Grid Search:", train_accuracy)
print("Cross-Validation Accuracy:", mean_cv_accuracy)

if train_accuracy > mean_cv_accuracy + 0.05:  # Threshold for significant overfitting
    print("The model shows signs of overfitting.")
else:
    print("The model does not show significant overfitting.")


Reduced X_train shape after PCA: (756, 153)
Shape after SMOTE Resampling: (1600, 153)
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ..........C=0.1, degree=2, gamma=0.001, kernel=poly; total time=   0.6s
[CV] END ..........C=0.1, degree=2, gamma=0.001, kernel=poly; total time=   0.7s
[CV] END ..........C=0.1, degree=2, gamma=0.001, kernel=poly; total time=   0.7s
[CV] END ..........C=0.1, degree=2, gamma=0.001, kernel=poly; total time=   0.7s
[CV] END ..........C=0.1, degree=2, gamma=0.001, kernel=poly; total time=   0.7s
[CV] END .........C=0.1, degree=2, gamma=0.0001, kernel=poly; total time=   0.9s
[CV] END .........C=0.1, degree=2, gamma=0.0001, kernel=poly; total time=   0.8s
[CV] END .........C=0.1, degree=2, gamma=0.0001, kernel=poly; total time=   0.8s
[CV] END .........C=0.1, degree=2, gamma=0.0001, kernel=poly; total time=   0.9s
[CV] END .........C=0.1, degree=2, gamma=0.0001, kernel=poly; total time=   1.0s
[CV] END ..........C=0.1, degree=2, gamma=

In [18]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import ADASYN
import numpy as np

# Step 1: Ensure numeric columns are selected
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Extract numeric columns from X_train and X_test
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]

# Step 2: Dimensionality Reduction with PCA (retain 95% variance)
pca = PCA(n_components=0.95)  # Automatically selects components retaining 95% variance
X_train_pca = pca.fit_transform(X_train_numeric)
X_test_pca = pca.transform(X_test_numeric)

print("Reduced X_train shape after PCA:", X_train_pca.shape)

# Step 3: Handle class imbalance using ADASYN
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_pca, y_train)

# Step 4: Expanded Grid Search for Hyperparameter Tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10],          # Wider range of C values for regularization
    'gamma': [0.0001, 0.00001, 0.000001],  # Explore smaller gamma values
    'kernel': ['sigmoid']                   # Use 'sigmoid' kernel
}

# Step 5: Define class weights
class_weights = {'BRCA_Basal': 1, 'BRCA_Her2': 10, 'BRCA_LumA': 1, 'BRCA_LumB': 3}

# Perform GridSearchCV with cross-validation
grid = GridSearchCV(SVC(probability=True, class_weight=class_weights, random_state=42),
                    param_grid, refit=True, verbose=2, cv=5, return_train_score=True)
grid.fit(X_train_resampled, y_train_resampled)

# Step 6: Get Best Parameters and Evaluate Model
print("Best Parameters from Grid Search:", grid.best_params_)

# Cross-validation scores
cv_results = grid.cv_results_
train_scores = cv_results['mean_train_score']
test_scores = cv_results['mean_test_score']

print("\nCross-Validation Results:")
print("Average Training Score:", np.mean(train_scores))
print("Average Validation Score:", np.mean(test_scores))

# Check for Overfitting
if np.mean(train_scores) > np.mean(test_scores) + 0.05:  # Allowing a small margin
    print("\nWarning: The model might be overfitting!")
else:
    print("\nThe model does not appear to be overfitting.")

y_pred_grid = grid.predict(X_test_pca)

print("\nSVM Results after Hyperparameter Tuning:")
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid, zero_division=0))

# Step 7: Evaluate Confusion Matrix and Additional Metrics
cm = confusion_matrix(y_test, y_pred_grid)
print("\nConfusion Matrix for SVM:\n", cm)

mcc = matthews_corrcoef(y_test, y_pred_grid)
y_proba_svm = grid.predict_proba(X_test_pca)
roc_auc = roc_auc_score(y_test, y_proba_svm, multi_class='ovr')

print("\nSVM MCC after Grid Search:", mcc)
print("SVM AUC after Grid Search:", roc_auc)


Reduced X_train shape after PCA: (756, 153)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...............C=0.01, gamma=0.0001, kernel=sigmoid; total time=   1.5s
[CV] END ...............C=0.01, gamma=0.0001, kernel=sigmoid; total time=   1.7s
[CV] END ...............C=0.01, gamma=0.0001, kernel=sigmoid; total time=   1.2s
[CV] END ...............C=0.01, gamma=0.0001, kernel=sigmoid; total time=   1.1s
[CV] END ...............C=0.01, gamma=0.0001, kernel=sigmoid; total time=   1.2s
[CV] END ................C=0.01, gamma=1e-05, kernel=sigmoid; total time=   1.2s
[CV] END ................C=0.01, gamma=1e-05, kernel=sigmoid; total time=   1.2s
[CV] END ................C=0.01, gamma=1e-05, kernel=sigmoid; total time=   1.2s
[CV] END ................C=0.01, gamma=1e-05, kernel=sigmoid; total time=   1.1s
[CV] END ................C=0.01, gamma=1e-05, kernel=sigmoid; total time=   1.2s
[CV] END ................C=0.01, gamma=1e-06, kernel=sigmoid; total time=   1.2s
[CV]

In [23]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
    roc_auc_score,
)
from sklearn.model_selection import ParameterGrid
from imblearn.over_sampling import ADASYN
import numpy as np
import pandas as pd

# Step 1: Check for non-numeric columns in X_train and X_test
X_train = pd.DataFrame(X_train)  # Ensure it's a DataFrame
X_test = pd.DataFrame(X_test)

# Drop non-numeric columns (if any)
X_train_numeric = X_train.select_dtypes(include=["number"])
X_test_numeric = X_test.select_dtypes(include=["number"])

# Step 2: Handle class imbalance using ADASYN
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_numeric, y_train)

# Step 3: Dimensionality Reduction with PCA (retain 95% variance)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test_numeric)

print("Reduced X_train shape after PCA:", X_train_pca.shape)

# Custom utility kernel
def create_utility_kernel(X, Y, p=1):
    return 7 * (np.dot(X, Y.T)) ** p  # Example: dot product kernel, raised to the power of p

# Step 5: Expanded Grid Search for Hyperparameter Tuning
param_grid = {
    "C": [0.0001, 0.001, 0.01],  # Regularization parameter
    "p": [1, 2, 3],              # Different exponents for the kernel
}

best_score = -np.inf
best_params = None
best_model = None

# Step 6: Perform grid search manually for the custom kernel
for params in ParameterGrid(param_grid):
    kernel_func = lambda X, Y: create_utility_kernel(X, Y, params["p"])
    train_kernel_matrix = kernel_func(X_train_pca, X_train_pca)
    test_kernel_matrix = kernel_func(X_test_pca, X_train_pca)

    svc = SVC(kernel="precomputed", probability=True, random_state=42, C=params["C"])
    svc.fit(train_kernel_matrix, y_train_resampled)

    y_pred = svc.predict(test_kernel_matrix)
    score = accuracy_score(y_test, y_pred)

    if score > best_score:
        best_score = score
        best_params = params
        best_model = svc

# Print the best parameters


# Step 7: Evaluate the best model
y_pred_best = best_model.predict(kernel_func(X_test_pca, X_train_pca))
print("\nSVM Results with Best Parameters:")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best, zero_division=0))

# Confusion Matrix and Additional Metrics
cm = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix for Best SVM:\n", cm)

mcc = matthews_corrcoef(y_test, y_pred_best)
y_proba_best = best_model.predict_proba(kernel_func(X_test_pca, X_train_pca))
roc_auc = roc_auc_score(y_test, y_proba_best, multi_class="ovr")

print("\nSVM MCC with Best Parameters:", mcc)
print("SVM AUC with Best Parameters:", roc_auc)




Preprocessing Complete. Shapes:
X_train_numeric shape: (756, 16384)
X_test_numeric shape: (189, 16384)
Original X_train shape: (756, 16384)
Reduced X_train shape after PCA: (756, 153)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ........................C=0.001, kernel=precomputed; total time=   0.0s
[CV] END ........................C=0.001, kernel=precomputed; total time=   0.0s
[CV] END ........................C=0.001, kernel=precomputed; total time=   0.0s
[CV] END ........................C=0.001, kernel=precomputed; total time=   0.0s
[CV] END ........................C=0.001, kernel=precomputed; total time=   0.0s
[CV] END .........................C=0.01, kernel=precomputed; total time=   0.0s
[CV] END .........................C=0.01, kernel=precomputed; total time=   0.0s
[CV] END .........................C=0.01, kernel=precomputed; total time=   0.0s
[CV] END .........................C=0.01, kernel=precomputed; total time=   0.0s
[CV] END ..................

In [21]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Custom utility kernel function
def utilitykernel(x1, x2, a, p=1):
    x1 = x1.flatten()
    x2 = x2.flatten()
    dot_product = np.dot(x1, x2)
    result = np.log(np.power(np.cosh(a * dot_product), p))
    max_value = 700
    return np.clip(result, -max_value, max_value)

# Custom Gram matrix computation
def gaussianKernelGramMatrix(X1, X2, a, K_function=utilitykernel):
    gram_matrix = np.zeros((X1.shape[0], X2.shape[0]))
    for i, x1 in enumerate(X1):
        for j, x2 in enumerate(X2):
            gram_matrix[i, j] = K_function(x1, x2, a)
    return gram_matrix

# Assuming X_train, X_test, y_train, and y_test are defined as DataFrames
# Preprocessing: Select only numeric columns
X_train_numeric = X_train.select_dtypes(include=[np.number])
X_test_numeric = X_test.select_dtypes(include=[np.number])

print("Preprocessing Complete. Shapes:")
print("X_train_numeric shape:", X_train_numeric.shape)
print("X_test_numeric shape:", X_test_numeric.shape)

# Step 1: Apply PCA
pca = PCA(n_components=0.95)  # Adjust number of components as needed
X_train_pca = pca.fit_transform(X_train_numeric)
X_test_pca = pca.transform(X_test_numeric)

print("Original X_train shape:", X_train_numeric.shape)
print("Reduced X_train shape after PCA:", X_train_pca.shape)

# Step 2: Generate Gram matrices using the custom kernel
a = 0.0001
p = 2

# Compute the Gram matrix for training data
X_train_gram = gaussianKernelGramMatrix(X_train_pca, X_train_pca, a)
# print("\nGram Matrix for Training Data:")
# print(X_train_gram)

# Compute the Gram matrix for test data
X_test_gram = gaussianKernelGramMatrix(X_test_pca, X_train_pca, a)
# print("\nGram Matrix for Test Data:")
# print(X_test_gram)

# Step 3: Define parameter grid for GridSearchCV
param_grid = {
    'C': [0.001, 0.01, 0.0001, 1, 5, 10],
    'kernel': ['precomputed']
}

# Step 4: Initialize and fit GridSearchCV with SVM
grid = GridSearchCV(
    SVC(probability=True, class_weight='balanced', random_state=42),
    param_grid,
    refit=True,
    verbose=2,
    return_train_score=True,
    cv=5
)
grid.fit(X_train_gram, y_train)

# Step 5: Get best parameters from Grid Search
print("Best Parameters from Grid Search:", grid.best_params_)

# Step 6: Evaluate cross-validation results for overfitting
train_scores = grid.cv_results_['mean_train_score']
val_scores = grid.cv_results_['mean_test_score']
score_diff = train_scores - val_scores

# print("\nCross-Validation Results:")
# for i, C in enumerate(param_grid['C']):
#     print(f"C={C}: Mean Training Score: {train_scores[i]:.4f}, Mean Validation Score: {val_scores[i]:.4f}, Difference: {score_diff[i]:.4f}")

# # Check for overfitting
# if any(diff > 0.1 for diff in score_diff):
#     print("\nPotential overfitting detected. Significant gap between training and validation scores.")
# else:
#     print("\nNo significant overfitting detected. Training and validation scores are similar.")

# Step 7: Make predictions on the test set
y_pred_grid = grid.predict(X_test_gram)

# Step 8: Evaluate model performance
print("\nSVM Results after Hyperparameter Tuning:")
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid, zero_division=0))

# Step 9: Additional metrics (Confusion Matrix, Sensitivity, Specificity, MCC, AUC)
cm = confusion_matrix(y_test, y_pred_grid)
FP = cm.sum(axis=0) - np.diag(cm)
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

print("\nConfusion Matrix for SVM:\n", cm)
print("\nSensitivity (TPR) for SVM:", TPR)
print("Specificity (TNR) for SVM:", TNR)

mcc = matthews_corrcoef(y_test, y_pred_grid)
y_proba_svm = grid.predict_proba(X_test_gram)
roc_auc = roc_auc_score(y_test, y_proba_svm, multi_class='ovr')

print("\nSVM MCC after Grid Search:", mcc)
print("SVM AUC after Grid Search:", roc_auc)




Reduced X_train shape after PCA: (1667, 141)

SVM Results with Best Parameters:
Accuracy: 0.5767195767195767
              precision    recall  f1-score   support

  BRCA_Basal       0.69      0.89      0.77        27
   BRCA_Her2       0.38      0.19      0.25        16
   BRCA_LumA       0.71      0.61      0.66        99
   BRCA_LumB       0.35      0.47      0.40        47

    accuracy                           0.58       189
   macro avg       0.53      0.54      0.52       189
weighted avg       0.59      0.58      0.58       189


Confusion Matrix for Best SVM:
 [[24  1  0  2]
 [ 4  3  5  4]
 [ 4  1 60 34]
 [ 3  3 19 22]]

SVM MCC with Best Parameters: 0.35882484792510133
SVM AUC with Best Parameters: 0.7725731078142196
