In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive/')

data = pd.read_csv('/content/drive/MyDrive/data/image_features_cleaned.csv')

# Prepare features and labels

y = data['Label'].map({'Malignant': 1, 'Benign': 0})
X = data.drop('Label', axis=1)
columns_to_drop = ['diagnostics_Image-original_Dimensionality', 'diagnostics_Image-original_Spacing',
                   'diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy',
                   'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet',
                   'diagnostics_Versions_Python', 'diagnostics_Configuration_Settings',
                   'diagnostics_Configuration_EnabledImageTypes', 'diagnostics_Image-original_Minimum','diagnostics_Image-original_Maximum']

# Drop the specified columns from X
X = X.drop(columns=columns_to_drop)
# Iterate over columns in X
for column in X.columns:
    # Check if any value in the column is of string type
    if X[column].apply(lambda x: isinstance(x, str)).any():
        # Drop the column if it contains strings
        X.drop(column, axis=1, inplace=True)
X_normalized = (X-X.mean()) / X.std()
X_normalized


from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the minority class (malignant data)
X_resampled, y_resampled = smote.fit_resample(X_normalized, y)
X_resampled



Mounted at /content/drive/


Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,0.500921,-0.808355,-0.182702,-0.388981,-0.380692,-0.681164,0.136458,-0.502899,0.132630,0.215393,...,0.924904,0.035137,-1.512136,1.677121,-0.118505,-0.719993,1.538854,0.149736,-0.196354,1.878063
1,-0.116501,4.836150,-0.182702,-0.514145,0.572523,5.136778,1.019384,0.795263,-0.653384,0.215393,...,-0.106721,-1.236449,2.100885,-0.871979,-0.040241,2.669437,-0.769375,-0.498899,-0.053020,-0.757636
2,-0.158499,-0.564573,-0.182702,0.706205,1.267575,-0.066380,0.478570,-0.159268,-0.098052,0.215393,...,1.109031,-0.564680,-0.070079,0.026514,-0.114686,-0.647151,-0.376859,1.061416,0.685695,-0.463820
3,-1.434687,0.245671,-0.182702,-0.420272,-1.770796,-0.404614,-1.213551,-1.457430,2.656374,0.215393,...,-0.328156,1.147078,0.202359,-0.159405,-0.062515,0.765652,-0.700292,0.104573,-0.609771,-0.545762
4,0.986965,0.349953,-0.182702,-0.576727,-0.797723,-0.175291,0.044177,-0.350174,0.234660,0.215393,...,-0.299018,-0.204331,0.726731,-0.697603,-0.056757,0.311976,-0.624010,-0.443317,-0.528320,-0.536116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,-0.931023,-0.187543,-0.182702,0.222011,0.521042,-0.005503,0.796790,0.352288,-0.573583,0.215393,...,1.075707,-0.205682,-0.017043,-0.242828,-0.109036,-0.411742,-0.432748,-0.192103,-0.037761,-0.560101
696,0.168764,-0.388477,-0.182702,0.564622,0.820919,-0.101541,0.982962,0.834074,-0.653584,0.215393,...,0.149582,-0.621950,0.608047,-0.050140,-0.115965,-0.449846,-0.383751,0.115942,0.344314,-0.570339
697,0.802988,0.402062,-0.182702,0.468042,0.029502,0.321706,0.239548,-0.363320,-0.253671,0.215393,...,-0.939531,-0.885817,1.215903,-0.480149,-0.090992,0.090578,-0.662428,-0.342206,-0.431628,-0.654694
698,-0.576190,1.121745,-0.182702,-1.139966,-0.695845,-0.036353,-0.807597,0.078223,0.132334,0.215393,...,-0.666684,0.477322,0.749314,-1.012763,0.327903,1.624875,-0.658129,-0.873127,-0.551441,-0.513318


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

import matplotlib.pyplot as plt

# apply SelectKBest
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Perform feature selection
k = 50  # Choose the number of top features you want to select
selector = SelectKBest(score_func=mutual_info_classif, k=k) # try different score_func
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

#Define an SVM classifier
svm_classifier = SVC()

# Define hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
}

# Perform grid search with cross-validation
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=folds)
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the best model
best_svm_classifier = grid_search.best_estimator_
accuracy = best_svm_classifier.score(X_test_selected, y_test)
print("Accuracy:", accuracy)

# Get the prediction results
y_pred = best_svm_classifier.predict(X_test_selected)

# Calculate precision, recall, and f1 score
print("Classification Report:")
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix

# Calculate specificity and sensitivity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

# Print specificity and sensitivity
print("Specificity for cleaned data:", specificity)
print("Sensitivity for cleaned data:", sensitivity)

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Best Hyperparameters: {'C': 100, 'kernel': 'rbf'}
Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.92      0.84        61
           1       0.93      0.80      0.86        79

    accuracy                           0.85       140
   macro avg       0.85      0.86      0.85       140
weighted avg       0.86      0.85      0.85       140

Specificity for cleaned data: 0.9180327868852459
Sensitivity for cleaned data: 0.7974683544303798
Confusion Matrix:
[[56  5]
 [16 63]]
