In [None]:
from MLScript import *

In [None]:
from sklearn.model_selection import KFold
import pandas as pd
# Cross-validation layout shared by every model-evaluation function below.
# NOTE(review): n_samples is hard-coded; it presumably has to equal the number
# of rows of the matrices loaded later — confirm against the data files.
n_samples = 1980
n_splits = 10

# Shuffled 10-fold splitter with a fixed seed so the folds are reproducible
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One record per fold: 1-based fold number plus the train/test row positions
fold_indices = [
    {'fold': fold_number, 'train_index': train_index, 'test_index': test_index}
    for fold_number, (train_index, test_index)
    in enumerate(kf.split(range(n_samples)), start=1)
]
fold_df = pd.DataFrame(fold_indices)


In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_curve, roc_auc_score

def SVM(X, Y, fold_df, save_results=True):
    """Cross-validate an RBF-kernel SVM over the precomputed folds in ``fold_df``.

    F1, precision and recall are macro-averaged so that the numbers are
    directly comparable with ``RF``, ``KNN``, ``GaussianNB_`` and
    ``SKLearnBoosting``, which all use ``average='macro'``.

    Parameters
    ----------
    X, Y : objects indexable by an array of row positions (``X[train_index]``).
        Presumably NumPy arrays of features and labels — TODO confirm.
    fold_df : pandas.DataFrame
        One row per fold with ``'train_index'`` and ``'test_index'`` columns.
    save_results : bool, default True
        When true, the summary is written to ``svm_performance_results.txt``
        (opened in ``"w"`` mode, so an existing file is overwritten).

    Returns
    -------
    results : dict
        ``"mean ± std"`` strings for Accuracy, F1, Precision and Recall plus
        the summed per-fold time and memory deltas.
    auc_score : float
        ROC AUC computed on the predictions pooled over all folds.
    fpr, tpr : arrays
        ROC curve points computed on the same pooled predictions.
    """
    # NOTE(review): `time` and `psutil` are not imported in the visible cells;
    # presumably they arrive through `from MLScript import *` — confirm.
    C = 0.88
    kernel = 'rbf'
    gamma = 0.005

    svm_classifier = SVC(C=C, kernel=kernel, gamma=gamma, probability=True, random_state=42)

    # Per-fold metric values
    accuracy_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []
    # Predictions pooled over all folds for a single ROC curve
    all_predicted_probabilities = []
    all_true_labels = []
    time_per_fold = []
    memory_per_fold = []

    for _, row in fold_df.iterrows():
        train_index = row['train_index']
        test_index = row['test_index']

        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # Wall-clock time and resident memory before fitting
        start_time = time.time()
        start_memory = psutil.Process().memory_info().rss / (1024 ** 2)  # Memory in MB

        svm_classifier.fit(X_train, Y_train)

        Y_pred = svm_classifier.predict(X_test)
        # Column 1 of predict_proba is used as the ROC score
        Y_pred_prob = svm_classifier.predict_proba(X_test)[:, 1]

        end_time = time.time()
        end_memory = psutil.Process().memory_info().rss / (1024 ** 2)  # Memory in MB

        time_per_fold.append(end_time - start_time)
        memory_per_fold.append(end_memory - start_memory)

        # Macro averaging keeps these metrics consistent with the other models
        accuracy_scores.append(accuracy_score(Y_test, Y_pred))
        f1_scores.append(f1_score(Y_test, Y_pred, average='macro'))
        precision_scores.append(precision_score(Y_test, Y_pred, average='macro'))
        recall_scores.append(recall_score(Y_test, Y_pred, average='macro'))
        all_predicted_probabilities.extend(Y_pred_prob)
        all_true_labels.extend(Y_test)

    # Mean and standard deviation across folds, formatted for reporting
    results = {
        'Accuracy': f"{np.mean(accuracy_scores):.2f} ± {np.std(accuracy_scores):.2f}",
        'F1': f"{np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}",
        'Precision': f"{np.mean(precision_scores):.2f} ± {np.std(precision_scores):.2f}",
        'Recall': f"{np.mean(recall_scores):.2f} ± {np.std(recall_scores):.2f}",
        'Total Time (s)': f"{np.sum(time_per_fold):.2f}",
        'Total Memory (MB)': f"{np.sum(memory_per_fold):.2f}"
    }

    # ROC curve and AUC on the pooled out-of-fold probabilities
    fpr, tpr, _ = roc_curve(all_true_labels, all_predicted_probabilities)
    auc_score = roc_auc_score(all_true_labels, all_predicted_probabilities)

    print("Results of SVM:")
    for metric, value in results.items():
        print(f"{metric}: {value}")

    if save_results:
        with open("svm_performance_results.txt", "w") as results_file:
            for metric, value in results.items():
                results_file.write(f"{metric}: {value}\n")

    return results, auc_score, fpr, tpr



def RF(X, Y, fold_df, save_results=True):
    """Evaluate a 100-tree random forest on the precomputed cross-validation folds.

    For every row of ``fold_df`` the forest is refit on ``X[train_index]`` and
    scored on ``X[test_index]`` (accuracy plus macro-averaged F1, precision and
    recall). Column 1 of ``predict_proba`` is pooled over all folds to build a
    single ROC curve.

    Returns ``(metrics_results, auc_score, fpr, tpr)`` where ``metrics_results``
    maps each metric name to a ``"mean ± std"`` string and also carries the
    summed per-fold time and memory deltas. When ``save_results`` is true the
    same report is written to ``rf_performance_results.txt`` (overwriting it).
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    # Per-fold scores keyed by the label used in the report
    per_fold_scores = {'Accuracy': [], 'F1': [], 'Precision': [], 'Recall': []}
    macro_scorers = (('F1', f1_score), ('Precision', precision_score), ('Recall', recall_score))

    pooled_probabilities, pooled_labels = [], []
    fold_seconds, fold_memory_mb = [], []

    for train_rows, test_rows in zip(fold_df['train_index'], fold_df['test_index']):
        X_fit, X_eval = X[train_rows], X[test_rows]
        Y_fit, Y_eval = Y[train_rows], Y[test_rows]

        # Snapshot time and resident memory (MB) around fit + predict
        t_begin = time.time()
        rss_begin = psutil.Process().memory_info().rss / (1024 ** 2)

        forest.fit(X_fit, Y_fit)
        predicted = forest.predict(X_eval)
        score_column = forest.predict_proba(X_eval)[:, 1]

        t_end = time.time()
        rss_end = psutil.Process().memory_info().rss / (1024 ** 2)

        fold_seconds.append(t_end - t_begin)
        fold_memory_mb.append(rss_end - rss_begin)

        per_fold_scores['Accuracy'].append(accuracy_score(Y_eval, predicted))
        for label, scorer in macro_scorers:
            per_fold_scores[label].append(scorer(Y_eval, predicted, average='macro'))

        pooled_probabilities.extend(score_column)
        pooled_labels.extend(Y_eval)

    # Summarise each metric as "mean ± std" over the folds
    metrics_results = {}
    for label, scores in per_fold_scores.items():
        metrics_results[label] = f"{np.mean(scores):.2f} ± {np.std(scores):.2f}"
    metrics_results['Total Time (s)'] = f"{np.sum(fold_seconds):.2f} "
    metrics_results['Total Memory (MB)'] = f"{np.sum(fold_memory_mb):.2f} "

    fpr, tpr, _ = roc_curve(pooled_labels, pooled_probabilities)
    auc_score = roc_auc_score(pooled_labels, pooled_probabilities)

    report_lines = [f"{metric}: {value}" for metric, value in metrics_results.items()]

    print("Results of Random Forest Classifier:")
    for line in report_lines:
        print(line)

    if save_results:
        with open("rf_performance_results.txt", "w") as results_file:
            results_file.writelines(f"{line}\n" for line in report_lines)

    return metrics_results, auc_score, fpr, tpr

def RF_explainer(X, Y, column_dict, csv_filename):
    """Rank features by random-forest importance and export the ranking as CSV.

    A 100-tree forest is fit on all of ``X``/``Y``; its ``feature_importances_``
    are sorted in descending order and written to ``csv_filename`` under the
    header ``Feature Name, Importance Value``.

    Returns ``(sorted_feature_names, sorted_importance_values)`` in the same
    descending order.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, Y)

    importances = forest.feature_importances_
    # argsort is ascending; reverse it so the most important feature comes first
    ranking = np.argsort(importances)[::-1]

    # NOTE(review): names are looked up at position + 1, i.e. column_dict appears
    # to be keyed with an offset of one relative to X's columns — confirm.
    sorted_feature_names = []
    for position in ranking:
        sorted_feature_names.append(column_dict[position + 1])
    sorted_importance_values = importances[ranking]

    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Feature Name', 'Importance Value'])
        writer.writerows(zip(sorted_feature_names, sorted_importance_values))

    return sorted_feature_names, sorted_importance_values

def KNN(X, Y, fold_df, save_results=True):
    """Evaluate a 5-nearest-neighbours classifier on the precomputed folds.

    Each fold listed in ``fold_df`` (columns ``'train_index'`` and
    ``'test_index'``) is fit and scored with accuracy and macro-averaged F1,
    precision and recall. Column 1 of ``predict_proba`` is pooled across folds
    for one ROC curve.

    Returns ``(metrics_results, auc_score, fpr, tpr)``. With ``save_results``
    true the report is also written to ``knn_performance_results.txt``
    (overwriting it).
    """
    neighbours_model = KNeighborsClassifier(n_neighbors=5)

    fold_scores = {'Accuracy': [], 'F1': [], 'Precision': [], 'Recall': []}
    roc_scores = []
    roc_labels = []
    seconds_used = []
    memory_delta_mb = []

    for fit_rows, eval_rows in zip(fold_df['train_index'], fold_df['test_index']):
        X_fit, Y_fit = X[fit_rows], Y[fit_rows]
        X_eval, Y_eval = X[eval_rows], Y[eval_rows]

        # Time and resident memory (MB) measured around fit + predict
        clock_start = time.time()
        memory_start = psutil.Process().memory_info().rss / (1024 ** 2)

        neighbours_model.fit(X_fit, Y_fit)
        Y_hat = neighbours_model.predict(X_eval)
        Y_hat_score = neighbours_model.predict_proba(X_eval)[:, 1]

        clock_stop = time.time()
        memory_stop = psutil.Process().memory_info().rss / (1024 ** 2)

        seconds_used.append(clock_stop - clock_start)
        memory_delta_mb.append(memory_stop - memory_start)

        fold_scores['Accuracy'].append(accuracy_score(Y_eval, Y_hat))
        fold_scores['F1'].append(f1_score(Y_eval, Y_hat, average='macro'))
        fold_scores['Precision'].append(precision_score(Y_eval, Y_hat, average='macro'))
        fold_scores['Recall'].append(recall_score(Y_eval, Y_hat, average='macro'))

        roc_scores.extend(Y_hat_score)
        roc_labels.extend(Y_eval)

    # "mean ± std" per metric, followed by the resource totals
    metrics_results = {
        name: f"{np.mean(values):.2f} ± {np.std(values):.2f}"
        for name, values in fold_scores.items()
    }
    metrics_results['Total Time (s)'] = f"{np.sum(seconds_used):.2f} "
    metrics_results['Total Memory (MB)'] = f"{np.sum(memory_delta_mb):.2f} "

    fpr, tpr, _ = roc_curve(roc_labels, roc_scores)
    auc_score = roc_auc_score(roc_labels, roc_scores)

    report_lines = [f"{name}: {text}" for name, text in metrics_results.items()]

    print("Results of K-Nearest Neighbors Classifier:")
    for line in report_lines:
        print(line)

    if save_results:
        with open("knn_performance_results.txt", "w") as results_file:
            for line in report_lines:
                results_file.write(line + "\n")

    return metrics_results, auc_score, fpr, tpr

def GaussianNB_(X, Y, fold_df, save_results=True):
    """Evaluate a Gaussian naive Bayes classifier on the precomputed folds.

    The classifier is refit for every row of ``fold_df`` and scored on the
    matching test rows (accuracy, macro F1, macro precision, macro recall).
    Column 1 of ``predict_proba`` is accumulated over folds and used for the
    ROC curve and AUC.

    Returns ``(metrics_results, auc_score, fpr, tpr)``; when ``save_results``
    is true the report is additionally written to
    ``gaussian_nb_performance_results.txt`` (overwriting it).
    """
    nb_model = GaussianNB()

    scores_by_metric = {'Accuracy': [], 'F1': [], 'Precision': [], 'Recall': []}
    pooled_scores, pooled_truth = [], []
    elapsed_list, memory_list = [], []

    fold_pairs = zip(fold_df['train_index'], fold_df['test_index'])
    for train_rows, test_rows in fold_pairs:
        X_train, X_test = X[train_rows], X[test_rows]
        Y_train, Y_test = Y[train_rows], Y[test_rows]

        # Resource snapshot before training (memory in MB)
        started_at = time.time()
        memory_before = psutil.Process().memory_info().rss / (1024 ** 2)

        nb_model.fit(X_train, Y_train)
        predictions = nb_model.predict(X_test)
        probabilities = nb_model.predict_proba(X_test)[:, 1]

        # Resource snapshot after prediction
        finished_at = time.time()
        memory_after = psutil.Process().memory_info().rss / (1024 ** 2)

        elapsed_list.append(finished_at - started_at)
        memory_list.append(memory_after - memory_before)

        scores_by_metric['Accuracy'].append(accuracy_score(Y_test, predictions))
        scores_by_metric['F1'].append(f1_score(Y_test, predictions, average='macro'))
        scores_by_metric['Precision'].append(precision_score(Y_test, predictions, average='macro'))
        scores_by_metric['Recall'].append(recall_score(Y_test, predictions, average='macro'))

        pooled_scores.extend(probabilities)
        pooled_truth.extend(Y_test)

    # Fold-level scores collapsed into "mean ± std" strings
    metrics_results = {}
    for metric_name, fold_values in scores_by_metric.items():
        average_value = np.mean(fold_values)
        spread = np.std(fold_values)
        metrics_results[metric_name] = f"{average_value:.2f} ± {spread:.2f}"
    metrics_results['Total Time (s)'] = f"{np.sum(elapsed_list):.2f} "
    metrics_results['Total Memory (MB)'] = f"{np.sum(memory_list):.2f} "

    fpr, tpr, _ = roc_curve(pooled_truth, pooled_scores)
    auc_score = roc_auc_score(pooled_truth, pooled_scores)

    print("Results of Gaussian Naive Bayes Classifier:")
    for metric_name, text in metrics_results.items():
        print(f"{metric_name}: {text}")

    if save_results:
        with open("gaussian_nb_performance_results.txt", "w") as results_file:
            results_file.writelines(
                f"{metric_name}: {text}\n" for metric_name, text in metrics_results.items()
            )

    return metrics_results, auc_score, fpr, tpr



from sklearn.ensemble import HistGradientBoostingClassifier


def SKLearnBoosting(X, Y, fold_df, save_results=True):
    """Evaluate scikit-learn's ``HistGradientBoostingClassifier`` on the folds.

    For each fold in ``fold_df`` the classifier (``random_state=42``) is fit on
    the training rows and scored on the test rows with accuracy and
    macro-averaged F1, precision and recall. Column 1 of ``predict_proba`` is
    pooled over the folds to compute one ROC curve and its AUC.

    Returns ``(metrics_results, auc_score, fpr, tpr)``; the report is written
    to ``sk_boosting_performance_results.txt`` (overwriting it) when
    ``save_results`` is true.
    """
    booster = HistGradientBoostingClassifier(random_state=42)

    metric_history = {'Accuracy': [], 'F1': [], 'Precision': [], 'Recall': []}
    averaged_metrics = {'F1': f1_score, 'Precision': precision_score, 'Recall': recall_score}

    stacked_probabilities = []
    stacked_labels = []
    durations = []
    memory_changes = []

    for train_rows, test_rows in zip(fold_df['train_index'], fold_df['test_index']):
        X_train = X[train_rows]
        X_test = X[test_rows]
        Y_train = Y[train_rows]
        Y_test = Y[test_rows]

        # Time and resident memory (MB) before training
        tick = time.time()
        rss_mb_before = psutil.Process().memory_info().rss / (1024 ** 2)

        booster.fit(X_train, Y_train)
        Y_pred = booster.predict(X_test)
        Y_score = booster.predict_proba(X_test)[:, 1]

        # Time and resident memory (MB) after prediction
        tock = time.time()
        rss_mb_after = psutil.Process().memory_info().rss / (1024 ** 2)

        durations.append(tock - tick)
        memory_changes.append(rss_mb_after - rss_mb_before)

        metric_history['Accuracy'].append(accuracy_score(Y_test, Y_pred))
        for metric_name, metric_fn in averaged_metrics.items():
            metric_history[metric_name].append(metric_fn(Y_test, Y_pred, average='macro'))

        stacked_probabilities.extend(Y_score)
        stacked_labels.extend(Y_test)

    # Collapse the per-fold values into "mean ± std" strings
    metrics_results = {
        metric_name: f"{np.mean(history):.2f} ± {np.std(history):.2f}"
        for metric_name, history in metric_history.items()
    }
    metrics_results['Total Time (s)'] = f"{np.sum(durations):.2f} "
    metrics_results['Total Memory (MB)'] = f"{np.sum(memory_changes):.2f} "

    fpr, tpr, _ = roc_curve(stacked_labels, stacked_probabilities)
    auc_score = roc_auc_score(stacked_labels, stacked_probabilities)

    summary_lines = [f"{metric_name}: {text}" for metric_name, text in metrics_results.items()]

    print("Results of HistGradientBoosting Classifier:")
    for line in summary_lines:
        print(line)

    if save_results:
        with open("sk_boosting_performance_results.txt", "w") as results_file:
            for line in summary_lines:
                results_file.write(f"{line}\n")

    return metrics_results, auc_score, fpr, tpr

def plot_roc_curves(models_fpr_tpr_auc, title):
    """Draw one ROC curve per model on a single 8x6 figure and show it.

    ``models_fpr_tpr_auc`` maps a display name to a ``(fpr, tpr, auc_score)``
    triple; curves are drawn in the mapping's iteration order and labelled
    with the name and the AUC rounded to two decimals. ``title`` becomes the
    figure title.
    """
    plt.figure(figsize=(8, 6))

    for model_name in models_fpr_tpr_auc:
        fpr, tpr, auc_score = models_fpr_tpr_auc[model_name]
        curve_label = f'{model_name} (AUC = {auc_score:.2f})'
        plt.plot(fpr, tpr, label=curve_label)

    # Diagonal reference line for a random classifier
    plt.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Guessing")

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()



### Core genome

In [None]:
# Load the matrix stored in 'core_genome.csv'. process_genome_matrix is not
# defined in the visible cells (presumably provided by `from MLScript import *`
# — TODO confirm); its four return values are unpacked here.
X,Y, column_dict, genome_id = process_genome_matrix('core_genome.csv')

In [None]:
# Evaluate all five classifiers on the same folds (fold_df).
# SVM is called with save_results=False; the other four save their reports
# (RF and KNN through the default save_results=True). Each report goes to a
# fixed filename opened in "w" mode, so later sections overwrite these files.
# Note: the "XGBoost" names hold the output of SKLearnBoosting, which wraps
# scikit-learn's HistGradientBoostingClassifier.
results_svm_c, auc_score_svm_c, fpr_svm_c, tpr_svm_c = SVM(X, Y, fold_df, save_results=False)
results_rf_c, auc_score_rf_c, fpr_rf_c, tpr_rf_c = RF(X, Y, fold_df)
results_knn_c, auc_score_knn_c, fpr_knn_c, tpr_knn_c = KNN(X, Y, fold_df)
results_XGBoost_c, auc_score4_XGBoost_c, fpr4_XGBoost_c, tpr4_XGBoost_c  = SKLearnBoosting(X, Y, fold_df, save_results=True)
results_NB_c, auc_score4_NB_c, fpr_NB_c, tpr_NB_c = GaussianNB_(X, Y, fold_df, save_results=True)


In [None]:
# (display name, fpr, tpr, auc) for every model evaluated on the core genome,
# listed in the order in which plot_roc_curves draws the curves.
roc_entries = [
    ('Support Vector Machine', fpr_svm_c, tpr_svm_c, auc_score_svm_c),
    ('K-Nearest Neighbors', fpr_knn_c, tpr_knn_c, auc_score_knn_c),
    ('Random Forest', fpr_rf_c, tpr_rf_c, auc_score_rf_c),
    ('Gradient Boosting Trees', fpr4_XGBoost_c, tpr4_XGBoost_c, auc_score4_XGBoost_c),
    ('Naive Bayes', fpr_NB_c, tpr_NB_c, auc_score4_NB_c),
]
models_fpr_tpr_auc = {
    model_name: (fpr, tpr, auc_score)
    for model_name, fpr, tpr, auc_score in roc_entries
}

plot_roc_curves(models_fpr_tpr_auc, 'Model performance based on core pangenome')

In [None]:
# Grouped bar chart of the cross-validated metrics for the core-genome runs.

# Models in plotting order, paired with their summary dictionaries
models = ['Support Vector Machine', 'K-Nearest Neighbors', 'Random Forest', 'Gradient Boosting Trees', 'Naive Bayes']
results_ordered = [results_svm_c, results_knn_c, results_rf_c, results_XGBoost_c, results_NB_c]

# Metrics to extract
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']

# Each summary value is a "mean ± std" string; split it back into two floats
means = []
stds = []

for model_results in results_ordered:
    model_means = []
    model_stds = []
    for metric in metrics:
        mean, std = map(float, model_results[metric].split(' ± '))
        model_means.append(mean)
        model_stds.append(std)
    means.append(model_means)
    stds.append(model_stds)

# Rows = models, columns = metrics
means = np.array(means)
stds = np.array(stds)

# One colour per model, in the same model order as the ROC figure
colors = ['blue', 'orange', 'green', 'red', 'purple']

# Bar graph parameters
x = np.arange(len(metrics))  # Metric indices
width = 0.15  # Bar width

# One bar group per metric, one bar per model, with std as the error bar
fig, ax = plt.subplots(figsize=(10, 6))
for i, (model, model_means, model_stds, color) in enumerate(zip(models, means, stds, colors)):
    ax.bar(
        x + i * width,
        model_means,
        width,
        label=f"{model}",
        color=color,
        yerr=model_stds,
        capsize=5
    )

# Labels and title; this figure shows summary metrics, not ROC curves
ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Classification metrics for models based on core pangenome')
ax.set_xticks(x + width * (len(models) - 1) / 2)
ax.set_xticklabels(metrics)

# Draw the legend for the bar labels, outside the axes so it hides no bars
ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Set y-axis limits
ax.set_ylim(0.0, 1.0)

# Show the plot
plt.tight_layout()
plt.show()

### Soft core pangenome


In [None]:
# Load the matrix stored in 'core_soft_genome.csv'. This rebinds X, Y,
# column_dict and genome_id, replacing the data loaded in the previous section.
X,Y, column_dict, genome_id = process_genome_matrix('core_soft_genome.csv')

In [None]:
# Re-run the five classifiers on the newly loaded matrix.
# The same *_c names as in the core-genome section are rebound here, so the
# earlier results are no longer reachable after this cell runs; the report
# files written by RF, KNN, SKLearnBoosting and GaussianNB_ are overwritten too.
results_svm_c, auc_score_svm_c, fpr_svm_c, tpr_svm_c = SVM(X, Y, fold_df, save_results=False)
results_rf_c, auc_score_rf_c, fpr_rf_c, tpr_rf_c = RF(X, Y, fold_df)
results_knn_c, auc_score_knn_c, fpr_knn_c, tpr_knn_c = KNN(X, Y, fold_df)
results_XGBoost_c, auc_score4_XGBoost_c, fpr4_XGBoost_c, tpr4_XGBoost_c  = SKLearnBoosting(X, Y, fold_df, save_results=True)
results_NB_c, auc_score4_NB_c, fpr_NB_c, tpr_NB_c = GaussianNB_(X, Y, fold_df, save_results=True)


In [None]:
# (display name, fpr, tpr, auc) for every model evaluated on the soft-core
# matrix, listed in the order in which plot_roc_curves draws the curves.
roc_entries = [
    ('Support Vector Machine', fpr_svm_c, tpr_svm_c, auc_score_svm_c),
    ('K-Nearest Neighbors', fpr_knn_c, tpr_knn_c, auc_score_knn_c),
    ('Random Forest', fpr_rf_c, tpr_rf_c, auc_score_rf_c),
    ('Gradient Boosting Trees', fpr4_XGBoost_c, tpr4_XGBoost_c, auc_score4_XGBoost_c),
    ('Naive Bayes', fpr_NB_c, tpr_NB_c, auc_score4_NB_c),
]
models_fpr_tpr_auc = {
    model_name: (fpr, tpr, auc_score)
    for model_name, fpr, tpr, auc_score in roc_entries
}

plot_roc_curves(models_fpr_tpr_auc, 'Model performance based on soft core pangenome')

In [None]:
# Grouped bar chart of the cross-validated metrics for the soft-core runs.

# Models in plotting order, paired with their summary dictionaries
models = ['Support Vector Machine', 'K-Nearest Neighbors', 'Random Forest', 'Gradient Boosting Trees', 'Naive Bayes']
results_ordered = [results_svm_c, results_knn_c, results_rf_c, results_XGBoost_c, results_NB_c]

# Metrics to extract
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']

# Each summary value is a "mean ± std" string; split it back into two floats
means = []
stds = []

for model_results in results_ordered:
    model_means = []
    model_stds = []
    for metric in metrics:
        mean, std = map(float, model_results[metric].split(' ± '))
        model_means.append(mean)
        model_stds.append(std)
    means.append(model_means)
    stds.append(model_stds)

# Rows = models, columns = metrics
means = np.array(means)
stds = np.array(stds)

# One colour per model, in the same model order as the ROC figure
colors = ['blue', 'orange', 'green', 'red', 'purple']

# Bar graph parameters
x = np.arange(len(metrics))  # Metric indices
width = 0.15  # Bar width

# One bar group per metric, one bar per model, with std as the error bar
fig, ax = plt.subplots(figsize=(10, 6))
for i, (model, model_means, model_stds, color) in enumerate(zip(models, means, stds, colors)):
    ax.bar(
        x + i * width,
        model_means,
        width,
        label=f"{model}",
        color=color,
        yerr=model_stds,
        capsize=5
    )

# Labels and title; this figure shows summary metrics, not ROC curves
ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Classification metrics for models based on soft core pangenome')
ax.set_xticks(x + width * (len(models) - 1) / 2)
ax.set_xticklabels(metrics)

# Draw the legend for the bar labels, outside the axes so it hides no bars
ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Set y-axis limits
ax.set_ylim(0.0, 1.0)

# Show the plot
plt.tight_layout()
plt.show()

### Core + shell pangenome

In [None]:
# Load the matrix stored in 'Core_shell_genome.csv'. This rebinds X, Y,
# column_dict and genome_id, replacing the data loaded in the previous section.
X,Y, column_dict, genome_id = process_genome_matrix('Core_shell_genome.csv')

In [None]:
# Re-run the five classifiers on the newly loaded matrix.
# The *_c names used by the earlier sections are rebound again, and the report
# files written by RF, KNN, SKLearnBoosting and GaussianNB_ are overwritten.
results_svm_c, auc_score_svm_c, fpr_svm_c, tpr_svm_c = SVM(X, Y, fold_df, save_results=False)
results_rf_c, auc_score_rf_c, fpr_rf_c, tpr_rf_c = RF(X, Y, fold_df)
results_knn_c, auc_score_knn_c, fpr_knn_c, tpr_knn_c = KNN(X, Y, fold_df)
results_XGBoost_c, auc_score4_XGBoost_c, fpr4_XGBoost_c, tpr4_XGBoost_c  = SKLearnBoosting(X, Y, fold_df, save_results=True)
results_NB_c, auc_score4_NB_c, fpr_NB_c, tpr_NB_c = GaussianNB_(X, Y, fold_df, save_results=True)


In [None]:
# (display name, fpr, tpr, auc) for every model evaluated on the core + shell
# matrix, listed in the order in which plot_roc_curves draws the curves.
roc_entries = [
    ('Support Vector Machine', fpr_svm_c, tpr_svm_c, auc_score_svm_c),
    ('K-Nearest Neighbors', fpr_knn_c, tpr_knn_c, auc_score_knn_c),
    ('Random Forest', fpr_rf_c, tpr_rf_c, auc_score_rf_c),
    ('Gradient Boosting Trees', fpr4_XGBoost_c, tpr4_XGBoost_c, auc_score4_XGBoost_c),
    ('Naive Bayes', fpr_NB_c, tpr_NB_c, auc_score4_NB_c),
]
models_fpr_tpr_auc = {
    model_name: (fpr, tpr, auc_score)
    for model_name, fpr, tpr, auc_score in roc_entries
}

plot_roc_curves(models_fpr_tpr_auc, 'Model performance based on soft core + shell pangenome')

In [None]:
# Grouped bar chart of the cross-validated metrics for the core + shell runs.

# Models in plotting order, paired with their summary dictionaries
models = ['Support Vector Machine', 'K-Nearest Neighbors', 'Random Forest', 'Gradient Boosting Trees', 'Naive Bayes']
results_ordered = [results_svm_c, results_knn_c, results_rf_c, results_XGBoost_c, results_NB_c]

# Metrics to extract
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']

# Each summary value is a "mean ± std" string; split it back into two floats
means = []
stds = []

for model_results in results_ordered:
    model_means = []
    model_stds = []
    for metric in metrics:
        mean, std = map(float, model_results[metric].split(' ± '))
        model_means.append(mean)
        model_stds.append(std)
    means.append(model_means)
    stds.append(model_stds)

# Rows = models, columns = metrics
means = np.array(means)
stds = np.array(stds)

# One colour per model, in the same model order as the ROC figure
colors = ['blue', 'orange', 'green', 'red', 'purple']

# Bar graph parameters
x = np.arange(len(metrics))  # Metric indices
width = 0.15  # Bar width

# One bar group per metric, one bar per model, with std as the error bar
fig, ax = plt.subplots(figsize=(10, 6))
for i, (model, model_means, model_stds, color) in enumerate(zip(models, means, stds, colors)):
    ax.bar(
        x + i * width,
        model_means,
        width,
        label=f"{model}",
        color=color,
        yerr=model_stds,
        capsize=5
    )

# Labels and title; this figure shows summary metrics, not ROC curves
ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Classification metrics for models based on soft core + shell pangenome')
ax.set_xticks(x + width * (len(models) - 1) / 2)
ax.set_xticklabels(metrics)

# Draw the legend for the bar labels, outside the axes so it hides no bars
ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Set y-axis limits
ax.set_ylim(0.0, 1.0)

# Show the plot
plt.tight_layout()
plt.show()

### Whole pangenome

In [None]:
# Load the matrix stored in 'genome_matrix_full.csv'. This rebinds X, Y,
# column_dict and genome_id, replacing the data loaded in the previous section.
X,Y, column_dict, genome_id = process_genome_matrix('genome_matrix_full.csv')

In [None]:
# Run the five classifiers on the full matrix. Unlike the earlier sections the
# outputs are stored under names without the _c suffix, so the results of the
# previous section stay bound alongside these.
# The report files written by RF, KNN, SKLearnBoosting and GaussianNB_ are
# overwritten once more.
results_svm, auc_score_svm, fpr_svm, tpr_svm = SVM(X, Y, fold_df, save_results=False)
results_rf, auc_score_rf, fpr_rf, tpr_rf = RF(X, Y, fold_df)
results_knn, auc_score_knn, fpr_knn, tpr_knn = KNN(X, Y, fold_df)
results_XGBoost, auc_score4_XGBoost, fpr4_XGBoost, tpr4_XGBoost  = SKLearnBoosting(X, Y, fold_df, save_results=True)
results_NB, auc_score4_NB, fpr_NB, tpr_NB = GaussianNB_(X, Y, fold_df, save_results=True)


In [None]:
# (display name, fpr, tpr, auc) for every model evaluated on the full matrix,
# listed in the order in which plot_roc_curves draws the curves.
roc_entries = [
    ('Support Vector Machine', fpr_svm, tpr_svm, auc_score_svm),
    ('K-Nearest Neighbors', fpr_knn, tpr_knn, auc_score_knn),
    ('Random Forest', fpr_rf, tpr_rf, auc_score_rf),
    ('Gradient Boosting Trees', fpr4_XGBoost, tpr4_XGBoost, auc_score4_XGBoost),
    ('Naive Bayes', fpr_NB, tpr_NB, auc_score4_NB),
]
models_fpr_tpr_auc = {
    model_name: (fpr, tpr, auc_score)
    for model_name, fpr, tpr, auc_score in roc_entries
}

plot_roc_curves(models_fpr_tpr_auc, 'Model performance based on full pangenome')

In [None]:
# Grouped bar chart of the cross-validated metrics for the full-pangenome runs.
# The inputs are rebuilt here from this section's results; reusing the
# `means`/`stds` left over from the previous section would plot the wrong data.

# Models in plotting order, paired with this section's summary dictionaries
models = ['Support Vector Machine', 'K-Nearest Neighbors', 'Random Forest', 'Gradient Boosting Trees', 'Naive Bayes']
results_ordered = [results_svm, results_knn, results_rf, results_XGBoost, results_NB]

# Metrics to extract
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']

# Each summary value is a "mean ± std" string; split it back into two floats
means = []
stds = []

for model_results in results_ordered:
    model_means = []
    model_stds = []
    for metric in metrics:
        mean, std = map(float, model_results[metric].split(' ± '))
        model_means.append(mean)
        model_stds.append(std)
    means.append(model_means)
    stds.append(model_stds)

# Rows = models, columns = metrics
means = np.array(means)
stds = np.array(stds)

# One colour per model, in the same model order as the ROC figure
colors = ['blue', 'orange', 'green', 'red', 'purple']

# Bar graph parameters
x = np.arange(len(metrics))  # Metric indices
width = 0.15  # Bar width

# Plot bars for each model
fig, ax = plt.subplots(figsize=(10, 6))
for i, (model, model_means, model_stds, color) in enumerate(zip(models, means, stds, colors)):
    ax.bar(
        x + i * width,
        model_means,
        width,
        label=f"{model}",
        color=color,
        yerr=model_stds,
        capsize=5
    )

# Labels and title; this figure shows summary metrics, not ROC curves
ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Classification metrics for models based on full pangenome')
ax.set_xticks(x + width * (len(models) - 1) / 2)
ax.set_xticklabels(metrics)

# Move legend outside the main figure
ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Set y-axis limits
ax.set_ylim(0.0, 1.0)

# Only the last tight_layout call takes effect, so the overridden
# rect=[0, 0, 0.85, 1] call was dropped.
# NOTE(review): the right edge 1.2 lies outside the normalized figure area;
# it is kept because it was the effective call — confirm the legend renders
# as intended.
plt.tight_layout(rect=[0, 0, 1.2, 1])
plt.show()
