In [None]:
from MLScript import *
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from kmodes.kmodes import KModes
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [None]:
def perform_pca(X, Y, n_components=2, save_path='pca_visualization.png'):
    """Fit PCA on X, save a scatter plot of the projection, and return it.

    Args:
        X (np.array): Input features
        Y (np.array): True labels, used only to colour the plotted points
        n_components: Number of PCA components (passed straight to
            ``sklearn.decomposition.PCA``)
        save_path (str): File the scatter plot is written to. Pass a distinct
            path per dataset to avoid overwriting an earlier plot.

    Returns:
        tuple: (PCA transformed data, fitted PCA object)
    """
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    # With a single retained component there is no second column to plot;
    # draw the points on a flat line instead of failing with an IndexError.
    if X_pca.shape[1] > 1:
        second_axis = X_pca[:, 1]
    else:
        second_axis = np.zeros(X_pca.shape[0])

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        points = ax.scatter(X_pca[:, 0], second_axis, c=Y, cmap='viridis')
        ax.set_xlabel('First Principal Component')
        ax.set_ylabel('Second Principal Component')
        ax.set_title('PCA of Genome Data')
        fig.colorbar(points, ax=ax, label='Class Label')
        fig.savefig(save_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
    return X_pca, pca

def _align_clusters_to_classes(y_true, cluster_labels):
    """Relabel every sample's cluster id with the class it best matches.

    Cluster ids from an unsupervised model are an arbitrary permutation, so
    they must be mapped onto the true class labels before label-based metrics
    mean anything.

    Args:
        y_true (np.array): 1-D array of true class labels
        cluster_labels (np.array): 1-D array of cluster ids, same length

    Returns:
        np.array: Predicted class label for every sample
    """
    from scipy.optimize import linear_sum_assignment

    classes, class_idx = np.unique(y_true, return_inverse=True)
    clusters, cluster_idx = np.unique(cluster_labels, return_inverse=True)

    # contingency[i, j] = number of samples in cluster i with true class j
    contingency = np.zeros((clusters.size, classes.size), dtype=np.int64)
    np.add.at(contingency, (cluster_idx, class_idx), 1)

    # Start from a majority vote so that surplus clusters (more clusters than
    # classes) still get a label, then override with the optimal one-to-one
    # matching that maximises the number of agreeing samples.
    mapping = contingency.argmax(axis=1)
    rows, cols = linear_sum_assignment(-contingency)
    mapping[rows] = cols

    return classes[mapping[cluster_idx]]


def _score_clustering(Y, cluster_labels):
    """Compute accuracy/precision/recall/F1 after aligning clusters to classes.

    Args:
        Y (np.array): True labels
        cluster_labels (np.array): Raw cluster ids from the clustering model

    Returns:
        dict: Metrics keyed by 'accuracy', 'precision', 'recall', 'f1'
    """
    y_true = np.asarray(Y).ravel()
    y_pred = _align_clusters_to_classes(y_true, np.asarray(cluster_labels).ravel())

    # The default 'binary' averaging raises for more than two classes.
    average = 'binary' if np.unique(y_true).size <= 2 else 'macro'
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average=average),
        'recall': recall_score(y_true, y_pred, average=average),
        'f1': f1_score(y_true, y_pred, average=average),
    }


def _save_cluster_plot(X_pca, cluster_labels, title, save_path):
    """Save a scatter plot of the PCA projection coloured by cluster id."""
    # Fall back to a flat line when the projection has a single column.
    if X_pca.shape[1] > 1:
        second_axis = X_pca[:, 1]
    else:
        second_axis = np.zeros(X_pca.shape[0])

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        points = ax.scatter(X_pca[:, 0], second_axis, c=cluster_labels, cmap='viridis')
        ax.set_xlabel('First Principal Component')
        ax.set_ylabel('Second Principal Component')
        ax.set_title(title)
        fig.colorbar(points, ax=ax, label='Cluster Label')
        fig.savefig(save_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)


def _print_clustering_report(heading, cluster_labels, metrics):
    """Print cluster sizes and the metrics dict under the given heading."""
    print(f"\n{heading}")
    print(f"Number of samples in each cluster: {np.bincount(cluster_labels)}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")


def perform_gmm(X, X_pca, Y, n_components=2, save_path='gmm_clustering.png'):
    """Perform Gaussian Mixture Model clustering and evaluate results.

    The metrics are computed after matching each cluster to the true class it
    overlaps most, because raw cluster ids are arbitrary. The returned labels
    are the raw, unaligned cluster ids.

    Args:
        X (np.array): Original input features
        X_pca (np.array): PCA transformed features for visualization
        Y (np.array): True labels
        n_components (int): Number of clusters
        save_path (str): File the cluster scatter plot is written to

    Returns:
        tuple: (cluster labels, performance metrics dict)
    """
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm_labels = gmm.fit_predict(X)

    _save_cluster_plot(X_pca, gmm_labels, 'GMM Clustering Results', save_path)

    metrics = _score_clustering(Y, gmm_labels)
    _print_clustering_report("GMM Clustering Results:", gmm_labels, metrics)

    return gmm_labels, metrics


def perform_kmodes(X, X_pca, Y, n_clusters=2, save_path='kmode_clustering.png'):
    """Perform K-modes clustering and evaluate results.

    The metrics are computed after matching each cluster to the true class it
    overlaps most, because raw cluster ids are arbitrary. The returned labels
    are the raw, unaligned cluster ids.

    Args:
        X (np.array): Original input features
        X_pca (np.array): PCA transformed features for visualization
        Y (np.array): True labels
        n_clusters (int): Number of clusters
        save_path (str): File the cluster scatter plot is written to

    Returns:
        tuple: (cluster labels, performance metrics dict)
    """
    kmode = KModes(n_clusters=n_clusters, init='Huang', random_state=42)
    kmode_labels = kmode.fit_predict(X)

    _save_cluster_plot(X_pca, kmode_labels, 'K-modes Clustering Results', save_path)

    metrics = _score_clustering(Y, kmode_labels)
    _print_clustering_report("K-modes Clustering Results:", kmode_labels, metrics)

    return kmode_labels, metrics


### Core Genome


In [None]:
# Load the core-genome matrix. process_genome_matrix is not defined in this
# notebook; presumably it is supplied by the MLScript star import.
X, Y, column_dict, genome_id = process_genome_matrix('core_genome.csv')

# PCA projection (default 2 components), reused below as plot coordinates
X_pca, pca = perform_pca(X, Y)

# Gaussian-mixture clustering on the original features
gmm_labels, gmm_metrics = perform_gmm(X=X, X_pca=X_pca, Y=Y)

# K-modes clustering on the original features
kmode_labels, kmode_metrics = perform_kmodes(X=X, X_pca=X_pca, Y=Y)


### Core + Soft-Core Pangenome


In [None]:
# Load the core + soft-core matrix. process_genome_matrix is not defined in
# this notebook; presumably it is supplied by the MLScript star import.
X, Y, column_dict, genome_id = process_genome_matrix('core_soft_genome.csv')

# PCA projection (default 2 components), reused below as plot coordinates
X_pca, pca = perform_pca(X, Y)

# Gaussian-mixture clustering on the original features
gmm_labels, gmm_metrics = perform_gmm(X=X, X_pca=X_pca, Y=Y)

# K-modes clustering on the original features
kmode_labels, kmode_metrics = perform_kmodes(X=X, X_pca=X_pca, Y=Y)


### Core + Shell Pangenome



In [None]:
# Load the core + shell matrix. process_genome_matrix is not defined in this
# notebook; presumably it is supplied by the MLScript star import.
X, Y, column_dict, genome_id = process_genome_matrix('Core_shell_genome.csv')

# PCA projection (default 2 components), reused below as plot coordinates
X_pca, pca = perform_pca(X, Y)

# Gaussian-mixture clustering on the original features
gmm_labels, gmm_metrics = perform_gmm(X=X, X_pca=X_pca, Y=Y)

# K-modes clustering on the original features
kmode_labels, kmode_metrics = perform_kmodes(X=X, X_pca=X_pca, Y=Y)


### Whole Pangenome


In [None]:
# Load the full genome matrix. process_genome_matrix is not defined in this
# notebook; presumably it is supplied by the MLScript star import.
X, Y, column_dict, genome_id = process_genome_matrix('genome_matrix_full.csv')

# PCA projection (default 2 components), reused below as plot coordinates
X_pca, pca = perform_pca(X, Y)

# Gaussian-mixture clustering on the original features
gmm_labels, gmm_metrics = perform_gmm(X=X, X_pca=X_pca, Y=Y)

# K-modes clustering on the original features
kmode_labels, kmode_metrics = perform_kmodes(X=X, X_pca=X_pca, Y=Y)
