In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import scipy.stats as stats
from multiprocessing import Pool

def compute_bic(n):
    """
    Return the BIC of a GaussianMixture with `n` components.

    The model is fitted to, and scored on, the module-level `read_depth_data`,
    so that variable must be assigned before this function is called.
    """
    model = GaussianMixture(n_components=n, random_state=0).fit(read_depth_data)
    return model.bic(read_depth_data)

# Load the binned depth table and keep only AverageDepth, reshaped to a single
# column (one row per bin) as the 2-D input the mixture model is fitted on.
read_depth_data = pd.read_csv(
    "/g/data/xl04/ka6418/chromosome_graph/rTilRug_HiC_pctg_Illumina_sorted.bam.binned.depth.csv"
)['AverageDepth'].values.reshape(-1, 1)

# Candidate numbers of mixture components: 1 through 10.
n_values = range(1, 11)

# One independent GMM fit per candidate, spread across worker processes.
with Pool() as pool:
    bic_values = pool.map(compute_bic, n_values)

# BIC as a function of the number of components.
plt.figure(figsize=(10, 6))
plt.plot(n_values, bic_values, 'o-b')
plt.title('BIC for Different Numbers of Clusters')
plt.xlabel('Number of Clusters (n)')
plt.ylabel('BIC Value')
plt.grid(True, which='both', ls='--', lw=0.5)
plt.tight_layout()
plt.show()



In [None]:
def plot_gaussian_distributions(data, n_clusters, max_depth=200):
    """
    Fit a Gaussian Mixture Model to read depth values and plot each cluster.

    Depths outside the open interval (0, max_depth) are dropped before fitting.
    Every remaining value is assigned to its predicted component; for each
    component the histogram of its values is drawn together with a normal PDF
    built from the mean and standard deviation of those assigned values (not
    from the fitted component parameters).

    Parameters:
    - data: Read depth values in any form accepted by
            pd.DataFrame(data, columns=['AverageDepth']), e.g. an (N, 1) array.
    - n_clusters: Integer specifying the number of Gaussian components or clusters.
    - max_depth: Exclusive upper bound on the depths that are kept; also used as
                 the right-hand limit of the x-axis. Defaults to 200.
    """
    # Keep only depths strictly between 0 and max_depth
    depths = pd.DataFrame(data, columns=['AverageDepth'])['AverageDepth']
    in_range = (depths > 0) & (depths < max_depth)
    read_depth_values = depths[in_range].values.reshape(-1, 1)

    # Fitting a GMM with specified number of components
    gmm = GaussianMixture(n_components=n_clusters, random_state=0)
    gmm.fit(read_depth_values)
    labels = gmm.predict(read_depth_values)

    # Plotting
    plt.figure(figsize=(14, 7))

    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

    # The plotting domain is the same for every cluster, so build it once
    x = np.linspace(read_depth_values.min(), read_depth_values.max(), 1000)

    for i in range(n_clusters):
        cluster_data = read_depth_values[labels == i]

        # A component may end up with no assigned points; its mean/std would be
        # NaN and its histogram empty, so there is nothing meaningful to draw.
        if cluster_data.size == 0:
            continue

        # Summary statistics of the points assigned to this cluster
        cluster_mean = cluster_data.mean()
        cluster_std = cluster_data.std()

        # Gaussian PDF described by those statistics
        pdf_cluster = stats.norm.pdf(x, cluster_mean, cluster_std)

        # Plot histogram (normalised per cluster)
        plt.hist(cluster_data, bins=50, density=True, alpha=0.5, color=colors[i], label=f'Cluster {i} Data')

        # Plot Gaussian PDF
        plt.plot(x, pdf_cluster, color=colors[i], linestyle='-', linewidth=2, label=f'Cluster {i} Gaussian')

    plt.xlabel('Read Depth')
    plt.xlim(0, max_depth)
    plt.ylabel('Density')
    plt.title('Gaussian Distributions of Read Depth for Contig')
    plt.legend()
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()
    

# Example use of the function: 2 clusters fitted to the module-level
# read_depth_data (the AverageDepth column loaded from the binned depth CSV)
plot_gaussian_distributions(read_depth_data, 2)


In [None]:
def _gmm_output_path(data_path):
    """Return `data_path` with '_GMM' inserted before the file name's last '.csv' (or before its extension if it has no '.csv')."""
    import os

    directory, filename = os.path.split(os.fspath(data_path))
    stem, csv_ext, tail = filename.rpartition(".csv")
    if csv_ext:
        labelled_name = f"{stem}_GMM.csv{tail}"
    else:
        # No '.csv' in the name: still produce a distinct path so the input
        # file is never overwritten.
        root, ext = os.path.splitext(filename)
        labelled_name = f"{root}_GMM{ext}"
    return os.path.join(directory, labelled_name)


def label_data_with_gmm(data_path, n_clusters, label_dict, max_depth=200):
    """
    Labels read depth data based on Gaussian Mixture Model (GMM) clustering and
    writes the labeled data to a new CSV file.

    Rows whose 'AverageDepth' is not below `max_depth` are dropped before the
    model is fitted. The labeled rows are written next to the input file, with
    '_GMM' inserted before '.csv' in the file name.

    Parameters:
    - data_path: Path to the CSV file with read depth data; it must contain an
                 'AverageDepth' column.
    - n_clusters: Integer specifying the number of Gaussian components or clusters.
    - label_dict: Dictionary specifying labels for each Gaussian cluster, keyed
                  by the 1-based cluster number (predicted component index + 1).
                  Cluster numbers follow the component order of the fitted
                  model; no sorting by depth is applied here.
                  For example: {1: 'incorrect', 2: 'correct', 3: 'incorrect'}
    - max_depth: Exclusive upper bound on the depths that are kept. Defaults to 200.

    Returns:
    - The filtered DataFrame with an added 'GMM_label' column.

    Raises:
    - KeyError: if a predicted cluster number has no entry in `label_dict`.
    """
    # Load data from provided path
    contig_depth_data = pd.read_csv(data_path)

    # Take an explicit copy so that adding a column below modifies an
    # independent frame rather than a slice of the loaded one.
    contig_depth_data_filtered = contig_depth_data[contig_depth_data['AverageDepth'] < max_depth].copy()
    read_depth_values = contig_depth_data_filtered['AverageDepth'].values.reshape(-1, 1)

    # Fitting a GMM with specified number of components
    gmm = GaussianMixture(n_components=n_clusters, random_state=0)
    gmm.fit(read_depth_values)
    labels = gmm.predict(read_depth_values)
    print(labels)

    # Map labels based on the provided dictionary
    mapped_labels = [label_dict[cluster + 1] for cluster in labels]  # +1 because dictionary is 1-indexed

    # Add the mapped labels to the filtered DataFrame
    contig_depth_data_filtered['GMM_label'] = mapped_labels

    # Determine the output path based on input data path
    output_path = _gmm_output_path(data_path)

    # Save the DataFrame with labeled data to the determined CSV file
    contig_depth_data_filtered.to_csv(output_path, index=False)

    return contig_depth_data_filtered

# Sample call to the function: label the binned depth CSV using 2 clusters.
# Besides returning the labeled DataFrame, the call writes it to the input path
# with ".csv" replaced by "_GMM.csv".
labeled_df = label_data_with_gmm("/g/data/xl04/ka6418/chromosome_graph/rTilRug_HiC_pctg_Illumina_sorted.bam.binned.fixed.depth.csv", 2, {1: 'incorrect', 2: 'correct'})

