### This notebook draws the density plots of the distance matrices and the cluster maps of the distance matrices, annotated with the etiology labels of the samples. The figures are generated from all the samples of each cancer type.
- The files 'TCGA_all_distDFs_dict_for_{cancer_type}_Sample_exper01.pickle' hold, for each of the five cancer types, a nested dictionary containing the different versions of the distance matrix.
- The file 'TCGA_SigFrac_dict_for_all_cancer_Samples_exper01.pickle' holds the normalized exposures and the etiology-based multi-label information for each sample.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
### Load the distance matrices and the fraction data of the signature exposures

cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']

# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# pickle files that come from a trusted source.

## cancer type -> {distance type -> distance matrix DataFrame}
dict_distDFs_dict = {}

for cancer_type in cancer_types:
    ## load the distance matrix data for each cancer type
    with open(f'TCGA_all_distDFs_dict_for_{cancer_type}_Sample_exper01.pickle', 'rb') as handle:
        distDFs_dict = pickle.load(handle)
    # The file can be closed before the loaded dictionary is stored
    dict_distDFs_dict[cancer_type] = distDFs_dict

# Load 'dict_allFrac' from its pickle file
with open('TCGA_SigFrac_dict_for_all_cancer_Samples_exper01.pickle', 'rb') as handle:
    dict_allFrac = pickle.load(handle)

### 'dict_distDFs_dict' and 'dict_allFrac' are used for the next analysis.
### Only the last expression of a cell is displayed, so preview a single matrix.
dict_distDFs_dict['Breast-cancer']['Euclidean']

In [None]:
from scipy.stats import skew

### Draw one density plot of the pairwise distances per cancer type and distance type
for cancer_type in cancer_types:
    dict_DF = dict_distDFs_dict[cancer_type]

    for dist_type, df_sample in dict_DF.items():
        pairwise_distance_matrix = df_sample.values

        # k=-1 keeps the strictly lower triangle only. The diagonal (distance
        # of a sample to itself) is left out so that it cannot distort the
        # histogram or the skewness. Assuming the matrix is symmetric, every
        # pair of samples is then counted exactly once.
        mask = np.tril(np.ones_like(pairwise_distance_matrix, dtype=bool), k=-1)
        lower_tri_values = pairwise_distance_matrix[mask]

        if lower_tri_values.size == 0:
            # Fewer than two samples: there is no pair of samples to plot
            continue

        plt.figure(figsize=(9, 7))
        # stat='density' makes the y-axis the density it is labelled as
        # (the default of histplot is the raw count per bin)
        sns.histplot(lower_tri_values, kde=True, stat='density', color='blue')

        # Calculate skewness
        skewness = skew(lower_tri_values)
        skewness_str = f"Skewness: {skewness:.2f}"

        plt.xlabel('Distance', fontsize=14)
        plt.ylabel('Density', fontsize=14)
        plt.title(f'Density of {dist_type} distances, {cancer_type} samples, TCGA, {skewness_str}', fontsize=15)

        plt.savefig(f"Density of original {dist_type} distances for {cancer_type} samples_TCGA_exper01.pdf",
                    format="pdf", dpi=999, bbox_inches="tight")
        plt.show()
        # Release the figure so that open figures do not pile up over the loop
        plt.close()

In [None]:
### Draw the cluster map of distance matrix

import matplotlib.patches as mpatches

def plot_distance_clustermap(cancer_type, distance_matrix, dist_type, multi_label_info, vMax, flag=True):
    '''
    Draw, save (as PDF) and show the clustermap of a pairwise distance matrix.
    Each recognised label column is drawn as one colour strip along both the
    rows and the columns, and a legend of the strip colours is added.

    cancer_type: To specify which cancer data that is applied (used in the title and the file name)
    distance_matrix: The pairwise distance matrix of samples
    dist_type: Type of distance metric (used in the title and the file name)
    multi_label_info: DataFrame of sample labels used to specify the groups of samples.
                      Only the columns named after a known etiology are drawn; in such a
                      column the value '<name>' gets the colour of the etiology and
                      'non<name>' gets light grey
    vMax: Upper limit of the colour scale of the heatmap
    flag: Perform the clustering of rows and columns or not, default is True

    Raises ValueError if multi_label_info is not a pandas DataFrame.
    '''

    # Ensure multi_label_info is a DataFrame
    if not isinstance(multi_label_info, pd.DataFrame):
        raise ValueError("multi_label_info must be a pandas DataFrame")

    # One colour per etiology: single source for the colour strips and the legend
    etiology_colors = {
        "Apobec": "#1f77b4",      # Blue
        "MMR": "#ff7f0e",         # Orange
        "Tobacco": "#2ca02c",     # Green
        "UV": "#d62728",          # Red
        "POLE": "#9467bd",        # Purple
        "ClockLike": "#8c564b",   # Brown
        "BER": "#e377c2",         # Magenta
        "Platinum": "#17becf",    # Cyan
    }

    # Map labels to specific colors and combine; unknown columns are ignored
    drawn_labels = [label for label in multi_label_info.columns if label in etiology_colors]
    label_color_dfs = [
        multi_label_info[label].map({label: etiology_colors[label], f'non{label}': 'lightgrey'})
        for label in drawn_labels
    ]
    # pd.concat raises on an empty list; without any known label column the
    # map is drawn without colour strips instead
    combined_label_colors = pd.concat(label_color_dfs, axis=1) if label_color_dfs else None

    # Create a list of patches to add to the legend (one per drawn strip)
    patches = [mpatches.Patch(color=etiology_colors[label], label=label) for label in drawn_labels]

    # Draw the clustermap
    sns.set(font_scale=0.9)
    sns.set_style('ticks')
    g = sns.clustermap(distance_matrix, method='average',
                       col_colors=combined_label_colors,
                       row_colors=combined_label_colors,
                       cmap="coolwarm",
                       vmax=vMax,
                       col_cluster=flag, row_cluster=flag,
                       figsize=(12, 12))

    # Set x-label and y-label
    g.ax_heatmap.set_xlabel("Samples", fontsize=14)
    g.ax_heatmap.set_ylabel("Samples", fontsize=14)

    # plt.title acts on matplotlib's current axes of the clustermap figure
    plt.title('Clustermap of Pairwise {0} Distances in {1} sample, TCGA'.format(dist_type, cancer_type),
              fontsize=15, loc="left", y=1.05)

    # Add the custom legend. It is anchored just outside the right edge of the
    # figure so that it cannot cover the heatmap or the dendrograms;
    # bbox_inches="tight" below keeps it inside the saved page.
    if patches:
        g.fig.legend(handles=patches, title="Etiology", loc="upper left",
                     bbox_to_anchor=(1.0, 0.8), frameon=False)

    ## save the plot individually
    plt.savefig(f"TCGA_Clustermap of Distanmce Matrix of Signatures Fraction, categorical_label, regarding {dist_type}, in {cancer_type} sample_TCGA_exper01.pdf"
                ,format="pdf"
                ,dpi=999
                ,bbox_inches="tight")
    plt.show()
    plt.close()

### Prepare the data
labels = ['label_Apobec', 'label_MMR', 'label_Tobacco', 'label_UV', 'label_POLE', 'label_ClockLike', 'label_BER', 'label_Platinum']
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']

## Upper limits of the heatmap colour scale, paired by position with the
## distance types of each cancer type (in the order of the dictionary).
## NOTE(review): zip stops at the shorter input, so distance types beyond the
## length of this list would be skipped silently -- confirm that the list
## matches the distance dictionaries.
vMaxs = [1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 240, 1.2]

## Label column -> short etiology name recognised by plot_distance_clustermap
short_names = dict(zip(labels, ['Apobec', 'MMR', 'Tobacco', 'UV', 'POLE', 'ClockLike', 'BER', 'Platinum']))

for cancer_type in cancer_types:

    ## The distance of matrix
    dict_DF = dict_distDFs_dict[cancer_type]

    ## mutational signatures exposure data
    df_frac = dict_allFrac[cancer_type]

    ## The label table is the same for every distance type, so it is built
    ## once per cancer type; rename returns a new frame and leaves df_frac untouched
    multi_label_info = df_frac[labels].rename(columns=short_names)

    ## Draw the plots
    for (dist_type, df_sample), vMax in zip(dict_DF.items(), vMaxs):
        plot_distance_clustermap(cancer_type, df_sample, dist_type, multi_label_info, vMax)



In [None]:
### Draw the plots, indicating the density of Signatures Signals

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.colors

def plot_distance_clustermap(cancer_type, distance_matrix, dist_type, multi_label_info, vMax, flag=True):
    '''
    Draw, save (as PDF) and show the clustermap of a pairwise distance matrix.
    Each column of multi_label_info is drawn as one colour strip along both
    the rows and the columns; all strips share one viridis colour scale that
    is shown as a horizontal colour bar.

    cancer_type: To specify which cancer data that is applied (used in the title and the file name)
    distance_matrix: The pairwise distance matrix of samples
    dist_type: Type of distance metric (used in the title and the file name)
    multi_label_info: The numerical labels of samples used to specify the groups of samples
    vMax: Upper limit of the colour scale of the heatmap
    flag: Perform the clustering or not, default is True

    Raises ValueError if multi_label_info is not a pandas DataFrame.
    '''

    # Ensure multi_label_info is a DataFrame
    if not isinstance(multi_label_info, pd.DataFrame):
        raise ValueError("multi_label_info must be a pandas DataFrame")

    # Find global min and max values across all labels, so that the same
    # value gets the same colour in every strip
    global_min = multi_label_info.min().min()
    global_max = multi_label_info.max().max()

    # Create a unified color map
    cmap = sns.color_palette("viridis", as_cmap=True)
    norm = matplotlib.colors.Normalize(vmin=global_min, vmax=global_max)

    # Map label values to colors using this unified scale (one RGBA tuple per cell)
    label_colors = multi_label_info.apply(lambda column: column.map(lambda value: cmap(norm(value))))

    # Draw the clustermap
    sns.set(font_scale=0.9)
    sns.set_style('ticks')
    g = sns.clustermap(distance_matrix,
                       method='average',
                       col_colors=label_colors,
                       row_colors=label_colors,
                       cmap="coolwarm",
                       vmax=vMax,
                       col_cluster=flag,
                       row_cluster=flag,
                       figsize=(12, 12))

    # Set x-label and y-label
    g.ax_heatmap.set_xlabel("Samples", fontsize=14)
    g.ax_heatmap.set_ylabel("Samples", fontsize=14)

    # plt.title acts on matplotlib's current axes of the clustermap figure
    plt.title('Clustermap of Pairwise {0} Distances in {1} sample'.format(dist_type, cancer_type), fontsize=15, loc="left", y=1.05)

    # 'import matplotlib.colors' does not load the colorbar submodule; import
    # it explicitly instead of relying on another module having loaded it
    import matplotlib.colorbar

    # Add a unified color bar for the labels at the top left
    # Adjust [left, bottom, width, height] for positioning and size
    ax_color_bar = g.fig.add_axes([0.15, 0.7, 0.2, 0.02])  # Top left; adjust as needed
    colorbar = matplotlib.colorbar.ColorbarBase(ax_color_bar, cmap=cmap, norm=norm, orientation='horizontal')
    colorbar.set_label('Density of Signature Signal', fontsize=10)
    colorbar.ax.tick_params(labelsize=8)  # Reduce tick label size

    ## save the plot individually
    plt.savefig(f"TCGA_Clustermap of Distanmce Matrix of Signatures Fraction, numerical_label, regarding {dist_type}, in {cancer_type} sample v1_TCGA_new.pdf"
                ,format="pdf"
                ,dpi=199
                ,bbox_inches="tight")
    plt.show()
    plt.close()


### Prepare the data
labels = ['signal_Apobec', 'signal_MMR', 'signal_Tobacco', 'signal_UV', 'signal_POLE', 'signal_ClockLike', 'signal_BER', 'signal_Platinum']
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']

## Upper limits of the heatmap colour scale, paired by position with the
## distance types of each cancer type (in the order of the dictionary).
## NOTE(review): zip stops at the shorter input, so distance types beyond the
## length of this list would be skipped silently -- confirm that the list
## matches the distance dictionaries.
vMaxs = [1.2, 1.2, 1, 1, 1, 1.2, 1, 240, 1]

## Signal column -> short etiology name shown next to the colour strips
short_names = dict(zip(labels, ['Apobec', 'MMR', 'Tobacco', 'UV', 'POLE', 'ClockLike', 'BER', 'Platinum']))

for cancer_type in cancer_types:

    ## The distance of matrix
    dict_DF = dict_distDFs_dict[cancer_type]

    ## mutational signatures exposure data
    df_frac = dict_allFrac[cancer_type]

    ## The signal table is the same for every distance type, so it is built
    ## once per cancer type; rename returns a new frame and leaves df_frac untouched
    multi_label_info = df_frac[labels].rename(columns=short_names)

    ## Draw the plots
    for (dist_type, df_sample), vMax in zip(dict_DF.items(), vMaxs):
        plot_distance_clustermap(cancer_type, df_sample, dist_type, multi_label_info, vMax)
        
