## This is the notebook for clustering performance evaluation. It focuses on the multiple-round analysis based on data subsampled from the whole TCGA cohort data set.
## For the results based on the Hartwig data, the same procedure was applied.

In [None]:
# import modules 
import os
import time
import random
import pandas as pd
import numpy as np

# modules for calculation
import sklearn
import scipy
import scipy.stats as spst
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc
from sklearn import metrics

# modules for plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# All relative paths used further down (the input CSV files and the saved
# PDF/XLSX outputs) are resolved against this working directory.
# NOTE(review): '/path/to/EMD_analysis' looks like a placeholder - set it to the
# real analysis folder before running.
os.chdir('/path/to/EMD_analysis')
os.getcwd()


In [None]:
### Load the related files that were generated before

In [None]:
## Load the per-sample signature exposure table
df = pd.read_csv('TCGA_SBS_Exposures_in_Samples_new.csv')
CancerTypes = df['Cancer Types']
print(CancerTypes.unique())
print('*' * 80)

## Keep the cancer type and the signature exposures, indexed by clinic_ID
d0 = (
    df.drop(columns=['Unnamed: 0', 'Accuracy', 'Sample Names'])
      .rename(columns={'Cancer Types': 'Cancer_Types'})
      .set_index('clinic_ID')
)
print(d0.head())
print(d0['Cancer_Types'].value_counts())
print('*' * 80)

## Signature names: every column after the first one
## (assumes 'Cancer_Types' is the first remaining column - TODO confirm)
Sigs = d0.columns[1:]
print(len(Sigs))
print(Sigs)

## Table with the aetiology information of the signatures
SigEtioTable = pd.read_csv(
    'Etiology Information of Signatures_SBS5_Unknown_20240527.csv',
    index_col='Unnamed: 0',
)
SigEtioTable

In [None]:
###
# Extract the five cancer types of interest (breast cancer, lung adenocarcinoma,
# lung squamous cell carcinoma, colorectal adenocarcinoma and liver HCC),
# convert the signature exposures of every sample into fractions and attach
# the aetiology labels/signals.
###

## raw 'Cancer_Types' values that make up each cancer type (same order as cancer_types)
cancer_types_raw = [['Breast-cancer'], ['Lung-AdenoCa'], ['Lung-SCC'], ['ColoRect-AdenoCa'], ['Liver-HCC']]
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']

## Aetiology classes: name -> (signatures that decide the label,
##                             signatures that are summed into the signal).
## The dict order fixes the order of the label_*/signal_* columns.
_mmr_sigs = ['SBS6', 'SBS14', 'SBS15', 'SBS20', 'SBS21', 'SBS26', 'SBS44']
_uv_sigs = ['SBS7a', 'SBS7b', 'SBS7c', 'SBS7d', 'SBS38']
etiology_signatures = {
    'Apobec': (['SBS2', 'SBS13'], ['SBS2', 'SBS13']),          # Apobec signatures
    'MMR': (_mmr_sigs, _mmr_sigs),                              # Mismatch Repair signatures
    'Tobacco': (['SBS4', 'SBS29'], ['SBS4', 'SBS29']),          # Tobacco signatures
    'UV': (_uv_sigs, _uv_sigs),                                 # UV signatures
    'POLE': (['SBS10a', 'SBS10b'], ['SBS10a', 'SBS10b']),       # POLE signatures
    'ClockLike': (['SBS1'], ['SBS1']),                          # Clock-like signatures
    # Base Excision Repair.
    # NOTE(review): the label only looks at SBS30/SBS36 while the signal also
    # adds SBS18 - kept exactly as before; confirm which of the two is intended.
    'BER': (['SBS30', 'SBS36'], ['SBS18', 'SBS30', 'SBS36']),
    'Platinum': (['SBS31', 'SBS35'], ['SBS31', 'SBS35']),       # Platinum treatment
}

dict_allFrac = {}  ## dict to accept prepared data, keyed by cancer type

## Prepare the data
for cancer_type, cancer_type_raw in zip(cancer_types, cancer_types_raw):

    ## samples of this cancer type, signature columns only
    df_raw = d0[d0['Cancer_Types'].isin(cancer_type_raw)].drop(columns=['Cancer_Types'])

    ## fractions of the mutational signature exposures within each sample
    df_allFrac = df_raw.div(df_raw.sum(axis=1), axis=0)

    ## multi-labelling of the samples according to the presence of each class of signatures
    for etiology, (label_sigs, signal_sigs) in etiology_signatures.items():
        # 'non<etiology>' only when every deciding signature is exactly 0
        # (a NaN fraction is not equal to 0, so such a sample gets the positive label)
        all_absent = (df_allFrac[label_sigs] == 0).all(axis=1)
        df_allFrac[f'label_{etiology}'] = np.where(all_absent, f'non{etiology}', etiology)

        # left-to-right column addition, so NaN propagates into the signal
        signal = df_allFrac[signal_sigs[0]]
        for sig in signal_sigs[1:]:
            signal = signal + df_allFrac[sig]
        df_allFrac[f'signal_{etiology}'] = signal

    df_allFrac['Cancer_Types'] = cancer_type
    dict_allFrac[cancer_type] = df_allFrac

## Take a look at the data
dict_allFrac['Breast-cancer']

In [None]:
### Ratio of positive to negative aetiology labels, per cancer type
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']
label_types = ['label_Apobec', 'label_MMR', 'label_Tobacco', 'label_UV', 'label_POLE', 'label_ClockLike', 'label_BER', 'label_Platinum'] ## type of labels

all_ratios = {}  ## cancer type -> {label type -> ratio}

for cancer_type in cancer_types:
    df_frac = dict_allFrac[cancer_type]

    ratios = {}
    for column in label_types:
        counts = df_frac[column].value_counts()
        n_classes = len(counts)

        if n_classes == 1:
            # only one of the two labels occurs in this cancer type
            ratios[column] = 'One-Class'
        elif n_classes == 2:
            first, second = counts.index
            # the label starting with 'non' is the negative class
            if first.startswith('non'):
                positive, negative = second, first
            else:
                positive, negative = first, second
            ratios[column] = round(counts[positive] / counts[negative], 3)

    all_ratios[cancer_type] = ratios

dict_all_ratios = all_ratios.copy()

for cancer_type in cancer_types:
    print(cancer_type, dict_all_ratios[cancer_type])

### Load the clustering results files and put them in the proper dictionary structures

In [None]:
import pickle

# The file path of the clustering results
clustering_result_filepath = '/path/to/results'

### Load the clustering files
## clustering names as used in the file names ...
clustering_types = ['Hierarchical_complete',
                    'KMedoids',
                   ]
## ... and the (same order) names used as dictionary keys from here on
clustering_types_m = ['Hierarchical',
                      'KMedoids',
                     ]
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']
dist_types = ['Euclidean', 'Cosine', 'cEMD', 'eEMD', 'hEMD', 'gEuclidean', 'gCosine', 'rEuclidean']
rounds = ['round' + str(i) for i in range(10)]

## Resulting structure:
## all_clustering_scores[round][clustering type][cancer type]
##     = [V scores, homogeneity scores, completeness scores],
## each of the three being a dict updated from the per-distance pickle files.
all_clustering_scores = {}

# The loop variable is deliberately not called `round`: at notebook scope that
# would replace the builtin round() for every cell executed afterwards.
for round_name in rounds:
    ## scores of this round, using clustering types as keys
    all_clustering_scores_round = {}
    for clustering_type, clustering_type_m in zip(clustering_types, clustering_types_m):
        ## scores of this clustering type, using cancer types as keys
        dict_cancer_scores = {}

        for cancer_type in cancer_types:
            ## one dict per score type, filled from all distance types
            vScore_df_lst = {}
            homogeneityScore_df_lst = {}
            completenessScore_df_lst = {}

            for dist_type in dist_types:
                clustering_result_filename = os.path.join(
                    clustering_result_filepath,
                    f"TCGA_{cancer_type}_Clustering_Scores_{clustering_type}_{dist_type}_{round_name}_exper01.pickle")
                # SECURITY: pickle.load can execute arbitrary code - only load
                # result files that were produced by this project.
                with open(clustering_result_filename, 'rb') as handle:
                    scores = pickle.load(handle)

                ## entries 1, 2 and 3 are collected as V score, homogeneity and completeness
                vScore_df_lst.update(scores[cancer_type][1])
                homogeneityScore_df_lst.update(scores[cancer_type][2])
                completenessScore_df_lst.update(scores[cancer_type][3])

            dict_cancer_scores[cancer_type] = [vScore_df_lst,
                                               homogeneityScore_df_lst,
                                               completenessScore_df_lst,
                                              ]

        all_clustering_scores_round[clustering_type_m] = dict_cancer_scores
    all_clustering_scores[round_name] = all_clustering_scores_round

all_clustering_scores

### Draw the V-score curves

In [None]:
###+++++++++++++++++++++++++++++++
# Settings used by draw_score_curves (defined below) to draw the score curves
###+++++++++++++++++++++++++++++++

## all aetiology labels
full_label_types = ['label_Apobec', 'label_MMR', 'label_Tobacco', 'label_UV', 'label_POLE', 'label_ClockLike', 'label_BER', 'label_Platinum'] 
## all score types; the position of a name in this list is used as the index
## into the per-cancer list of score dictionaries
full_score_types = ['V score', 'Homogeneity', 'Completeness']
## all distance types
full_dist_types = ['Euclidean', 'Cosine', 'cEMD', 'eEMD', 'hEMD', 'gEuclidean', 'gCosine', 'rEuclidean']
## names of the subsampling rounds: 'round0' ... 'round9'
rounds = ['round' + str(i) for i in range(10)]
## color palette for distance metrics
## (has a 'rCosine' entry that is not part of full_dist_types)
color_palette = {
    "Euclidean": "#7f7f7f",       # Gray (medium)
    "Cosine": "#1f77b4",          # Blue (standard)
    "cEMD": "#ff9999",            # Light Red
    "eEMD": "#ff4d4d",            # Medium Red
    "hEMD": "#b32400",            # Dark Red
    "gEuclidean": "#bfbfbf",      # Light Gray
    "gCosine": "#1f99e4",         # Light Blue
    "rEuclidean": "#4d4d4d",      # Dark Gray
    "rCosine": "#1f4db4",         # Dark Blue
}

## varied line styles, listed in the same order as the keys of line_palette below;
## note that there are nine entries but only eight names in full_dist_types
line_styles = ['--', '--', '-', '-', '-', '-.', '-.', ':', ':']  # extend together with line_palette when adding metrics
## line style per distance metric (same values as line_styles, keyed by name)
line_palette = {
    "Euclidean": '--',
    "Cosine": '--',
    "cEMD": '-',
    "eEMD": '-',
    "hEMD": '-',
    "gEuclidean": '-.',
    "gCosine": '-.',
    "rEuclidean": ':',
    "rCosine": ':',
}

import matplotlib.lines as mlines

def draw_score_curves(clustering_type, cancer_type, score_type, label_type):
    """
    Draw, for one aetiology label, the curves of a clustering score against the
    number of clusters - one curve per distance metric, aggregated by seaborn
    over all rounds - then save the figure as PDF, show it and close it.

    Module-level names that are read:
    all_clustering_scores: dict holding all the scores,
                           [round][clustering type][cancer type] -> list of score dicts
    rounds: the round names to include
    full_score_types: the score types; position = index into the list of score dicts
    full_dist_types: the distance metrics to draw (also the hue order)
    color_palette / line_palette: colour and line style per distance metric

    Paras:
    clustering_type: type of clustering, 'Hierarchical' or 'KMedoids'
    cancer_type: type of cancer, in our data there are five types
    score_type: one of full_score_types
    label_type: the aetiology label, e.g. 'label_Apobec'

    Return:
    None. The PDF is written to the current working directory.

    Raises:
    ValueError: if score_type is not in full_score_types.
    """
    # position of the requested score in the per-cancer list of score dicts
    score_idx = full_score_types.index(score_type)

    ### Collect the long-format data of every round
    melted_rounds = []
    # (not named `round`, which would hide the builtin)
    for round_name in rounds:
        ## dict of this score type, using distance types as keys
        dict_score = all_clustering_scores[round_name][clustering_type][cancer_type][score_idx]

        ## one column per distance metric for the requested label;
        ## the first assigned column defines the index
        df_lineplot = pd.DataFrame()
        for dist_type in full_dist_types:
            df_lineplot[dist_type] = dict_score[dist_type][label_type]

        df_lineplot['Round'] = round_name
        df_lineplot['Index'] = df_lineplot.index
        melted_rounds.append(
            pd.melt(df_lineplot, id_vars=['Index', 'Round'], value_vars=list(full_dist_types),
                    var_name='Distance metrics', value_name='Score'))

    # one concat instead of growing the frame inside the loop
    df_plot = pd.concat(melted_rounds, axis=0)

    ### Plotting
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.lineplot(data=df_plot,
                 x="Index", y="Score",
                 hue="Distance metrics",
                 hue_order=full_dist_types,  # fixes the order in which the curves are drawn
                 palette=color_palette,
                 ax=ax,
                )

    # The curves are the first lines on the axes, in hue order, so they can be
    # paired with the metric names; zip never runs past the existing lines.
    for line, dist_type in zip(ax.get_lines(), full_dist_types):
        line.set_linestyle(line_palette[dist_type])

    # Custom legend handles so that the legend shows colour and line style
    legend_handles = [mlines.Line2D([], [], color=color_palette[dist_type],
                                    linestyle=line_palette[dist_type], label=dist_type)
                      for dist_type in full_dist_types]
    ax.legend(handles=legend_handles, title='Distance metrics', loc='best', fontsize=10)

    ax.tick_params(axis='x', labelsize=12)  # Enlarge x-tick labels
    ax.tick_params(axis='y', labelsize=12)  # Enlarge y-tick labels
    ax.set_xlabel('Number of Clusters', fontsize=15)
    ax.set_ylabel(score_type, fontsize=15)

    # 'label_Apobec' -> 'Apobec'
    new_label = label_type.split('_')[1]
    ax.set_title(f'{score_type} of {clustering_type} Clustering, regarding {new_label}, in TCGA {cancer_type}',
                 fontsize=18, loc='left')

    # Every score type is labelled, saved, shown and closed, so no figure is
    # left open. The file name pattern is kept unchanged so that V-score
    # outputs keep their existing names.
    fig.savefig(f'{score_type} of {clustering_type} Clustering, of {label_type}, in {cancer_type}, TCGA, original, muliple_exper01.pdf',
                format="pdf", dpi=999, bbox_inches="tight")
    plt.show()
    plt.close(fig)


In [None]:
###+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Draw the curves of scores and calculate the scores under the curve
###+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

from sklearn.metrics import auc
import seaborn as sns
import warnings

clustering_types = ['Hierarchical', 'KMedoids']
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']
## type of scores
score_types = ['V score']
# Columns that identify a row / columns that hold the per-metric values
id_vars = ['Index', 'Round']
value_vars = ['Euclidean', 'Cosine', 'cEMD', 'eEMD', 'hEMD', 'gEuclidean', 'gCosine', 'rEuclidean']
## aetiology labels to draw
label_types = ['label_Apobec', 'label_MMR', 'label_Tobacco', 'label_UV', 'label_POLE',
               'label_ClockLike', 'label_BER', 'label_Platinum']

## One figure per (clustering type, cancer type, score type, aetiology label),
## enumerated in the same nesting order as four nested loops
plot_jobs = [
    (clustering, cancer, score, label)
    for clustering in clustering_types
    for cancer in cancer_types
    for score in score_types
    for label in label_types
]
for clustering_type, cancer_type, score_type, label_type in plot_jobs:
    draw_score_curves(clustering_type, cancer_type, score_type, label_type)
        

#### Draw the AUC values with barplot

In [None]:
###++++++++++++
# Settings for the function (defined below) that calculates and collects the AUC values
###++++++++++++

# The two bare names below compute nothing; they only raise a NameError when
# the cells that create these objects have not been run yet.
all_clustering_scores ## the dict which holds all scores
dict_all_ratios ## all the label ratios

## all aetiology labels (rows of the AUC tables)
full_label_types = ['label_Apobec', 'label_MMR', 'label_Tobacco', 'label_UV', 'label_POLE', 'label_ClockLike', 'label_BER', 'label_Platinum'] 
## all score types (one AUC table per entry, in this order)
full_score_types = ['V score', 'Homogeneity', 'Completeness']
## all distance types (columns of the AUC tables)
full_dist_types = ['Euclidean', 'Cosine', 'cEMD', 'eEMD', 'hEMD', 'gEuclidean', 'gCosine', 'rEuclidean']

def calculate_auc(clustering_type, cancer_type, scores_round=None):
    """
    Calculate the area under every score curve of one cancer type under one
    clustering type, for a single round.

    Module-level names that are read:
    full_score_types: the score types; one AUC table is returned per entry
    full_label_types: the aetiology labels (rows of each table)
    full_dist_types: the distance metrics (columns of each table)
    all_clustering_scores_round: used only when scores_round is not given
    auc: the area-under-curve function imported in this notebook

    Paras:
    clustering_type: type of clustering, 'Hierarchical' or 'KMedoids'
    cancer_type: type of cancer, in our data there are five types
    scores_round: optional; the scores of one round in the form
                  [clustering type][cancer type] -> list of score dicts.
                  Defaults to the module-level all_clustering_scores_round,
                  so existing two-argument calls keep working.

    Return:
    A list [vScore_auc, homogeneity_auc, completeness_auc] of float DataFrames
    with full_label_types as index and full_dist_types as columns.
    """
    if scores_round is None:
        # fall back to the round that the calling cell selected
        scores_round = all_clustering_scores_round

    ## list of score dicts of this cancer type, in full_score_types order
    dict_scores = scores_round[clustering_type][cancer_type]

    ## one table per score type; float dtype keeps the AUC values numeric
    auc_df_lst = [pd.DataFrame(index=full_label_types, columns=full_dist_types, dtype=float)
                  for _ in full_score_types]

    for dict_score, auc_df in zip(dict_scores, auc_df_lst):
        for label_type in full_label_types:
            ## scores of this label, one column per distance metric;
            ## the first assigned column defines the index
            df_lineplot = pd.DataFrame()
            for dist_type in full_dist_types:
                df_lineplot[dist_type] = dict_score[dist_type][label_type]

            ## area under each curve, with the frame index as x
            auc_df.loc[label_type] = [auc(df_lineplot.index, df_lineplot[column])
                                      for column in df_lineplot.columns]

    return auc_df_lst


In [None]:
###++++++++++++++++++++++++
# Calculate the AUC values, 
###++++++++++++++++++++++++

from sklearn.metrics import auc

clustering_types = ['Hierarchical',
                    'KMedoids',
                   ]
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']
rounds = ['round' + str(i) for i in range(10)]

### Calculate the auc values:
### all_auc_values[round][clustering type][cancer type] = list of AUC tables
all_auc_values = {}
# The loop variable is not called `round`, so the builtin round() stays
# available to every cell of the notebook.
for round_name in rounds:
    # calculate_auc reads this module-level name to find the scores of the round
    all_clustering_scores_round = all_clustering_scores[round_name]
    all_auc_values_round = {}
    for clustering_type in clustering_types:
        ## AUC tables of each cancer type under this clustering type
        auc_values_cancer = {}
        for cancer_type in cancer_types:
            auc_values_cancer[cancer_type] = calculate_auc(clustering_type, cancer_type)
        all_auc_values_round[clustering_type] = auc_values_cancer
    all_auc_values[round_name] = all_auc_values_round
all_auc_values

### Calculate the auc values and subtract the value of rEuclidean

import pandas as pd

def diff_to_col(df, col_name):
    """
    Subtract one reference column from every column of a DataFrame.

    Args:
        df: The pandas DataFrame.
        col_name: The name of the reference column that is subtracted.

    Returns:
        A new DataFrame with the same column names as ``df`` (formatted as
        strings), where each column holds ``df[column] - df[col_name]``.
        The reference column itself therefore becomes all zeros.
    """
    differences = {f"{name}": df[name] - df[col_name] for name in df.columns}
    return pd.DataFrame(differences)

### all_auc_values_m[round][clustering type][cancer type]
###     = list of AUC tables with the 'rEuclidean' column subtracted from every column
all_auc_values_m = {}
# The loop variable is not called `round`, so the builtin round() stays
# available to every cell of the notebook.
for round_name in rounds:
    # calculate_auc reads this module-level name to find the scores of the round
    all_clustering_scores_round = all_clustering_scores[round_name]
    all_auc_values_round = {}
    for clustering_type in clustering_types:
        auc_values_cancer = {}
        for cancer_type in cancer_types:
            auc_df_lst = calculate_auc(clustering_type, cancer_type)
            # diff_to_col builds a new frame, so no defensive copy is needed
            auc_values_cancer[cancer_type] = [diff_to_col(auc_df, 'rEuclidean') for auc_df in auc_df_lst]
        ## Collect the auc values for each clustering type
        all_auc_values_round[clustering_type] = auc_values_cancer
    all_auc_values_m[round_name] = all_auc_values_round
all_auc_values_m

### Use the modified auc values, from which the 'rEuclidean' values have all been subtracted

In [None]:
### Define the function that draws the AUC bar plots
color_palette = {
    "Euclidean": "#7f7f7f",       # Gray (medium)
    "Cosine": "#1f77b4",          # Blue (standard)
    "cEMD": "#ff9999",            # Light Red
    "eEMD": "#ff4d4d",            # Medium Red
    "hEMD": "#b32400",            # Dark Red
    "gEuclidean": "#bfbfbf",      # Light Gray
    "gCosine": "#1f99e4",         # Light Blue
    "rEuclidean": "#4d4d4d",      # Dark Gray
    "rCosine": "#1f4db4",         # Dark Blue
}

## Score types to draw. Every entry has to be one of full_score_types
## ('V score', 'Homogeneity', 'Completeness'): those are the only scores for
## which calculate_auc produces an AUC table.
score_types = ['V score']
## x tick labels, in the same order as the aetiology labels of the AUC tables
new_xtick_labels = ['Apobec', 'MMR', 'Tobacco', 'UV', 'POLE', 'Clock-Like', 'BER', 'Platinum']

def draw_auc(cancer_type, clustering_type, score_type):
    """
    Draw a grouped bar plot of the rEuclidean-subtracted AUC values of one
    score type, for one cancer type under one clustering type. The values of
    all rounds are passed to seaborn together, which aggregates them per bar.
    The figure is saved as PDF in the current working directory, shown and closed.

    Module-level names that are read:
    all_auc_values_m: [round][clustering type][cancer type] -> list of AUC tables
    rounds: the round names to include
    full_score_types: gives the position of each score type in the list of AUC tables
    color_palette: colour per distance metric
    new_xtick_labels: the labels shown on the x axis

    Paras:
    cancer_type: type of cancer
    clustering_type: type of clustering, 'Hierarchical' or 'KMedoids'
    score_type: one of full_score_types

    Return:
    The long-format DataFrame that was plotted, with the columns
    'index' (aetiology label), 'Distance Metric', 'AUC' and 'round'.

    Raises:
    ValueError: if score_type is not in full_score_types.
    """
    # The list of AUC tables is built in full_score_types order, so the table
    # has to be looked up there and not in the list of score types to draw.
    idx = full_score_types.index(score_type)

    ## column order of the bars; rEuclidean is the subtracted reference and is not drawn
    desired_order = ['cEMD', 'eEMD', 'hEMD', 'Cosine', 'gCosine', 'Euclidean', 'gEuclidean', 'rEuclidean']

    melted_rounds = []
    # (not named `round`, which would hide the builtin)
    for round_name in rounds:
        ## AUC table with the aetiology labels as index and the distance types as columns
        df_auc = all_auc_values_m[round_name][clustering_type][cancer_type][idx]
        df_ordered = df_auc.reindex(desired_order, axis=1).drop(['rEuclidean'], axis=1)

        ## Reshape the DataFrame for Seaborn
        df_melted = pd.melt(df_ordered.reset_index(), id_vars='index',
                            var_name='Distance Metric', value_name='AUC')
        df_melted['round'] = round_name
        melted_rounds.append(df_melted)

    # one concat instead of growing the frame inside the loop; fresh 0..n-1 index
    df_plot = pd.concat(melted_rounds, axis=0, ignore_index=True)
    # the bar heights have to be numeric even if the AUC tables were stored as objects
    df_plot['AUC'] = df_plot['AUC'].astype(float)

    # Create side-by-side bar plot using Seaborn
    plt.figure(figsize=(12, 4))
    sns.set_style('ticks')
    ax = sns.barplot(x='index', y='AUC', hue='Distance Metric', data=df_plot, palette=color_palette)

    # Customize the plot
    plt.xlabel('Aetiology labels', fontsize=13)
    plt.ylabel(f'Area under {score_type} curve', fontsize=13)
    # the tick labels are replaced by position, relying on the row order of the AUC tables
    plt.xticks(ticks=range(len(new_xtick_labels)), labels=new_xtick_labels, rotation=0, fontsize=12)
    # Legend outside of the axes, to the right
    ax.legend(title='Distance metrics',title_fontsize='11', loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=10)
    plt.title(f'AUC values under {score_type} curve in {clustering_type} clustering, for {cancer_type}, substract rEuclidean, TCGA', fontsize=15, loc='left')

    plt.savefig(f'AUC values under {score_type} curve in {clustering_type} clustering, for {cancer_type}_TCGA_original_substract rEuclidean_multiple_exper01.pdf',
                format="pdf",
                dpi=999,
                bbox_inches="tight")

    plt.show()
    plt.close()

    return df_plot

clustering_types = ['Hierarchical',
                    'KMedoids',
                   ]
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']

for cancer_type in cancer_types:
    for clustering_type in clustering_types:
        for score_type in score_types:
            df_plot = draw_auc(cancer_type, clustering_type, score_type)
        

### Find the best metric for each cancer type, regarding AUC values

In [None]:
all_auc_values
clustering_types = ['Hierarchical',
                    'KMedoids',
                   ]
cancer_types = ['Breast-cancer', 'Lung-AdenoCa', 'Lung-SCC', 'ColoRect-AdenoCa', 'Liver-HCC']
rounds = ['round' + str(i) for i in range(10)]

### Sum up the AUC values of the different rounds, regarding only the V score,
### and record for every aetiology label the distance metric with the largest sum

## position of the V-score table in the list of AUC tables (full_score_types order)
v_score_idx = full_score_types.index('V score')
## display names of the rows, in full_label_types order, plus the overall sum
row_names = ['Apobec', 'MMR', 'Tobacco', 'UV', 'POLE', 'Clock-Like', 'BER', 'Platinum', 'Sum-up']

largest_distMat_dict = {}
for clustering_type in clustering_types:
    largest_distMat = pd.DataFrame(index=row_names, columns=cancer_types)
    for cancer_type in cancer_types:
        ## V-score AUC table of every round
        ## (comprehension variable: does not overwrite any notebook-level name)
        round_tables = [all_auc_values[round_name][clustering_type][cancer_type][v_score_idx]
                        for round_name in rounds]

        # groupby sorts its keys alphabetically by default; sort=False plus the
        # explicit reindex keep the rows in full_label_types order, which is the
        # order that row_names assumes for the positional assignment below.
        sum_df = (
            pd.concat(round_tables)
              .astype(float)
              .groupby(level=0, sort=False)
              .sum()
              .reindex(full_label_types)
        )
        ## total over all aetiology labels, per distance metric
        sum_df.loc['Sum-up'] = sum_df.sum()

        # name of the distance metric with the largest value in each row
        largest_columns = sum_df.idxmax(axis=1)
        largest_distMat[cancer_type] = largest_columns.values
    largest_distMat_dict[clustering_type] = largest_distMat

largest_distMat_dict

In [None]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
from openpyxl.styles.colors import Color

# Colour per distance metric, as '#rrggbb' strings; used below to look up the
# fill colour of the Excel cells by the metric name stored in the cell.
color_palette = {
    "Euclidean": "#7f7f7f",       # Gray (medium)
    "Cosine": "#1f77b4",          # Blue (standard)
    "cEMD": "#ff9999",            # Light Red
    "eEMD": "#ff4d4d",            # Medium Red
    "hEMD": "#b32400",            # Dark Red
    "gEuclidean": "#bfbfbf",      # Light Gray
    "gCosine": "#1f99e4",         # Light Blue
    "rEuclidean": "#4d4d4d",      # Dark Gray
    "rCosine": "#1f4db4",         # Dark Blue
}

# Convert a '#rrggbb' string into an openpyxl Color
def hex_to_color(hex_color):
    """
    Build an openpyxl ``Color`` from a hex colour string.

    The leading '#' characters are stripped, the three two-digit channels are
    parsed as base-16 integers (a ValueError is raised when a channel is not
    valid hex) and written back as a lower-case six-digit string.
    """
    digits = hex_color.lstrip('#')
    channels = [int(digits[start:start + 2], 16) for start in (0, 2, 4)]
    return Color(rgb="".join(f"{channel:02x}" for channel in channels))

# Save each table of best distance metrics to Excel and colour its cells
for clustering_type in clustering_types:
    # transposed copy of the table of this clustering type
    df = largest_distMat_dict[clustering_type].copy().T

    excel_path = f'{clustering_type}_Clustering_distance_metrics_for_largest_AUC_V score_TCGA_exper01.xlsx'
    df.to_excel(excel_path, index=True, sheet_name='Sheet1')

    # Re-open the written file to style it
    wb = load_workbook(excel_path)
    ws = wb['Sheet1']

    # Every cell from the second row and second column onwards
    data_rows = ws.iter_rows(min_row=2, min_col=2, max_row=ws.max_row, max_col=ws.max_column)
    for cell in (c for row in data_rows for c in row):
        if cell.value not in color_palette:
            continue
        # solid background fill in the colour of the distance metric named in the cell
        color = hex_to_color(color_palette[cell.value])
        cell.fill = PatternFill(start_color=color, end_color=color, fill_type="solid")

    # Save changes
    wb.save(excel_path)
