## This notebook generates the distance matrices between SBS signatures that are used as cost matrices when computing the Earth Mover's Distance (EMD) between samples based on their signature exposures (TCGA data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load the etiology information of the SBS signatures
- The file 'TCGA_SBS_Exposures_in_Samples_new.csv' contains the SBS signature exposure data downloaded from ICGC/PCAWG and then modified by the notebook '_Prepare_Signatures_in_Samples_Sigs_Etiology_Info.ipynb'.
- The file 'Etiology Information of Signatures_SBS5_Unknown_20240527.csv' was generated by the Jupyter notebook '_Prepare_Signatures_in_Samples_Sigs_Etiology_Info.ipynb'.

In [None]:
## Read the per-sample signature exposure table and list the cancer types it contains
df = pd.read_csv('TCGA_SBS_Exposures_in_Samples_new.csv')
CancerTypes = df['Cancer Types']
print(CancerTypes.unique())
print('*' * 80)

## Keep the cancer type plus the signature exposures, indexed by clinic ID
d0 = (
    df.drop(columns=['Unnamed: 0', 'Accuracy', 'Sample Names'])
    .rename(columns={'Cancer Types': 'Cancer_Types'})
    .set_index('clinic_ID')
)
print(d0.head())
print(d0['Cancer_Types'].value_counts())
print('*' * 80)

## Collect the signature names: every column after the first one
## (assumes 'Cancer_Types' is the first remaining column -- TODO confirm)
Sigs = d0.columns[1:]
print(len(Sigs))
print(Sigs)

## Etiology label of each signature; the CSV's unnamed first column becomes the index
SigEtioTable = pd.read_csv(
    'Etiology Information of Signatures_SBS5_Unknown_20240527.csv',
    index_col='Unnamed: 0',
)
SigEtioTable

### Prepare the three different distance matrices between signatures
The file 'sigProfiler_SBS_signatures_2019_05_22.csv' was downloaded from ICGC/PCAWG.

In [None]:
### Prepare the cosine distance matrix between signatures

from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine

## Load the signature profiles from PCAWG and keep only the signatures
## that appear in the exposure table (the names stored in `Sigs`)
Sig_raw = pd.read_csv('sigProfiler_SBS_signatures_2019_05_22.csv')
Sig = Sig_raw[Sigs]

#+++ cosine distance
# Transpose once so that each row is one signature profile; the previous
# version re-transposed the whole table twice per (i, j) pair.
profiles = Sig.to_numpy().T

# Build the full matrix in one go instead of enlarging an empty DataFrame
# one cell at a time with .loc, which reallocates on every new row/column.
Sig_cosine = pd.DataFrame(
    [[cosine(profile_a, profile_b) for profile_b in profiles] for profile_a in profiles],
    index=Sig.columns,
    columns=Sig.columns,
)
Sig_cosine


In [None]:
#+++ Define the distance between etiologies
#+++ 0.01 for the same etiology
#+++ 1 for different etiologies
#+++ ...and iterate through a range of 0.1 to 0.9 for the remaining (unknown/artefact) cases
#+++ The distance for unknown/artefact signatures was set to 0.5 to minimize their influence on the clustering

# Signature -> aetiology lookup, written grouped by aetiology and flattened
# into one entry per signature (group order and member order are preserved).
aetiology_info = {
    signature: aetiology
    for aetiology, signatures in {
        'Clock-like': ['SBS1'],
        'Apobec': ['SBS2', 'SBS13'],
        'HR': ['SBS3'],
        'Tobacco': ['SBS4', 'SBS29'],
        'MMR': ['SBS6', 'SBS14', 'SBS15', 'SBS20', 'SBS21', 'SBS26', 'SBS44'],
        'UV_light': ['SBS7a', 'SBS7b', 'SBS7c', 'SBS7d', 'SBS38'],
        'POLE': ['SBS10a', 'SBS10b'],
        'BER': ['SBS30', 'SBS36'],
        'Platinum_treat': ['SBS31', 'SBS35'],
        'POLEerase': ['SBS9'],
        'Temozolomide_treat': ['SBS11'],
        'Reactive_oxygen': ['SBS18'],
        'Aristolochic_acid': ['SBS22'],
        'Aflatoxin_expo': ['SBS24'],
        'Chemotherapy': ['SBS25'],
        'Azathioprine_treat': ['SBS32'],
        'Haloalkane_expo': ['SBS42'],
        'unknown': [
            'SBS5', 'SBS8', 'SBS12', 'SBS16', 'SBS17a', 'SBS17b', 'SBS19', 'SBS23',
            'SBS28', 'SBS33', 'SBS34', 'SBS37', 'SBS39', 'SBS40', 'SBS41',
        ],
        'Artefact': [
            'SBS27', 'SBS43', 'SBS45', 'SBS46', 'SBS47', 'SBS48', 'SBS49', 'SBS50',
            'SBS51', 'SBS52', 'SBS53', 'SBS54', 'SBS55', 'SBS56', 'SBS57', 'SBS58',
            'SBS59', 'SBS60',
        ],
    }.items()
    for signature in signatures
}

# Uniform baseline distance: 1 between two different signatures, 0 on the diagonal.
# The size is taken from the signature profile table instead of a hard-coded 65,
# so the DataFrame construction below cannot fail on a shape mismatch when the
# set of signatures changes.
n = len(Sig.columns)
one_data = np.ones(shape=(n, n))
np.fill_diagonal(one_data, 0)

Sig_uniform = pd.DataFrame(one_data, columns=Sig.columns, index=Sig.columns)

def build_Sig_func(tuning_dist, etio_table=None, template=None):
    """
    Build the aetiology-based distance matrix between signatures.

    For every pair of signatures the distance is:
      * ``tuning_dist`` if at least one of the two is labelled unknown or artefact,
      * 0.01 if both have the same (known) aetiology,
      * 1.0  if both have known but different aetiologies,
    and the diagonal is set to 0.

    Parameters
    ----------
    tuning_dist : float
        Distance assigned to any pair that involves an unknown/artefact signature.
    etio_table : pandas.DataFrame, optional
        Table indexed by signature name; the aetiology label is read from its
        first column. Defaults to the notebook-level ``SigEtioTable``.
    template : pandas.DataFrame, optional
        Square frame whose index/columns give the signature names and their
        order. Defaults to the notebook-level ``Sig_uniform``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``template``.

    Raises
    ------
    KeyError
        If a signature of ``template`` is missing from ``etio_table``.
    """
    if etio_table is None:
        etio_table = SigEtioTable
    if template is None:
        template = Sig_uniform

    signatures = list(template.columns)

    # Look each label up once instead of re-slicing the table for every cell.
    labels = [etio_table.loc[sig].iloc[0] for sig in signatures]

    # Match "unknown"/"artefact" case-insensitively: the previous checks mixed
    # 'Unknown' and 'unknown', so the outcome depended on which spelling the
    # table happened to use.
    unresolved = [
        isinstance(label, str) and label.casefold() in ('unknown', 'artefact')
        for label in labels
    ]

    n = len(signatures)
    dist = np.empty((n, n), dtype=float)
    for i in range(n):
        for j in range(n):
            if unresolved[i] or unresolved[j]:
                dist[i, j] = tuning_dist
            elif labels[i] == labels[j]:
                dist[i, j] = 0.01
            else:
                dist[i, j] = 1.0

    np.fill_diagonal(dist, 0)  # a signature is at distance 0 from itself
    return pd.DataFrame(dist, columns=template.columns, index=template.index)

# One aetiology matrix per tuning distance 0.10, 0.15, ..., 0.90.
# The values are derived from integer percent steps: repeatedly adding 0.05 to a
# float drifts (0.15000000000000002, 0.49999999999999994, ...) and ends on
# 0.9000000000000002, which fails `<= 0.9` and silently drops the 0.9 matrix.
Sig_funcs = []
for percent in range(10, 91, 5):
    tuning_dist = percent / 100
    Sig_func = build_Sig_func(tuning_dist)
    Sig_funcs.append(Sig_func)

len(Sig_funcs)

In [None]:
### Hybrid distance

## Start from the first aetiology matrix (Sig_funcs[0]). Every cell that holds
## exactly 0.1 there is replaced by the cosine distance of the same signature
## pair; all remaining cells keep their aetiology-based value.
mask = Sig_funcs[0].eq(0.1)
Sig_hybrid = Sig_funcs[0].mask(mask, Sig_cosine)
Sig_hybrid



In [None]:

def draw_distmap(distMat, distType, aetiology_info, figsize=(10, 10)):
    """
    Draw, save and show a clustered heatmap of a signature distance matrix,
    with row/column colour bars that encode each signature's aetiology.

    Parameters
    ----------
    distMat : pandas.DataFrame
        Distance matrix to cluster; each index label is looked up in
        ``aetiology_info``.
    distType : str
        Name of the distance, used as the prefix of the saved PDF file name.
    aetiology_info : dict
        Maps a signature name to an aetiology label (a key of the colour
        table defined below).
    figsize : tuple of float, optional
        Figure size passed to ``sns.clustermap``. The default (10, 10) is the
        size the plots were produced with before this parameter existed.

    Raises
    ------
    KeyError
        If a signature of ``distMat.index`` is missing from ``aetiology_info``
        or its aetiology has no entry in the colour table.
    """
    # NOTE: no plt.figure() here -- sns.clustermap creates its own figure, so a
    # separate plt.figure(...) call only leaves an empty, orphaned figure behind.

    # Colour per aetiology. Every aetiology needs its own colour, otherwise it
    # cannot be told apart in the colour bars and the legend (POLEerase and
    # Haloalkane_expo used to share '#7f7f7f' with Artefact).
    aetiology_colors = {
        'Clock-like': '#1f77b4',          # Blue
        'Apobec': '#ff7f0e',              # Orange
        'HR': '#2ca02c',                  # Green
        'Tobacco': '#d62728',             # Red
        'MMR': '#9467bd',                 # Purple
        'UV_light': '#8c564b',            # Brown
        'POLE': '#e377c2',                # Pink
        'BER': '#17becf',                 # Cyan
        'Platinum_treat': '#bcbd22',      # Yellow
        'POLEerase': '#aec7e8',           # Light Blue
        'Temozolomide_treat': '#ff9896',  # Light Red
        'Reactive_oxygen': '#c5b0d5',     # Lavender
        'Aristolochic_acid': '#ffbb78',   # Light Orange
        'Aflatoxin_expo': '#c49c94',      # Light Brown
        'Chemotherapy': '#f7b6d2',        # Light Pink
        'Azathioprine_treat': '#8ca252',  # Olive Green
        'Haloalkane_expo': '#98df8a',     # Light Green
        'unknown': '#c0c0c0',             # Gray
        'Artefact': '#7f7f7f',            # Darker gray
    }

    # One colour per signature, in the order of the matrix index
    row_colors = pd.Series(
        [aetiology_colors[aetiology_info[sig]] for sig in distMat.index],
        index=distMat.index
    )

    # Draw the clustermap; the same colours annotate rows and columns
    g = sns.clustermap(distMat, annot=False, cmap='viridis', linewidths=.5,
                       xticklabels=True, yticklabels=True,
                       row_colors=row_colors,
                       col_colors=row_colors,
                       figsize=figsize)

    # Shrink the tick labels so that every signature name stays readable
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), fontsize=7)
    g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), fontsize=7)

    # Legend: one line handle per aetiology colour
    handles = [plt.Line2D([0], [0], color=color, lw=4) for color in aetiology_colors.values()]
    labels = list(aetiology_colors.keys())
    plt.legend(handles, labels, title="Aetiology", loc='upper left', bbox_to_anchor=(19, 0))

    plt.savefig(f'{distType} distance between SBS signatures_exper01_updated.pdf',
                format="pdf", dpi=299, bbox_inches="tight")
    plt.show()

# Plot the Hybrid distance matrix with aetiology colour bars
draw_distmap(distMat=Sig_hybrid, distType="Hybrid", aetiology_info=aetiology_info)

In [None]:
### Draw the map of distance matrix between signatures

def draw_distmap(distMat, distType, figsize=(10, 10)):
    """
    Draw, save and show a clustered heatmap of a signature distance matrix.

    Parameters
    ----------
    distMat : pandas.DataFrame
        Distance matrix to cluster and plot.
    distType : str
        Name of the distance; used in the plot title and as the prefix of the
        saved PDF file name.
    figsize : tuple of float, optional
        Figure size passed to ``sns.clustermap``. The default (10, 10) is the
        size the plots were produced with before this parameter existed.
    """
    # NOTE: no plt.figure() here -- sns.clustermap creates its own figure, so a
    # separate plt.figure(...) call only leaves an empty, orphaned figure behind
    # on every call. `fmt` is likewise omitted: it only formats annotations and
    # annot is False.
    g = sns.clustermap(distMat, annot=False, cmap='viridis', linewidths=.5,
                       xticklabels=True, yticklabels=True,
                       figsize=figsize)

    # Shrink the tick labels so that every signature name stays readable
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), fontsize=7)
    g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), fontsize=7)

    plt.title(f'{distType} distance between SBS signatures', size=15, loc='left')
    plt.savefig(f'{distType} distance between SBS signatures_exper01.pdf', format="pdf", dpi=299, bbox_inches="tight")
    plt.show()

# Draw plots for Cosine and Hybrid
distMats = [Sig_cosine, Sig_hybrid]
distTypes = ['Cosine', 'Hybrid']
for distMat, distType in zip(distMats, distTypes):
    draw_distmap(distMat, distType)

# Draw plots for Aetiology distance
# Labels cover the tuning distances 0.10, 0.15, ..., 0.90 and are derived from
# integer percent steps: accumulating `+= 0.05` on a float ends on
# 0.9000000000000002 and never produces the 0.90 label. Building the list
# directly also avoids binding a variable called `type`, which would shadow the
# builtin for the rest of the notebook session.
distMats = Sig_funcs
distTypes = [
    f'Etiology distance with thirdDist={percent / 100:.2f}'
    for percent in range(10, 91, 5)
]

for distMat, distType in zip(distMats, distTypes):
    draw_distmap(distMat, distType)

In [None]:
### Save the distance matrices

## For Cosine and Hybrid distance
distMats = [Sig_cosine, Sig_hybrid]
distTypes = ['Cosine', 'Hybrid']
for distMat, distType in zip(distMats, distTypes):
    distMat.to_csv(f'{distType} distance between SBS signatures_exper01.csv')

## For etiological distance
# File-name labels cover the tuning distances 0.10, 0.15, ..., 0.90 and are
# derived from integer percent steps: accumulating `+= 0.05` on a float ends on
# 0.9000000000000002 and never produces the 0.90 label. Building the list
# directly also avoids binding a variable called `type`, which would shadow the
# builtin for the rest of the notebook session.
distMats = Sig_funcs
distTypes = [
    f'Etiology Distance with thirdDist={percent / 100:.2f}'
    for percent in range(10, 91, 5)
]
for distMat, distType in zip(distMats, distTypes):
    distMat.to_csv(f'{distType} distance between SBS signatures_exper01.csv')
    