## This notebook is used to generate the distance matrices between SBS signatures that are used as cost matrices during EMD calculation between samples regarding signature exposures (for Hartwig data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load the Etiology information of SBS Signatures. 
The file 'Etiology Information of Signatures_SBS5_Unknown_20240527.csv' is generated in the '_Prepare_Signatures_in_Samples_Sigs_Etiology_Info.ipynb' notebook

In [None]:
# Etiology annotation per SBS signature; the table's index holds the signature names.
# 'Unnamed: 0' is presumably the unlabeled index column written when the table was saved.
SigEtioTable = pd.read_csv(
    'Etiology Information of Signatures_SBS5_Unknown_20240527.csv',
    index_col='Unnamed: 0',
)
# Signature names, reused below to select the matching profile columns.
Sigs = SigEtioTable.index
Sigs

### Prepare the three different distance matrices between signatures
The file 'sigProfiler_SBS_signatures_2019_05_22.csv' is downloaded from ICGC/PCAWG

In [None]:
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine

## Load the signature profiles from PCAWG and keep only the signatures
## listed in the etiology table (one column per signature).
Sig_raw = pd.read_csv('sigProfiler_SBS_signatures_2019_05_22.csv')
Sig = Sig_raw[Sigs]

#+++ cosine distance
# Extract the profiles once as a plain array (one row per signature) instead of
# transposing the DataFrame on every iteration, and fill a pre-allocated array
# instead of growing an empty DataFrame one cell at a time.
profiles = Sig.values.T
n_sigs = profiles.shape[0]
cosine_dist = np.empty((n_sigs, n_sigs))

for i in range(n_sigs):
    for j in range(n_sigs):
        cosine_dist[i, j] = cosine(profiles[i], profiles[j])

# Rows and columns are both labelled with the signature names.
Sig_cosine = pd.DataFrame(cosine_dist, index=Sig.columns, columns=Sig.columns)
Sig_cosine


In [None]:
#+++ Define the distance between etiologies
#+++ 0.01 for same etiologies
#+++ 1 for different etiologies
#+++ ...and iterate through a range of 0.1 to 0.9 for the remaining etiologies
#+++ The distance for unknown/artefact signatures was set to 0.5 to minimise their influence on the clustering

# Baseline matrix: 1 everywhere, 0 on the diagonal.
# It is sized from the loaded signature set rather than a hard-coded 67, so the
# DataFrame construction below cannot fail with a shape mismatch when the
# number of signatures changes.
n = len(Sig.columns)
one_data = np.ones(shape=(n, n))
np.fill_diagonal(one_data, 0)

Sig_uniform = pd.DataFrame(one_data, columns=Sig.columns, index=Sig.columns)

def build_Sig_func(tuning_dist):
    """
    Build the aetiology distance matrix between signatures.

    The aetiology of a signature is read from the first column of the
    module-level ``SigEtioTable``. Pairwise distances are then:

    * 0.01 when both signatures have a known aetiology and it is the same,
    * 1.0 when both have a known aetiology and the two differ,
    * ``tuning_dist`` when at least one of the two is 'Unknown' or 'Artefact',
    * 0 on the diagonal.

    Parameters
    ----------
    tuning_dist : float
        Distance assigned to every pair that involves an 'Unknown' or
        'Artefact' signature.

    Returns
    -------
    pandas.DataFrame
        Matrix carrying the index and columns of the module-level
        ``Sig_uniform`` (assumed square, with identical row and column
        labels). ``Sig_uniform`` itself is only read, never modified.
    """
    labels = Sig_uniform.columns
    n = len(labels)

    # Look every signature's aetiology up once, instead of repeating the
    # label-based lookup several times for each of the n * n cells.
    etiologies = [SigEtioTable.loc[label].iloc[0] for label in labels]

    known = np.array(
        [e != 'Unknown' and e != 'Artefact' for e in etiologies],
        dtype=bool,
    )
    # Plain Python equality, element by element, so values compare exactly as
    # they did in the per-cell version.
    same = np.array(
        [[a == b for b in etiologies] for a in etiologies],
        dtype=bool,
    ).reshape(n, n)

    both_known = known[:, None] & known[None, :]
    dist = np.where(both_known, np.where(same, 0.01, 1.0), tuning_dist)

    # Work on a freshly built array: writing into DataFrame.values fails when
    # pandas hands out read-only arrays (Copy-on-Write).
    np.fill_diagonal(dist, 0)  # set the diagonal to 0
    return pd.DataFrame(dist, columns=Sig_uniform.columns, index=Sig_uniform.index)

# Sweep the unknown/artefact distance over 0.10, 0.15, ..., 0.90 (17 values).
# Each value is derived from an integer step: repeatedly adding 0.05 to a float
# accumulates rounding error, so the running value ends up slightly above 0.9
# and a `<= 0.9` test silently drops the last grid point.
Sig_funcs = []
for step in range(17):
    tuning_dist = round(0.1 + 0.05 * step, 2)
    Sig_func = build_Sig_func(tuning_dist)
    Sig_funcs.append(Sig_func)

len(Sig_funcs)

In [None]:
### Hybrid distance matrix

## Pairs of two known signatures keep their Sig_func value; all other pairs take the Sig_cosine value.
# Sig_funcs[0] is the matrix built with a tuning distance of 0.1, so an entry
# equal to 0.1 marks a pair that received the tuning distance.
mask = Sig_funcs[0].eq(0.1)
# Replace the marked entries with the corresponding cosine distances; the
# result is a new DataFrame, so Sig_funcs[0] is left untouched.
Sig_hybrid = Sig_funcs[0].mask(mask, Sig_cosine)
Sig_hybrid



In [None]:
### Draw the map of distance matrix between signatures

def draw_distmap(distMat, distType, figsize=(10, 10)):
    """
    Draw a clustered heatmap of a distance matrix, save it as a PDF and show it.

    Parameters
    ----------
    distMat : pandas.DataFrame
        Distance matrix passed to ``sns.clustermap``.
    distType : str
        Name of the distance; used in the plot title and in the output file name
        ('<distType> distance between SBS signatures_exper01_Hartwig.pdf').
    figsize : tuple of float, optional
        Figure size handed to ``sns.clustermap``. The default (10, 10) is
        seaborn's own default, so existing calls render at the same size as before.
    """
    # sns.clustermap always creates its own figure, so a preceding
    # plt.figure(figsize=...) only left an unused blank figure open; the size
    # has to be given to clustermap itself.
    g = sns.clustermap(
        distMat,
        annot=False,
        cmap='viridis',
        linewidths=.5,
        fmt='.2f',
        xticklabels=True,
        yticklabels=True,
        figsize=figsize,
    )

    # Adjust x-tick and y-tick label sizes
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), fontsize=7)
    g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), fontsize=7)

    plt.title(f'{distType} distance between SBS signatures', size=15, loc='left')
    plt.savefig(f'{distType} distance between SBS signatures_exper01_Hartwig.pdf', format="pdf", dpi=299, bbox_inches="tight")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(g.ax_heatmap.figure)

# Draw plots for Cosine and Hybrid
distMats = [Sig_cosine, Sig_hybrid]
distTypes = ['Cosine', 'Hybrid']
for distMat, distType in zip(distMats, distTypes):
    draw_distmap(distMat, distType)

# Draw plots for Aetiology distance
# Sig_funcs holds one matrix per tuning distance, starting at 0.10 in steps of
# 0.05. Each label is derived from the matrix position, so it always matches
# the matrix it is paired with and no float accumulation is involved (this
# also avoids shadowing the builtin `type` with a loop variable).
distMats = Sig_funcs
distTypes = [
    f'Etiology distance with thirdDist={0.1 + 0.05 * step:.2f}'
    for step in range(len(distMats))
]

for distMat, distType in zip(distMats, distTypes):
    draw_distmap(distMat, distType)

In [None]:
### Save the distance matrix

## For Cosine and Hybrid distance
distMats = [Sig_cosine, Sig_hybrid]
distTypes = ['Cosine', 'Hybrid']
for distMat, distType in zip(distMats, distTypes):
    distMat.to_csv(f'{distType} distance between SBS signatures_exper01_Hartwig.csv')

## For etiological distance
# Sig_funcs holds one matrix per tuning distance, starting at 0.10 in steps of
# 0.05. Each file label is derived from the matrix position, so it always
# matches the matrix being written and no float accumulation is involved (this
# also avoids shadowing the builtin `type` with a loop variable).
distMats = Sig_funcs
distTypes = [
    f'Etiology Distance with thirdDist={0.1 + 0.05 * step:.2f}'
    for step in range(len(distMats))
]
for distMat, distType in zip(distMats, distTypes):
    distMat.to_csv(f'{distType} distance between SBS signatures_exper01_Hartwig.csv')

