## This notebook explains how to prepare the exposure data of mutational signatures in TCGA cancer samples and the etiology information table of the signatures
The files 'clinical_PANCAN_patient_with_followup.tsv' and 'TCGA_WES_sigProfiler_SBS_signatures_in_samples.csv' were downloaded from ICGC/PCAWG.

In [None]:
import pandas as pd

In [None]:
### Prepare the file 'TCGA_SBS_Exposures_in_Samples_new.csv'

## Clinic data
clinic = pd.read_csv('clinical_PANCAN_patient_with_followup.tsv', sep='\t', encoding='unicode_escape', low_memory=False)

clinic['clinic_ID'] = clinic['bcr_patient_barcode']
clinic
# for column in clinic.columns:
#    print(column)

## Exposure of signatures in samples
sigs = pd.read_csv('TCGA_WES_sigProfiler_SBS_signatures_in_samples.csv')
sigs['clinic_ID'] = sigs['Sample Names'].str.slice(start=0, stop=12)
sigs

## taking the overlaps of the clinic_ID
clinic['clinic_ID'] = clinic['clinic_ID'].astype(str)
sigs['clinic_ID'] = sigs['clinic_ID'].astype(str)

overlap = set(clinic['clinic_ID']).intersection(set(sigs['clinic_ID']))
len(overlap) # 9244

clinic_new = clinic[clinic['clinic_ID'].isin(overlap)]
print(clinic_new.shape)
clinic_new.to_csv('TCGA_Clinic_Info_new.csv') # save to file

sigs_new = sigs[sigs['clinic_ID'].isin(overlap)]

## drop the duplicated rows based on 'clinic_ID'

sigs_new = sigs_new.copy()
sigs_new.drop_duplicates(subset=['clinic_ID'], keep='last', inplace=True)
print(sigs_new.shape)
sigs_new.to_csv('TCGA_SBS_Exposures_in_Samples_new.csv') # save to file


In [None]:
### The aetiologies of signatures according to Alexandrov et al. (Nature 2020)
### SBS5 was set to be of unknown etiology

aetilogy_info = {
    'SBS1': 'Clock-like', # Deamination of 5-methylcytosine
    
    'SBS2': 'APOBEC', # APOBEC activity
    'SBS13': 'APOBEC',  # APOBEC activity
    
    'SBS3': 'HR', # Defective HR DNA repaire; BRCA1/2 mutation
    
    'SBS4': 'Tobacco', # Smoking
    'SBS29': 'Tobacco', # Smoking
    
    'SBS6': 'MMR', # Defective DNA mismatch repair()
    'SBS14': 'MMR', # POLE mutation and mismatch repair deficiency
    'SBS15': 'MMR', # Defective DNA mismatch repair
    'SBS20': 'MMR', # POLD1 mutation and mismatch repair deficiency
    'SBS21': 'MMR', # Defective DNA mismatch repair
    'SBS26': 'MMR', # Defective DNA mismatch repair
    'SBS44': 'MMR', # Defective DNA mismatch repair
    
    'SBS7a':'UV_light', # Ultraviolet light exposure
    'SBS7b':'UV_light', # Ultraviolet light exposure
    'SBS7c':'UV_light', # Ultraviolet light exposure
    'SBS7d':'UV_light', # Ultraviolet light exposure
    'SBS38':'UV_light', # Indirect effect of ultravioleat light
    
    'SBS10a': 'POLE', # POLE mutation
    'SBS10b': 'POLE', # POLE mutation
    
    'SBS30': 'BER', # Defective base excision repair; NTHL1 mutation
    'SBS36': 'BER', # Defective base excision repair; MUTYH mutation
    
    'SBS31': 'Platinum_treat', # Platinum treatment
    'SBS35': 'Platinum_treat', # Platinum treatment
    
    'SBS9': 'POLEerase', # In part, POLEerase activity
    'SBS11': 'Temozolomide_treat', # Temozolomide treatment
    'SBS18': 'Reactive_oxygen', # Reactive oxygen species
    'SBS22': 'Aristolochic_acid', # Aristolochic acid exposure
    'SBS24': 'Aflatoxin_expo', # Aflatoxin exposure
    'SBS25': 'Chemotherapy', # Chemotherapy
    'SBS32': 'Azathioprine_treat', # Azathioprine treatment
    'SBS42': 'Haloalkane_expo', # Haloalkane exposure
    
    'SBS5': 'unknown', 
    
    'SBS8': 'unknown',
    'SBS12': 'unknown',
    'SBS16': 'unknown', 
    'SBS17a': 'unknown',
    'SBS17b': 'unknown',
    'SBS19': 'unknown', 
    'SBS23': 'unknown', 
    'SBS28': 'unknown',
    'SBS33': 'unknown', 
    'SBS34': 'unknown', 
    'SBS37': 'unknown', 
    'SBS39': 'unknown',
    'SBS40': 'unknown',
    'SBS41': 'unknown',
    
    
    'SBS27':'Artefact', 
    'SBS43':'Artefact', 
    'SBS45':'Artefact',
    'SBS46':'Artefact',
    'SBS47':'Artefact',
    'SBS48':'Artefact',
    'SBS49':'Artefact',
    'SBS50':'Artefact',
    'SBS51':'Artefact',
    'SBS52':'Artefact',
    'SBS53':'Artefact',
    'SBS54':'Artefact',
    'SBS55':'Artefact',
    'SBS56':'Artefact',
    'SBS57':'Artefact',
    'SBS58':'Artefact',
    'SBS59':'Artefact',
    'SBS60':'Artefact', 
    'SBS84':'Artefact',
    'SBS85':'Artefact', 
}


SigEtioTable = pd.DataFrame.from_dict(aetilogy_info, orient='index', columns=['Etiology'])
SigEtioTable.to_csv('Etiology Information of Signatures_SBS5_Unknown_20240527.csv')

SigEtioTable