In [1]:
import pandas as pd

# Input locations, relative to the notebook's working directory.
variants_path = '../analysis/merged_variants_anno.txt'
metadata_path = "../Metadata.xlsx"

# Annotated variant table: tab-separated, header on the first line, first
# column used as the row index.
mutations = pd.read_csv(variants_path, sep="\t", header=0, index_col=0)

# Sample metadata spreadsheet.
metasamples = pd.read_excel(metadata_path)

In [None]:
# Convert matrix to unique VAF value per cell (mean of VAFs)
import numpy as np

# Threshold applied to the first ';'-separated field of each call
# (presumably the read depth -- confirm against the merged-variants format).
FILTER_DP = 25

# Minimum merged VAF for a cell to be kept; smaller values are reported as 0.0.
FILTER_VAF = 5.0

# Effect terms that make a variant eligible for the MAF table.
FILTER_EFFECTS = [
    'chromosome',
    'duplication',
    'inversion',
    'exon_loss_variant',
    'frameshift_variant',
    'missense_variant',
    'protein_protein_contact',
    'structural_interaction_variant',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'stop_lost',
    'start_lost',
    'stop_gained',
]

def unify_vaf(str_vaf, min_dp=None, min_vaf=None):
    """Collapse a multi-call cell into a single, filtered VAF.

    ``str_vaf`` is a '|'-separated list of calls. For each call the second
    ':'-separated segment is split on ';': field 0 is parsed as an integer and
    compared against ``min_dp`` (presumably the read depth -- confirm against
    the input format), field 2 is parsed as the VAF.

    Calls whose field 0 is below ``min_dp`` are not dropped; they contribute
    0.0 to the mean, so a low-depth call pulls the merged value down.

    Parameters
    ----------
    str_vaf : str, float or None
        Cell content. 'Na', 'NA', NaN and None are treated as "no call";
        pandas.read_csv turns 'NA' and empty cells into NaN by default, so a
        string-only check is not enough.
    min_dp : int, optional
        Threshold for field 0; defaults to the module-level FILTER_DP.
    min_vaf : float, optional
        Minimum merged VAF; defaults to the module-level FILTER_VAF.

    Returns
    -------
    float
        Mean of the per-call values rounded to 2 decimals, or 0.0 when the
        cell is missing or the mean is below ``min_vaf``.
    """
    if min_dp is None:
        min_dp = FILTER_DP
    if min_vaf is None:
        min_vaf = FILTER_VAF

    # Missing cells may arrive as placeholder strings or as NaN/None.
    if pd.isna(str_vaf) or str_vaf in ('Na', 'NA'):
        return 0.0

    values = []
    for call in str_vaf.split('|'):
        fields = call.split(':')[1].split(';')
        values.append(float(fields[2]) if int(fields[0]) >= min_dp else 0.0)

    merged_vaf = np.round(np.mean(values), 2)
    return merged_vaf if merged_vaf >= min_vaf else 0.0

def unify_vaf_unfilter(str_vaf):
    """Collapse a multi-call cell into a single VAF without any filtering.

    ``str_vaf`` is a '|'-separated list of calls. For each call the second
    ':'-separated segment is split on ';' and field 2 is parsed as the VAF.

    Parameters
    ----------
    str_vaf : str, float or None
        Cell content. 'Na', 'NA', NaN and None are treated as "no call";
        pandas.read_csv turns 'NA' and empty cells into NaN by default, so a
        string-only check is not enough.

    Returns
    -------
    float
        Mean of the per-call VAFs rounded to 2 decimals, or 0.0 when the cell
        is missing.
    """
    # Missing cells may arrive as placeholder strings or as NaN/None.
    if pd.isna(str_vaf) or str_vaf in ('Na', 'NA'):
        return 0.0

    values = [float(call.split(':')[1].split(';')[2]) for call in str_vaf.split('|')]
    return np.round(np.mean(values), 2)

def get_maf(ref, alt):
    """Return the variant-type label for a reference/alternate allele pair.

    The label depends only on the two allele lengths:

    * equal lengths: 'SNP' (1), 'DNP' (2), 'TNP' (3), otherwise 'ONP';
    * reference longer and alternate of length 1: 'DEL';
    * alternate longer and reference of length 1: 'INS';
    * any other length change: 'INDEL'.
    """
    n_ref, n_alt = len(ref), len(alt)

    if n_ref == n_alt:
        # Same-length substitutions are named after the number of bases.
        substitution_names = {1: 'SNP', 2: 'DNP', 3: 'TNP'}
        return substitution_names.get(n_alt, 'ONP')

    if n_ref > n_alt and n_alt == 1:
        return 'DEL'

    if n_ref < n_alt and n_ref == 1:
        return 'INS'

    return 'INDEL'


# Columns from position 14 onward are treated as the per-sample call columns.
sample_columns = mutations.columns[14:]


def _collapse_sample_columns(frame, converter):
    """Return a copy of ``frame`` with ``converter`` applied to every sample cell.

    Series.map is applied column by column rather than DataFrame.applymap,
    which is deprecated in recent pandas releases; this form works on both
    old and new versions.
    """
    collapsed = frame.copy()
    collapsed.loc[:, sample_columns] = frame.loc[:, sample_columns].apply(
        lambda column: column.map(converter))
    return collapsed


# Convert vcalls to a single VAF (no DP filter)
mutations_vaf = _collapse_sample_columns(mutations, unify_vaf_unfilter)
mutations_vaf.to_csv('../analysis/merged_variants_anno_vaf.txt', sep="\t", index=True, header=True)

# Convert vcalls to a single VAF (with DP filter)
mutations_vaf = _collapse_sample_columns(mutations, unify_vaf)


# Create a MAF matrix (applying filters with GNOMAD filter)
#
# Records are collected in a plain list and turned into a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0, and growing a frame one
# row at a time copies it on every iteration.
MAF_COLUMNS = ['Hugo_Symbol',
               'Tumor_Sample_Barcode',
               'Chromosome',
               'Start_Position',
               'Reference_Allele',
               'Tumor_Seq_Allele2',
               'Variant_Type']
NO_ANNOTATION = ['Na', '.']  # placeholder values meaning "annotation absent"
GNOMAD_MAX_FREQ = 0.01       # variants with a GNOMAD value at or above this are skipped

# Columns from position 14 onward are treated as the per-sample VAF columns.
sample_columns = mutations_vaf.columns[14:]

# A column is "tumor only" when the metadata has no gDNA sample for its
# patient (the text after the last '-' in the column name). This depends only
# on the column, so it is resolved once here instead of once per variant.
# NOTE(review): assumes PATIENT_ID is stored as text matching that suffix -- confirm.
tumor_only_by_column = dict()
for column in sample_columns:
    patient = column.split('-')[-1]
    sample_types = metasamples[metasamples['PATIENT_ID'] == patient]['SAMPLE_TYPE']
    tumor_only_by_column[column] = not any(sample_types == 'gDNA')

to_keep = list()
maf_records = list()
for index, row in mutations_vaf.iterrows():
    has_cosmic = row['COSMI70'] not in NO_ANNOTATION
    has_gnomad = row['GNOMAD'] not in NO_ANNOTATION
    # str() keeps a missing (NaN) CLINVAR cell from raising on the membership test.
    has_clinvar = 'Pathogenic' in str(row['CLINVAR'])

    # Skip variants whose GNOMAD value reaches the frequency cut-off.
    if has_gnomad and float(row['GNOMAD']) >= GNOMAD_MAX_FREQ:
        continue

    gene = row['GENE']

    # The index key has the form "<chrom>:<position> <ref>><alt>".
    chrm, rest = index.split(':')
    start, alleles = rest.split()
    ref, alt = alleles.split('>')
    start = int(start)

    # EFFECT separates entries with '|' and combined terms with '&'; keep the
    # variant only if at least one term is in FILTER_EFFECTS.
    effect_terms = {term
                    for entry in row['EFFECT'].split('|')
                    for term in entry.split('&')}
    if effect_terms.isdisjoint(FILTER_EFFECTS):
        continue

    # Strip the leading base of simple insertions/deletions, writing '-' for
    # the allele that becomes empty, and shift the start past that base.
    # NOTE(review): start is advanced for INS as well as DEL -- confirm this
    # matches the coordinate convention the downstream MAF consumer expects.
    vtype = get_maf(ref, alt)
    if vtype == 'DEL':
        alt = '-' if len(alt) == 1 else alt[1:]
        ref = ref[1:]
        start += 1
    elif vtype == 'INS':
        ref = '-' if len(ref) == 1 else ref[1:]
        alt = alt[1:]
        start += 1

    # Without a CLINVAR 'Pathogenic' or COSMIC annotation, calls in tumor-only
    # columns are ignored.
    is_annotated = has_clinvar or has_cosmic

    has_mutations = False
    for column, value in row.iloc[14:].items():
        if tumor_only_by_column[column] and not is_annotated:
            continue

        if float(value) >= FILTER_VAF and 'gDNA' not in column:
            maf_records.append({'Hugo_Symbol': gene,
                                'Tumor_Sample_Barcode': column,
                                'Chromosome': chrm,
                                'Start_Position': start,
                                'Reference_Allele': ref,
                                'Tumor_Seq_Allele2': alt,
                                'Variant_Type': vtype})
            has_mutations = True
    if has_mutations:
        to_keep.append(index)

maf_df = pd.DataFrame(maf_records, columns=MAF_COLUMNS)

# Keep only the variants that produced at least one MAF record.
mutations_vaf = mutations_vaf.reindex(to_keep)
mutations_vaf = mutations_vaf.drop(['NUM_SAMPLES',
                                    'NUM_SAMPLES_GENE',
                                    'NUM_PATIENTS',
                                    'NUM_PATIENTS_GENE'],
                                   axis='columns')
mutations_vaf.to_csv('../analysis/merged_variants_anno_vaf_filtered.txt', sep="\t", index=True, header=True)

maf_df['Start_Position'] = pd.to_numeric(maf_df['Start_Position'])
maf_df.to_csv('../analysis/merged_maf.txt', sep="\t")

In [None]:
import signatureanalyzer as sa

# Reload the MAF table from disk so this cell can run on its own.
maf_path = '../analysis/merged_maf.txt'
maf_df = pd.read_csv(maf_path, sep="\t", index_col=False, header=0)

# 2bit genome file handed to signatureanalyzer as `hgfile`.
genome_2bit = '../analysis/hg38.2bit'

# Spectra for the 'cosmic3_exome' setting; only the second returned item is kept.
_, spectra_snvs = sa.spectra.get_spectra_from_maf(
    maf_df,
    hgfile=genome_2bit,
    cosmic='cosmic3_exome',
)
spectra_snvs.to_csv('../analysis/merged_maf_spectra_snvs.txt', sep="\t", index=True, header=True)

# Spectra for the 'cosmic3_ID' setting.
_, spectra_indels = sa.spectra.get_spectra_from_maf(
    maf_df,
    hgfile=genome_2bit,
    cosmic='cosmic3_ID',
)
spectra_indels.to_csv('../analysis/merged_maf_spectra_indels.txt', sep="\t", index=True, header=True)