### BCC Signature 

In [1]:
import numpy as np
import pandas as pd

In [2]:
#### BCC_ALL_MUTATIONS.tsv is a big file, in order to save memory, only few columns are extracted as follow: 
#### awk -F'\t' 'BEGIN { OFS = "\t"; ORS = "\n" } {print $1,$5,$6,$7,$8,$9,$10,$18,$19,$21,$54,$55}' BCC_filtered_ALL_MUTATIONS.txt > BCC_ALL_MUTATIONS.jx.tsv

# load BCC data file
muts_df = pd.read_csv("BCC_ALL_MUTATIONS.jx.tsv", sep='\t',header=0)

In [3]:
muts_df.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,context,signature_context
0,DNM3,1,172100820,172100820,+,Intron,SNP,,,TATTGATGCACAGTTAATAAA,ACA,ACA_C>T
1,KMO,1,241712290,241712290,+,Intron,SNP,,,cttgaggccgggagtttgaga,CCC,CCC_C>T
2,DTNB,2,25602424,25602424,-,Intron,SNP,,,TCAGGAGAAGGAAAGTCATTA,TCC,TCC_C>T
3,FAHD2A,2,96079744,96079744,+,3'UTR,SNP,,,CACTGATGGCAAGCTTTGGGT,TTG,TTG_T>A
4,KANSL3,2,97267966,97267966,-,Missense_Mutation,SNP,c.2369G>A,p.G790D,AGGAGTGGCACCCAAGGAGGA,ACC,ACC_C>T


In [4]:
#### cases without signature context
set(muts_df[muts_df['signature_context'] == '-']["Variant_Type"])

{'DEL', 'DNP', 'INS', 'TNP'}

In [5]:
#### drop the cases without signature context
muts_df = muts_df[muts_df['signature_context'] != '-']

In [6]:
#### Types of variant
#### PROBLEM: There is No synonymous mutation
set(muts_df["Variant_Classification"])

{"3'UTR",
 "5'Flank",
 "5'UTR",
 'De_novo_Start_InFrame',
 'De_novo_Start_OutOfFrame',
 'IGR',
 'Intron',
 'Missense_Mutation',
 'Nonsense_Mutation',
 'Nonstop_Mutation',
 'RNA',
 'Silent',
 'Splice_Site',
 'Start_Codon_SNP',
 'lincRNA'}

In [7]:
muts_df[muts_df.Variant_Classification == 'Silent'].head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,context,signature_context
8,TTN,2,179615901,179615901,-,Silent,SNP,c.11226G>A,p.L3742L,CTCTAGAGAACAGAATATCTT,ACA,ACA_C>T
16,ERGIC1,5,172377723,172377723,+,Silent,SNP,c.777C>T,p.I259I,TCTGTGCCATCATTGGCGGGA,TCA,TCA_C>T
26,OR56B3P,11,6149942,6149942,+,Silent,SNP,c.103C>T,p.L35L,CTGGCTCTCCCTGCCCTTAGC,CCT,CCT_C>T
59,ISG15,1,949555,949555,+,Silent,SNP,c.195G>A,p.L65L,GCCAGGGCCTGGGCCCCGGCA,CCA,CCA_C>T
60,AGRN,1,982211,982211,+,Silent,SNP,c.3262T>C,p.L1088L,GCTCGAGCCCTTGGAGGGCAG,CTT,CTT_T>C


In [8]:
#### keep only missense, nonsense, nonstop, splice site
#### There is no synonymous mutation
muts_df = muts_df[muts_df.Variant_Classification.map(lambda x: x in ['Missense_Mutation', 
                                                                     'Nonsense_Mutation', 
                                                                     'Nonstop_Mutation', 
                                                                     'Splice_Site', 
                                                                     'Start_Codon_SNP', 
                                                                     'Silent'])]

In [9]:
#### BCC signature
#### only for all the mutations in the target
muts_gb = muts_df.groupby(['signature_context'])
muts_sign_sizes = muts_gb.size()
signature = muts_sign_sizes/sum(muts_sign_sizes)
print(signature)

signature_context
ACA_C>A    0.002372
ACA_C>G    0.000685
ACA_C>T    0.002413
ACC_C>A    0.001187
ACC_C>G    0.000593
             ...   
TTG_T>C    0.002784
TTG_T>G    0.001036
TTT_T>A    0.003460
TTT_T>C    0.002939
TTT_T>G    0.001409
Length: 96, dtype: float64


In [10]:
#### save BCC signature to csv
signature.to_csv("BCC_signature.tsv", sep="\t", header=None)

### BCC background

In [11]:
## extract penta-nucleotide
penta      = muts_df.ref_context.map(lambda x : x[8:13])

## if lower case then return Reverse Complement 
penta      = penta.map(lambda x : x if x.isupper() else x.replace("t", "A").replace("g", "C").replace("a", "T").replace("c", "G")[::-1])

## left adjacent context and right adjacent context
la_context = penta.map(lambda x : x[:3])
ra_context = penta.map(lambda x : x[2:])

muts_df["penta_context"]          = penta
muts_df["left_adjacent_context"]  = la_context
muts_df["right_adjacent_context"] = ra_context
muts_df.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,context,signature_context,penta_context,left_adjacent_context,right_adjacent_context
4,KANSL3,2,97267966,97267966,-,Missense_Mutation,SNP,c.2369G>A,p.G790D,AGGAGTGGCACCCAAGGAGGA,ACC,ACC_C>T,CACCC,CAC,CCC
5,ANKRD36,2,97827852,97827852,+,Missense_Mutation,SNP,c.1406C>T,p.P469L,AAAGCACTACCAGCAACTGGA,CCA,CCA_C>T,ACCAG,ACC,CAG
7,SCN7A,2,167300176,167300176,-,Missense_Mutation,SNP,c.1637G>A,p.G546E,TGTGAAAATTCCAATGAAAAC,TCC,TCC_C>T,TTCCA,TTC,CCA
8,TTN,2,179615901,179615901,-,Silent,SNP,c.11226G>A,p.L3742L,CTCTAGAGAACAGAATATCTT,ACA,ACA_C>T,AACAG,AAC,CAG
11,UNC80,2,210761082,210761082,+,Missense_Mutation,SNP,c.4313G>A,p.G1438E,GAGCCAGAGGGAATGAGTAAT,TCC,TCC_C>T,GGGAA,GGG,GAA


In [12]:
## types of mutation
muts_df["mutation"] = muts_df.signature_context.map(lambda x: x.split('_')[1])
muts_df["mut_nul"]  = muts_df.mutation.map(lambda x: x.split('>')[1])
muts_df.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,context,signature_context,penta_context,left_adjacent_context,right_adjacent_context,mutation,mut_nul
4,KANSL3,2,97267966,97267966,-,Missense_Mutation,SNP,c.2369G>A,p.G790D,AGGAGTGGCACCCAAGGAGGA,ACC,ACC_C>T,CACCC,CAC,CCC,C>T,T
5,ANKRD36,2,97827852,97827852,+,Missense_Mutation,SNP,c.1406C>T,p.P469L,AAAGCACTACCAGCAACTGGA,CCA,CCA_C>T,ACCAG,ACC,CAG,C>T,T
7,SCN7A,2,167300176,167300176,-,Missense_Mutation,SNP,c.1637G>A,p.G546E,TGTGAAAATTCCAATGAAAAC,TCC,TCC_C>T,TTCCA,TTC,CCA,C>T,T
8,TTN,2,179615901,179615901,-,Silent,SNP,c.11226G>A,p.L3742L,CTCTAGAGAACAGAATATCTT,ACA,ACA_C>T,AACAG,AAC,CAG,C>T,T
11,UNC80,2,210761082,210761082,+,Missense_Mutation,SNP,c.4313G>A,p.G1438E,GAGCCAGAGGGAATGAGTAAT,TCC,TCC_C>T,GGGAA,GGG,GAA,C>T,T


In [13]:
muts_df["signature_penta_context"] = muts_df[["penta_context", "mutation"]].aggregate('_'.join, axis=1)

muts_df.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,context,signature_context,penta_context,left_adjacent_context,right_adjacent_context,mutation,mut_nul,signature_penta_context
4,KANSL3,2,97267966,97267966,-,Missense_Mutation,SNP,c.2369G>A,p.G790D,AGGAGTGGCACCCAAGGAGGA,ACC,ACC_C>T,CACCC,CAC,CCC,C>T,T,CACCC_C>T
5,ANKRD36,2,97827852,97827852,+,Missense_Mutation,SNP,c.1406C>T,p.P469L,AAAGCACTACCAGCAACTGGA,CCA,CCA_C>T,ACCAG,ACC,CAG,C>T,T,ACCAG_C>T
7,SCN7A,2,167300176,167300176,-,Missense_Mutation,SNP,c.1637G>A,p.G546E,TGTGAAAATTCCAATGAAAAC,TCC,TCC_C>T,TTCCA,TTC,CCA,C>T,T,TTCCA_C>T
8,TTN,2,179615901,179615901,-,Silent,SNP,c.11226G>A,p.L3742L,CTCTAGAGAACAGAATATCTT,ACA,ACA_C>T,AACAG,AAC,CAG,C>T,T,AACAG_C>T
11,UNC80,2,210761082,210761082,+,Missense_Mutation,SNP,c.4313G>A,p.G1438E,GAGCCAGAGGGAATGAGTAAT,TCC,TCC_C>T,GGGAA,GGG,GAA,C>T,T,GGGAA_C>T


In [14]:
STOP      = "STOP"

MISSENSE  = "Missense_Mutation" # "missense"
NONSENSE  = "Nonsense_Mutation" # "nonsense"
SYNONYMOUS= "Silent" # "coding-synon"

cDNA_genetic_code = {"G": 
                        {"G": {"G": "Gly", "A": "Gly", "C": "Gly", "T": "Gly"},
                         "A": {"G": "Glu", "A": "Glu", "C": "Asp", "T": "Asp"},
                         "C": {"G": "Ala", "A": "Ala", "C": "Ala", "T": "Ala"},
                         "T": {"G": "Val", "A": "Val", "C": "Val", "T": "Val"}},
                     "A": 
                        {"G": {"G": "Arg", "A": "Arg", "C": "Ser", "T": "Ser"},
                         "A": {"G": "Lys", "A": "Lys", "C": "Asn", "T": "Asn"},
                         "C": {"G": "Thr", "A": "Thr", "C": "Thr", "T": "Thr"},
                         "T": {"G": "Met", "A": "Ile", "C": "Ile", "T": "Ile"}},
                     "C": 
                        {"G": {"G": "Arg", "A": "Arg", "C": "Arg", "T": "Arg"},
                         "A": {"G": "Gln", "A": "Gln", "C": "His", "T": "His"},
                         "C": {"G": "Pro", "A": "Pro", "C": "Pro", "T": "Pro"},
                         "T": {"G": "Leu", "A": "Leu", "C": "Leu", "T": "Leu"}},
                     "T": 
                        {"G": {"G": "Trp", "A":  STOP, "C": "Cys", "T": "Cys"},
                         "A": {"G":  STOP, "A":  STOP, "C": "Tyr", "T": "Tyr"},
                         "C": {"G": "Ser", "A": "Ser", "C": "Ser", "T": "Ser"},
                         "T": {"G": "Leu", "A": "Leu", "C": "Phe", "T": "Phe"}}}

def func_mut_type(ref_cnt: str, mut_cnt: str):
    if cDNA_genetic_code[ref_cnt[0]][ref_cnt[1]][ref_cnt[2]] != STOP and cDNA_genetic_code[mut_cnt[0]][mut_cnt[1]][mut_cnt[2]] == STOP: 
        return NONSENSE
    elif cDNA_genetic_code[ref_cnt[0]][ref_cnt[1]][ref_cnt[2]] != cDNA_genetic_code[mut_cnt[0]][mut_cnt[1]][mut_cnt[2]]:
        return MISSENSE
    elif cDNA_genetic_code[ref_cnt[0]][ref_cnt[1]][ref_cnt[2]] == cDNA_genetic_code[mut_cnt[0]][mut_cnt[1]][mut_cnt[2]]: 
        return SYNONYMOUS
   
#### reverse complement 
def rc(x):
    return x.lower().replace("t", "A").replace("g", "C").replace("a", "T").replace("c", "G")[::-1]

In [15]:
la_mut_type = muts_df.signature_penta_context.map(lambda x : func_mut_type(x[0:3], x[0:2] + x[-1]))
sg_mut_type = muts_df.signature_penta_context.map(lambda x : func_mut_type(x[1:4], x[1] + x[-1] + x[3]))
ra_mut_type = muts_df.signature_penta_context.map(lambda x : func_mut_type(x[2:5], x[-1] + x[3:5]))

#### reverse complement
la_rc_mut_type = muts_df.signature_penta_context.map(lambda x : func_mut_type(rc(x[0:3]), rc(x[0:2] + x[-1])))
sg_rc_mut_type = muts_df.signature_penta_context.map(lambda x : func_mut_type(rc(x[1:4]), rc(x[1]   + x[-1] + x[3])))
ra_rc_mut_type = muts_df.signature_penta_context.map(lambda x : func_mut_type(rc(x[2:5]), rc(x[-1]  + x[3:5])))

muts_df["left_adjacent_mut_type"] = la_mut_type
muts_df["signautre_mut_type"]     = sg_mut_type
muts_df["right_adjacent_mut_type"]= ra_mut_type

#### reverse complement
muts_df["left_adjacent_rc_mut_type"] = la_rc_mut_type
muts_df["signautre_rc_mut_type"]     = sg_rc_mut_type
muts_df["right_adjacent_rc_mut_type"]= ra_rc_mut_type

muts_df.head()


Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,...,right_adjacent_context,mutation,mut_nul,signature_penta_context,left_adjacent_mut_type,signautre_mut_type,right_adjacent_mut_type,left_adjacent_rc_mut_type,signautre_rc_mut_type,right_adjacent_rc_mut_type
4,KANSL3,2,97267966,97267966,-,Missense_Mutation,SNP,c.2369G>A,p.G790D,AGGAGTGGCACCCAAGGAGGA,...,CCC,C>T,T,CACCC_C>T,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent
5,ANKRD36,2,97827852,97827852,+,Missense_Mutation,SNP,c.1406C>T,p.P469L,AAAGCACTACCAGCAACTGGA,...,CAG,C>T,T,ACCAG_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Nonsense_Mutation,Silent
7,SCN7A,2,167300176,167300176,-,Missense_Mutation,SNP,c.1637G>A,p.G546E,TGTGAAAATTCCAATGAAAAC,...,CCA,C>T,T,TTCCA_C>T,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Nonsense_Mutation
8,TTN,2,179615901,179615901,-,Silent,SNP,c.11226G>A,p.L3742L,CTCTAGAGAACAGAATATCTT,...,CAG,C>T,T,AACAG_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Missense_Mutation,Silent
11,UNC80,2,210761082,210761082,+,Missense_Mutation,SNP,c.4313G>A,p.G1438E,GAGCCAGAGGGAATGAGTAAT,...,GAA,C>T,T,GGGAA_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation


In [16]:
muts_df[muts_df.Variant_Classification != muts_df.signautre_mut_type].head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,ref_context,...,right_adjacent_context,mutation,mut_nul,signature_penta_context,left_adjacent_mut_type,signautre_mut_type,right_adjacent_mut_type,left_adjacent_rc_mut_type,signautre_rc_mut_type,right_adjacent_rc_mut_type
8,TTN,2,179615901,179615901,-,Silent,SNP,c.11226G>A,p.L3742L,CTCTAGAGAACAGAATATCTT,...,CAG,C>T,T,AACAG_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Missense_Mutation,Silent
14,EPM2AIP1,3,37033404,37033404,-,Nonsense_Mutation,SNP,c.1165G>T,p.E389*,CGAAGGTGTTCCATAATGTCC,...,CCA,C>A,A,TTCCA_C>A,Missense_Mutation,Missense_Mutation,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Missense_Mutation
16,ERGIC1,5,172377723,172377723,+,Silent,SNP,c.777C>T,p.I259I,TCTGTGCCATCATTGGCGGGA,...,CAT,C>T,T,ATCAT_C>T,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent,Missense_Mutation
26,OR56B3P,11,6149942,6149942,+,Silent,SNP,c.103C>T,p.L35L,CTGGCTCTCCCTGCCCTTAGC,...,CTG,C>T,T,CCCTG_C>T,Silent,Missense_Mutation,Silent,Missense_Mutation,Missense_Mutation,Silent
59,ISG15,1,949555,949555,+,Silent,SNP,c.195G>A,p.L65L,GCCAGGGCCTGGGCCCCGGCA,...,GGG,C>T,T,CTGGG_C>T,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent


In [17]:
muts_df_head = muts_df.head(1000)

## Drop splice sites for this comparasion
muts_df_head = muts_df_head[muts_df_head.Variant_Classification.map(lambda x: x != 'Splice_Site')]

## Question: some Variant Classification correspond to none of the mutation type of the following 6 contexts
problem_set = muts_df_head[muts_df_head.apply(func = lambda x: x.Variant_Classification not in [x.left_adjacent_mut_type, 
                                                                                  x.signautre_mut_type, 
                                                                                  x.right_adjacent_mut_type, 
                                                                                  x.left_adjacent_rc_mut_type, 
                                                                                  x.signautre_rc_mut_type, 
                                                                                  x.right_adjacent_rc_mut_type], 
                                axis = 1)]

# [["context","signature_context","penta_context","left_adjacent_context","right_adjacent_context"]]
# problem_set

In [18]:
## join BCC signature
muts_df = muts_df.set_index("signature_context").join(pd.DataFrame(signature, columns=["BCC_signature"]), how="outer").reset_index()

In [30]:
genes_gb = muts_df.groupby(["Hugo_Symbol"])
gene_EPM2AIP1 = genes_gb.get_group("EPM2AIP1")
gene_EPM2AIP1

Unnamed: 0,signature_context,Hugo_Symbol,Chromosome,Start_position,End_position,Transcript_Strand,Variant_Classification,Variant_Type,cDNA_Change,Protein_Change,...,mutation,mut_nul,signature_penta_context,left_adjacent_mut_type,signautre_mut_type,right_adjacent_mut_type,left_adjacent_rc_mut_type,signautre_rc_mut_type,right_adjacent_rc_mut_type,BCC_signature
88803,CCA_C>T,EPM2AIP1,3,37033751,37033751,-,Nonsense_Mutation,SNP,c.818G>A,p.W273*,...,C>T,T,TCCAA_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Nonsense_Mutation,Silent,0.065905
295373,CTG_T>G,EPM2AIP1,3,37033462,37033462,-,Silent,SNP,c.1107A>C,p.T369T,...,T>G,G,ACTGT_T>G,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent,0.001832
295494,CTG_T>G,EPM2AIP1,3,37033462,37033462,-,Silent,SNP,c.1107A>C,p.T369T,...,T>G,G,ACTGT_T>G,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent,0.001832
369277,TCA_C>T,EPM2AIP1,3,37033258,37033258,-,Silent,SNP,c.1311C>T,p.L437L,...,C>T,T,CTGAG_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Nonsense_Mutation,Silent,0.06851
380142,TCA_C>T,EPM2AIP1,3,37033478,37033478,-,Missense_Mutation,SNP,c.1091C>T,p.S364L,...,C>T,T,CTGAA_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,0.06851
391038,TCA_C>T,EPM2AIP1,3,37033819,37033819,-,Silent,SNP,c.750G>A,p.L250L,...,C>T,T,CTCAA_C>T,Silent,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Silent,Silent,0.06851
406384,TCC_C>A,EPM2AIP1,3,37033404,37033404,-,Nonsense_Mutation,SNP,c.1165G>T,p.E389*,...,C>A,A,TTCCA_C>A,Missense_Mutation,Missense_Mutation,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Missense_Mutation,0.003176
559055,TCC_C>T,EPM2AIP1,3,37033200,37033200,-,Missense_Mutation,SNP,c.1369C>T,p.P457S,...,C>T,T,AGGAT_C>T,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent,0.228051
645001,TCT_C>T,EPM2AIP1,3,37033944,37033944,-,Silent,SNP,c.625C>T,p.L209L,...,C>T,T,CAGAA_C>T,Missense_Mutation,Missense_Mutation,Nonsense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,0.079674
664135,TCT_C>T,EPM2AIP1,3,37032768,37032768,-,Missense_Mutation,SNP,c.1801G>A,p.E601K,...,C>T,T,TTCTC_C>T,Silent,Missense_Mutation,Missense_Mutation,Missense_Mutation,Missense_Mutation,Silent,0.079674


In [32]:
muts_gb = gene_EPM2AIP1.groupby(["Variant_Classification"])

synonymous = muts_gb.get_group(SYNONYMOUS)
nonsense   = muts_gb.get_group(NONSENSE)
missense   = muts_gb.get_group(MISSENSE)

prob_subst_BCC = np.array([sum(synonymous.BCC_signature), sum(nonsense.BCC_signature), sum(missense.BCC_signature)])
fraction_subst_BCC = prob_subst_BCC / sum(prob_subst_BCC)

In [33]:
pd.DataFrame(np.array([prob_subst_BCC, fraction_subst_BCC]), columns=[SYNONYMOUS, NONSENSE, MISSENSE], index=["prob_subst_BCC", "fraction_subst_BCC"])

Unnamed: 0,Silent,Nonsense_Mutation,Missense_Mutation
prob_subst_BCC,0.220358,0.069081,0.378244
fraction_subst_BCC,0.330034,0.103464,0.566503
