# VARify


## Workflow


<img src="wf.png" width="600">

# Functions

In [453]:
# Utility File

# libraries
import warnings
import pandas as pd

asciiDict = {chr(i):(i-33) for i in range(33,74)}

def varify(x, mp):
    """Run the VARify pipeline over a merged snpEff/mpileup table.

    Parameters
    ----------
    x : pandas.DataFrame
        Merged SNP table; every row is handed to ``snpDict``.
    mp : pandas.DataFrame
        mpileup table. NOTE: this body never references ``mp``;
        ``snpDict`` reads the module-level names ``alt_dict``, ``pos``
        and ``mp`` instead, so those must be set before calling.

    Returns
    -------
    pandas.DataFrame
        The annotated table restricted to (and ordered by) the report
        columns listed below.
    """
    # One single-row frame per SNP, stacked into one table.
    per_snp = pd.concat(list(x.apply(snpDict, axis=1)))
    annotated = pd.merge(x, per_snp, on='snp_pos')

    # Row-wise annotation steps, applied in this exact order.
    for step in (translate, altValidate, codonVarify, aaVarify):
        annotated = annotated.apply(step, axis=1)
    annotated = annotated.drop(columns='pileup', axis=1)

    report_columns = [
        'chr_id', 'snp_pos', 'ref_allele', 'alt_allele',
        'gene_id', 'mrna_id', 'prot_id', 'strand', 'effect',
        'snp_cds_pos',
        'codon1_genome_pos', 'codon2_genome_pos', 'codon3_genome_pos',
        'snp_aa_pos',
        'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa',
        'pos1_pileup', 'pos2_pileup', 'pos3_pileup',
        'varify_codon', 'varify_allele', 'varify_aa',
        'nt_VARified', 'codon_VARified', 'aa_VARified',
        'comment',
    ]
    return annotated[report_columns]



def snpDict(x):
    """Build a one-row DataFrame describing the RNA-supported codon for one SNP.

    ``x`` is one row of the merged SNP table. The body reads its
    ``codon1_genome_pos`` / ``codon2_genome_pos`` / ``codon3_genome_pos``,
    ``ref_codon``, ``snp_pos`` and ``ref_allele`` fields.

    NOTE: three names used below are neither parameters nor locals and must
    exist as globals before this function is called:
      * ``alt_dict`` - mapping queried by genomic position for an allele,
      * ``pos``      - ``False`` or a container of positions (tested with ``in``),
      * ``mp``       - table with ``snp_pos`` and ``pileup`` columns.

    Returns a single-row ``pd.DataFrame`` with the keys ``pos1_pileup``,
    ``pos2_pileup``, ``pos3_pileup``, ``varify_codon``, ``snp_pos``,
    ``varify_allele`` and ``comment``.
    """
    # Genomic coordinates of the three bases of the codon that holds the SNP
    p3 = int(x.codon3_genome_pos)
    p2 = int(x.codon2_genome_pos)
    p1 = int(x.codon1_genome_pos)
    # '.' is the placeholder used when alt_dict has no entry for a position
    subDict = {p1 : '.', p2 : '.', p3 : '.'}

    
    # Per-position allele from alt_dict, or '.' when absent; `res` is the
    # three characters joined in codon order (e.g. '..A')
    res_dict = {key: alt_dict.get(key, subDict[key]) for key in subDict}
    res = ''.join(res_dict.values())
    
    pos_check = False
    pos_list = [p1, p2, p3]
    
    # If any codon position is listed in the global `pos`, remember that
    # (pos_check) and collect the non-'.' alleles of this codon in pos_dict.
    # pos_dict is only defined when pos_check ends up True.
    if pos != False: 
        for p in pos_list:
            if p in pos:
                pos_check = True
                pos_dict = {key: alt_dict.get(key, subDict[key]) for key in pos_list}
                pos_dict = {key:val for key, val in pos_dict.items() if val != '.'}
    
    # starts from reference 
    ref = list(x['ref_codon']) 
    
    # Output record and bookkeeping flags
    snp_dict = {}
    multi_check = False
    comment=''
    flag = ''
    
    # Copy the raw pileup string of each codon position into the record
    for i in range(len(pos_list)):
        cname = ['pos1_pileup', 'pos2_pileup', 'pos3_pileup']
        
        try:
            snp_dict[cname[i]] = mp[mp.snp_pos == pos_list[i]].pileup.tolist()[0]
        except:
            # Any failure of the lookup (bare except) is recorded as "*"
            snp_dict[cname[i]] = "*"
        
    
    # Fewer than two '.' placeholders means at least two codon positions
    # carry an allele in alt_dict
    if (list(res)).count(".") < 2:
        multi_check = True
        if pos_check == False:
            # no codon position was listed in `pos`
            flag = 'complex_codon'
        elif pos_check == True: 
            flag = f'NA'
            comment=f'SNP(s) {pos_dict} only in RNA'
        
    if multi_check == True: 
        # No codon is assembled here: varify_codon carries the flag instead
        snp_dict['varify_codon'] = flag
        snp_dict['snp_pos'] = x['snp_pos']
        current_pos = int(x['snp_pos'])
        # Direct indexing: raises KeyError if alt_dict has no entry for the SNP
        snp_dict['varify_allele'] = alt_dict[int(current_pos)] # fill with current pos snp
        snp_dict['comment'] = comment
                                      
    else:
        current_pos = int(x['snp_pos'])
        # Index (0-2) of the SNP inside the codon; raises ValueError if
        # snp_pos is not one of the three codon positions
        idx = list(subDict.keys()).index(current_pos)
        # Substitute the alt_dict allele into the reference codon; on any
        # failure (bare except) fall back to the reference allele
        try:
            ref[idx] = alt_dict[current_pos]
            snp_dict['varify_allele'] = alt_dict[int(current_pos)] # fill with current pos snp
        except: 
            ref[idx] = x.ref_allele
            snp_dict['varify_allele'] = x.ref_allele
            
        
        snp_dict['varify_codon'] = ''.join(ref)
        snp_dict['snp_pos'] = x['snp_pos']
        snp_dict['comment'] = comment

        
    # Single-row frame so the caller can pd.concat the per-SNP results
    snp_df = pd.DataFrame([snp_dict])


    return snp_df

def has_numbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

def getAlt(x):
    """Set ``x['varify_allele']`` to the most frequent base in ``x['pileup']``.

    The pileup string is scanned once, following the samtools pileup
    conventions:

    * ``^`` plus the following character (a mapping quality) is skipped,
      so a quality character such as ``A`` or ``5`` is never mistaken for
      a base or an indel length;
    * ``+N<bases>`` / ``-N<bases>`` indels are skipped entirely, including
      multi-digit lengths;
    * only A/C/G/T are counted, case-insensitively (lower case marks the
      reverse strand); ``.``, ``,``, ``*``, ``$``, ``N`` etc. are ignored.

    Parameters
    ----------
    x : mapping-like (e.g. a pandas row)
        Must provide a ``'pileup'`` item and accept item assignment.

    Returns
    -------
    The same ``x`` with ``'varify_allele'`` set to an upper-case base, or to
    ``None`` when no A/C/G/T is present (or the pileup is not a string).
    Ties are resolved in favour of the base seen first.
    """
    import re
    from collections import Counter

    indel_length = re.compile(r'\d+')
    bases = {'A', 'C', 'G', 'T'}

    pileup = x['pileup']
    counts = Counter()

    if isinstance(pileup, str):
        i, end = 0, len(pileup)
        while i < end:
            ch = pileup[i]
            if ch == '^':
                # read-start marker: the next character is a mapping quality
                i += 2
                continue
            if ch in '+-':
                length = indel_length.match(pileup, i + 1)
                if length:
                    # jump over the digits and the inserted/deleted bases
                    i = length.end() + int(length.group())
                    continue
            base = ch.upper()
            if base in bases:
                counts[base] += 1
            i += 1

    # Count every observation (not the de-duplicated set) so the result
    # really is the most frequent allele.
    x['varify_allele'] = counts.most_common(1)[0][0] if counts else None
    return x

def read_vcf(path):
    """Load a VCF file into a DataFrame.

    Meta-information lines (starting with ``##``) are discarded; the
    ``#CHROM ...`` header line becomes the column header and ``#CHROM`` is
    renamed to ``CHROM``.

    Parameters
    ----------
    path : str
        Path of the VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per VCF record; ``POS`` is parsed as int, the other fixed
        VCF columns as str.
    """
    # `io` is not imported at file level, so bring it into scope here.
    import io

    with open(path, 'r') as f:
        lines = [line for line in f if not line.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

def checkRNADict(x, mp):
    """Compare the positions found by ``altDict`` in ``x`` and in ``mp``.

    Returns a 2-tuple ``(rna_only, mp_dict)``:

    * ``rna_only`` is ``False`` when every key of ``altDict(mp)`` is also a
      key of ``altDict(x)``; otherwise it is the list of keys present only
      in ``altDict(mp)``;
    * ``mp_dict`` is ``altDict(mp)`` itself.
    """
    dna_calls = altDict(x)    # for DNA
    rna_calls = altDict(mp)   # for RNA

    rna_only = set(rna_calls) - set(dna_calls)
    if not rna_only:
        return False, rna_calls
    # positions that carry an allele in the RNA pileup only
    return list(rna_only), rna_calls

def readMP(x):
    """Read a headerless, tab-separated mpileup file into a DataFrame.

    The first five columns are named ``chr``, ``snp_pos``, ``ref``,
    ``reads`` and ``pileup``; any further columns keep their integer
    labels. Rows with fewer than one read and rows containing any missing
    value are dropped.

    Quoting is disabled while parsing: pileup and quality strings may
    legitimately contain ``"``, which the default CSV quoting would treat
    as the start of a quoted field and thereby swallow or merge rows.

    Parameters
    ----------
    x : str
        Path of the mpileup file.

    Returns
    -------
    pandas.DataFrame
    """
    import csv
    import pandas as pd

    mp = pd.read_table(x, header=None, quoting=csv.QUOTE_NONE)
    mp = mp.rename(columns={0: "chr", 1: "snp_pos", 2: "ref", 3: "reads", 4: "pileup"})
    mp = mp[mp.reads >= 1]
    mp = mp.dropna(axis=0)
    return mp
    

def aaVarify(x):
    """Flag whether the RNA-derived amino acid equals the predicted one.

    Writes 'Yes' or "No" to ``x['aa_VARified']`` depending on whether
    ``x['varify_aa']`` equals ``x['alt_aa']``, then returns ``x``.
    """
    if x['varify_aa'] == x['alt_aa']:
        verdict = 'Yes'
    else:
        verdict = "No"
    x['aa_VARified'] = verdict
    return x

def codonVarify(x):
    """Flag whether the RNA-derived codon equals the predicted alt codon.

    Writes 'Yes' or 'No' to ``x['codon_VARified']`` depending on whether
    ``x['varify_codon']`` equals ``x['alt_codon']``, then returns ``x``.
    """
    same_codon = x['varify_codon'] == x['alt_codon']
    if same_codon:
        x['codon_VARified'] = 'Yes'
    else:
        x['codon_VARified'] = 'No'
    return x

def translate(x):
    """Translate ``x['varify_codon']`` into a one-letter amino acid.

    The result is stored in ``x['varify_aa']``; stop codons map to '*'
    and anything that is not one of the 64 upper-case DNA codons (for
    example a flag such as 'complex_codon') maps to the empty string.
    Returns ``x``.
    """
    codon_to_aa = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
        'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
    }

    # Unknown codons fall back to the empty string.
    x['varify_aa'] = codon_to_aa.get(x['varify_codon'], "")
    return x


# For every row in here, we're going to apply this uniqueSNP function

def mergeTab(mp, so):
    """Attach the pileup string of each SNP position to the snpEff table.

    Inner-joins ``so`` with the ``snp_pos`` and ``pileup`` columns of
    ``mp`` on ``snp_pos`` and returns the merged DataFrame.
    """
    pileup_by_pos = mp[['snp_pos', 'pileup']]
    return so.merge(pileup_by_pos, on='snp_pos')


def varOnly(a):
    '''
    purpose: keep only the nucleotide characters of a pileup
    input: string of mpileup
    output: list of the characters of ``a`` that are A, C, T or G in
            either case, in their original order and original case
    '''
    nucleotides = ('A', 'C', 'T', 'G')
    kept = []
    for symbol in a:
        if symbol.upper() in nucleotides:
            kept.append(symbol)
    return kept

def isMono(ref, temp):
    """Report whether ``ref`` occurs at least twice among the items of ``temp``.

    Emits a warning and returns True when two or more items equal ``ref``;
    returns False otherwise.
    """
    matches = sum(1 for snp in temp if snp == ref)
    if matches < 2:
        return False
    warnings.warn("More than 2 monomorphic alleles at position.")
    return True

    

def altValidate(x):
    """Flag whether the predicted alt allele matches the RNA-derived allele.

    Writes 'Yes' or 'No' to ``x['nt_VARified']`` depending on whether
    ``x['alt_allele']`` equals ``x['varify_allele']``, then returns ``x``.
    """
    if x['alt_allele'] == x['varify_allele']:
        x['nt_VARified'] = 'Yes'
    else:
        x['nt_VARified'] = 'No'
    return x



def effect(x):
    """Placeholder effect annotation: set ``x['effect']`` to '_'.

    Classification into missense / nonsense is not implemented yet.
    Returns ``x`` like the other row-wise helpers, so the function can be
    used with ``DataFrame.apply(..., axis=1)`` or as ``df = effect(df)``
    without replacing the data by ``None``.
    """
    # missense
    # nonsense
    x['effect'] = '_'
    return x
    

###################Older, probably outdated############################

# Codon Combos
import numpy as np
import math 



def altDict(x):
    '''
    Takes a table with ``snp_pos`` and ``pileup`` columns (mpileup-like)
    Out: dictionary mapping snp_pos -> varify_allele, without the
         positions whose allele is missing
    '''
    # getAlt adds a 'varify_allele' value to every row
    called = x.apply(getAlt, axis=1)

    # Build the full mapping first (a repeated position keeps its last
    # allele), then drop the entries whose allele is NA.
    by_position = dict(zip(called['snp_pos'], called['varify_allele']))
    return {
        position: allele
        for position, allele in by_position.items()
        if not pd.isna(allele)
    }
    

def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with ``sep``,
    e.g. ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``.

    Parameters
    ----------
    d : mapping
        Mapping to flatten; keys are concatenated, so they must be strings.
    parent_key : str
        Prefix prepended (with ``sep``) to every key of ``d``.
    sep : str
        Separator placed between key components.

    Returns
    -------
    dict
    """
    # The ABC aliases in `collections` were removed in Python 3.10;
    # `collections.abc` is the supported location.
    from collections.abc import MutableMapping

    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

---

# Case 1

Consider the following variant, which introduces a G->A substitution.
 
VCF:
Chr1	1100023      	.       	G     	A      	60    	PASS	.       	GT    	0/1
 
* Genotype: G/A
* Reference base: G
* Alternate base: A
* Predicted reference codon: TGG (tryptophan)
* Predicted alternate codon: TGA (premature termination codon)
 
A tool like snpEff can be used to predict the effect of the mutation using a reference genome annotation (e.g., GFF). In this case the mutation occurs in a coding exon, at the third position of a codon that codes for tryptophan in the reference proteome and becomes a premature termination codon with the alternate allele. snpEff will determine the reference and alternate codons and return both codons together with their translations.
 
The predicted effect is therefore a nonsense mutation.

## Goal Example
| chr_id|snp_pos     | Ref     | Pred_Alt |Varify_Alt|VARified|ref_codon|varify_codon|aa|effect|
| -----------| ----------- | ----------- | ----------- | ----------- | ----------- |----------- |----------- |----------- |----------- |
|Chr1|1100023| G      | A       |A|Yes| TGG | TGA |*|stop_gain|

## Read in the Toy Mpileup File

In [530]:
# mpileup 
mp1 = readMP('../data/test/case1_pileup.txt')

mp1

Unnamed: 0,0,snp_pos,ref,3,pileup
0,Chr1,1100021,T,12,.............
1,Chr1,1100022,G,13,.............
2,Chr1,1100023,G,13,...A...A..A..


## Read in VCF File

In [531]:
# The toy VCF
import pandas as pd
toy_vcf = pd.read_table('../data/toy/toy.vcf')

toy_vcf

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NA00001
0,Chr1,1100023,.,G,A,60,PASS,.,GT,0/1
1,Chr2,312000,.,G,A,60,PASS,.,GT,0/1
2,Chr2,312001,.,T,C,55,PASS,.,GT,0/1


## Toy SNPeff Table

In [532]:
# Toy Table
toy_snpeff = pd.read_table('../data/toy/toy.snp_effects.tsv')
toy_snpeff

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,1100021,1100022,1100023,7,TGG,TGA,W,*
1,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,312000,312001,312002,2,GTA,ATA,V,I
2,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,312000,312001,312002,2,GTA,GCA,V,A


## Merge the mpileup & snpeff table

In [533]:
# if pileup information is . at the previous positions, output the reference, else variant
def mergeTab(mp, so):
    """Attach the reference base and pileup string of each SNP position.

    Inner-joins ``so`` with the ``snp_pos``, ``ref`` and ``pileup``
    columns of ``mp`` on ``snp_pos`` and returns the merged DataFrame.
    """
    pileup_by_pos = mp[['snp_pos', 'ref', 'pileup']]
    return so.merge(pileup_by_pos, on='snp_pos')

# Merging the two tables
x = mergeTab(mp1, toy_snpeff)

# dictionary of alternate codons from pileup
mp_dict = altDict(mp1) # needs to be based on mp1

In [534]:
# merged tables
x

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa,ref,pileup
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,1100021,1100022,1100023,7,TGG,TGA,W,*,G,...A...A..A..


In [535]:
# dictionary of alts from merged tables
mp_dict

{1100023: 'A'}

In [536]:
'''
Using this information, 
we're going to create our varify codon based on any
existing alts on all positions. 

If the position does not exist, we take it from the reference.
'''
alt_dict = mp_dict

# a_unique = getAlt(x) # Replaced with alt_dict[3]

# Codon coordinates. This walkthrough assumes the SNP is the third base of
# its codon (true for the Case 1 toy record) and that x holds one record.
# .iloc[0] extracts the scalar explicitly; int() on a Series is deprecated.
p3 = int(x['snp_pos'].iloc[0])
p2 = p3 - 1
p1 = p2 - 1

# '.' marks "no alternate allele recorded at this position"
subDict = {p1: '.', p2: '.', p3: '.'}

# Start from the reference codon and overwrite the positions that have an alt
ref = str(x['ref_codon'].iloc[0])
temp = list(ref)

# Add values from full dictionary if keys are in sub
res = {key: alt_dict.get(key, subDict[key]) for key in subDict}
res = ''.join(res.values())

for n in range(3):
    if res[n] != '.':
        temp[n] = res[n]

snp_dict = {
    'snp_pos': p3,
    'varify_allele': alt_dict[p3],  # alt of the 'current' position
    'varify_codon': ''.join(temp),
}

# snp_dict output: {'snp_pos': 1100023, 'varify_allele': 'A', 'varify_codon': 'TGA'}

# DataFrame.append was removed in pandas 2.0; build the one-row frame directly.
df = pd.DataFrame([snp_dict])
df = pd.merge(x, df, on='snp_pos')
df = df.apply(axis=1, func=altValidate)


In [537]:
df

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,...,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa,ref,pileup,varify_allele,varify_codon,nt_VARified
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,...,7,TGG,TGA,W,*,G,...A...A..A..,A,TGA,Yes


In [603]:
df = df.apply(axis=1, func=translate)
#df = effect(df)

reorg = ['chr_id', 'snp_pos', 'ref_allele', 'alt_allele', 
         'varify_allele', 'nt_VARified', 'ref_codon', 'varify_codon', 'varify_aa', 'comment'
        ]

df = df[reorg]

df

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,varify_allele,nt_VARified,ref_codon,varify_codon,varify_aa,comment
0,Chr1,1100023,G,A,A,Yes,TGG,ND,,RNA_{pos}


# testing the case2 function here

In [715]:
# let's just do 1 line
x = x.iloc[[0]]

# snpDict reads the globals `mp`, `pos` and `alt_dict`; define all three here
# so the cell does not depend on whatever an earlier cell left behind.
mp = mp1
# pos: False or the list of positions with an alt only in RNA
# alt_dict: dictionary of alternate alleles from the pileup
pos, alt_dict = checkRNADict(x, mp)

# running the script
snp_df = x.apply(axis=1, func=snpDict)
snp_df = pd.concat(list(snp_df))
df = pd.merge(x, snp_df, on='snp_pos')

# Add the dictionary
df = df.apply(axis=1, func=translate)
df = df.apply(axis=1, func=altValidate)
df = df.apply(axis=1, func=codonVarify)
df = df.apply(axis=1, func=aaVarify)

reorg = ['chr_id', 'snp_pos', 'ref_allele', 'alt_allele', 
         #'gene_id', 'mrna_id','prot_id', 'strand', 
         'effect', #'snp_cds_pos', 
         'codon1_genome_pos','codon2_genome_pos', 'codon3_genome_pos', 
         #'snp_aa_pos', 
         'ref_codon','alt_codon', 'ref_aa', 'alt_aa', 'varify_codon', 'varify_allele',
         'varify_aa', 'nt_VARified', 'codon_VARified', 'aa_VARified']
df = df.drop(columns='pileup', axis=1)
df[reorg]

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,effect,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,ref_codon,alt_codon,ref_aa,alt_aa,varify_codon,varify_allele,varify_aa,nt_VARified,codon_VARified,aa_VARified
0,Chr1,1100023,G,A,stop_gain,1100021,1100022,1100023,TGG,TGA,W,*,TGA,A,*,Yes,Yes,Yes


# Case 1.b and Case 1.c 

To determine if the codon is VARified, VARify will look at the two other positions in the same codon (the ‘non-focal positions’) and determine if ONLY the reference nucleotide is observed in RNA. 

For example, if both non-focal positions are monomorphic for the reference nucleotide in the RNA mpileup (i.e., the reference nucleotide from VCF in snpeff table), then the codon is verified and the “codon_VARified” column will be set to ‘yes’. The Case 1 example above would have the codon_VARified column set to ‘yes’.

All other situations will have the “codon_VARified” column set to “no”. These could include cases in which one (or both) non-focal positions have two or more nucleotides in the RNA mpileup (note: this is NOT shown in the Case 1 example above). As we discussed, this could occur frequently because of sequencing errors. We will call this “Case 1b”. 

Another situation might be if one (or both) non-focal positions are monomorphic for a non-reference nucleotide. This also would be inconsistent with the codon reported in alt_codon column and the “codon_VARified” column should be reported as “no”. We will call this “Case 1c”

Case 1b:

Given: VCF same as Case 1.
RNA Mpileup:
|chromosome_id|snp_pos|ref_allele|pos|pileup||
|-----|-----|-----|-----|-----|-----|
|Chr1|1100021|T|12|.......|other columns|
|Chr1|1100022|G|13|..TT..|other columns|
|Chr1|1100023|G|13|..A..A..A..|other columns|


Case 1c:

Given: VCF same as Case 1
RNA Mpileup:
|chromosome_id|snp_pos|ref_allele|pos|pileup||
|-----|-----|-----|-----|-----|-----|
|Chr1|1100021|T|12|CCCCCC|other columns|
|Chr1|1100022|G|13|...|other columns|
|Chr1|1100023|G|13|..A..A..A..|other columns|

*note: in Case 1c, the mpileup does not have the reference nucleotide at non-focal position 1100021. Therefore the alt_codon (TGA) is not verified (even though both non-focal positions are monomorphic in the RNA mpileup).


## Goal Example
| chr_id|snp_pos     | Ref     | Pred_Alt |Varify_Alt|VARified|ref_codon|varify_codon|pred_codon|varify_aa|effect|alt_codons|comment|
| ---| --- | --- | --- | --- | --- |--- |--- |--- |--- |---|--|--|
|Chr2|312000| G      | A       |A|Yes| GTA | ATA |T|*|||Pos1: A; Pos2: C
|Chr2|312001| T      | C       |C|Yes| GTA | GCA |T|*|||

In [716]:
mp_1b = readMP('../data/toy/toy.case1b.pileup')
mp_1c = readMP('../data/toy/toy.case1c.pileup')

mp_1b


Unnamed: 0,0,snp_pos,ref,3,pileup,5
0,Chr1,1100021,T,12,"......,.,.,.",<<<<<;<<<;<<
1,Chr1,1100022,G,13,".,.,..TT.,...",=&;<<<;<<<<<<
2,Chr1,1100023,G,13,"...A...A..A,.",==<;;;<<<<<<<


In [717]:
mp_1c

Unnamed: 0,0,snp_pos,ref,3,pileup,5
0,Chr1,1100021,T,12,CCCCCCCCCCCC,<<<<<;<<<;<<
1,Chr1,1100022,G,13,".,.,.....,...",=&;<<<;<<<<<<
2,Chr1,1100023,G,13,"...A...A..A,.",==<;;;<<<<<<<


In [718]:
x_1c = mergeTab(mp_1c, toy_snpeff)
x_1c

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa,ref,pileup
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,1100021,1100022,1100023,7,TGG,TGA,W,*,G,"...A...A..A,."


There are two issues
1. When there are two snps in one codon found in both DNA & RNA
    * Solution: output complex codon
2. When there are two snps found in RNA only 
    * Solution: Output comment - POS2 in RNA

# Case 1b

In [872]:
x_1b = mergeTab(mp_1b, toy_snpeff)

x = x_1b 
mp = mp_1b

# checkRNADict returns a pair: (positions with an alt only in RNA, or False)
# and the pileup allele dictionary. Unpack it - storing the whole tuple in
# `pos` makes snpDict's `p in pos` test fail for every position.
pos, alt_dict = checkRNADict(x, mp)

# running the script
#df.apply(some_func(row, var1='DOG'), axis=1)
snp_df = x.apply(axis=1, func=snpDict)
snp_df = pd.concat(list(snp_df))
df = pd.merge(x, snp_df, on='snp_pos')

# Add the dictionary
df = df.apply(axis=1, func=translate)
df = df.apply(axis=1, func=altValidate)
df = df.apply(axis=1, func=codonVarify)
df = df.apply(axis=1, func=aaVarify)
df = df.drop(columns='pileup', axis=1)

df

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,...,alt_codon,ref_aa,alt_aa,varify_codon,varify_allele,varify_aa,nt_VARified,codon_VARified,aa_VARified,comment
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,...,TGA,W,*,ND,A,,Yes,No,No,SNP(s) {1100022: 'T'} only in RNA


# Case 1c

In [879]:
x_1c = mergeTab(mp_1c, toy_snpeff)

x = x_1c 
mp = mp_1c

# checkRNADict returns (RNA-only positions or False, pileup allele dict);
# unpack both so `pos` is not a tuple when snpDict tests `p in pos`.
pos, alt_dict = checkRNADict(x, mp)

# running the script
varify(x, mp)

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,...,alt_codon,ref_aa,alt_aa,varify_codon,varify_allele,varify_aa,nt_VARified,codon_VARified,aa_VARified,comment
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,...,TGA,W,*,ND,A,,Yes,No,No,SNP(s) {1100021: 'C'} only in RNA


---

# Case 2: Multi-substitution case

In the case of multi substitution, we have alts on pos1 and pos2 of the codon. 
We want to capture this information, although pos3 has no alt

## Case2 Mpileup

In [808]:
mp2 = readMP('../data/toy/toy.case2.pileup')
mp2

Unnamed: 0,0,snp_pos,ref,3,pileup,5
0,Chr2,312000,G,4,.A.A,<<<;
1,Chr2,312001,T,3,..C,<;<
2,Chr2,312002,A,3,...,<<<


## Case2 Snpeff-like table

In [809]:
toy_snpeff2 = pd.read_table('../data/toy/toy.snp_effects.tsv') # toy_snpeff
toy_snpeff2

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,1100021,1100022,1100023,7,TGG,TGA,W,*
1,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,312000,312001,312002,2,GTA,ATA,V,I
2,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,312000,312001,312002,2,GTA,GCA,V,A


## Goal Example
| chr_id|snp_pos     | Ref     | Pred_Alt |Varify_Alt|VARified|ref_codon|varify_codon|pred_codon|varify_aa|effect|alt_codons|flag|
| ---| --- | --- | --- | --- | --- |--- |--- |--- |--- |---|--|--|
|Chr2|312000| G      | A       |A|Yes| GTA | ATA |T|*|||
|Chr2|312001| T      | C       |C|Yes| GTA | GCA |T|*|||

## Merging mpileup and snpeff together

In [810]:
# Merging the two tables
x2 = mergeTab(mp2, toy_snpeff2)

# dictionary of alternate codons from pileup
mp_dict2 = altDict(x2)

mp_dict2

{312000: 'A', 312001: 'C'}

In [811]:
x2

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa,ref,pileup
0,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,312000,312001,312002,2,GTA,ATA,V,I,G,.A.A
1,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,312000,312001,312002,2,GTA,GCA,V,A,T,..C


In [812]:
import itertools

permutations = list(itertools.product(["A","G"], ["T","C"], ["A"]))

print(permutations)

["".join(x) for x in permutations]

[('A', 'T', 'A'), ('A', 'C', 'A'), ('G', 'T', 'A'), ('G', 'C', 'A')]


['ATA', 'ACA', 'GTA', 'GCA']

In [813]:
alt_dict = altDict(x2)
alt_dict

{312000: 'A', 312001: 'C'}

## Running Case 2

Things to Add: 
    * A control if nothing exists

In [821]:
# snpDict reads the globals `mp`, `pos` and `alt_dict`: point them at the
# Case 2 data. checkRNADict returns (RNA-only positions or False, allele
# dict), so unpack it instead of storing the tuple in `pos`.
mp = mp2
pos, alt_dict = checkRNADict(x2, mp2)
snp_df = x2.apply(axis=1, func=snpDict)
snp_df = pd.concat(list(snp_df))
df = pd.merge(x2, snp_df, on='snp_pos')

# Add the dictionary
df = df.apply(axis=1, func=altValidate)
df = df.apply(axis=1, func=translate)

# reorg = ['chr_id', 'snp_pos', 
#          'ref_allele', 'alt_allele','varify_allele', 'VARified', 
#          'codon1_genome_pos', 'codon2_genome_pos', 'codon3_genome_pos',
#          'ref_codon', 'alt_codon','varify_codon', 
#          'ref_aa', 'alt_aa', 'varify_aa', #"alt_codons",'flag'
#          #'effect'
#         ]

df


Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,...,alt_codon,ref_aa,alt_aa,ref,pileup,varify_codon,varify_allele,comment,nt_VARified,varify_aa
0,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,...,ATA,V,I,G,.A.A,complex_codon,A,,Yes,
1,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,...,GCA,V,A,T,..C,complex_codon,C,,Yes,


# Case 2.5 - Both Case 1 and Case 2 on one 

In [947]:
# need to merge the mpileups
mp3 = pd.concat([mp_1b, mp2])

# this will allow us to create one dictionary
mp3

Unnamed: 0,0,snp_pos,ref,3,pileup,5
0,Chr1,1100021,T,12,"......,.,.,.",<<<<<;<<<;<<
1,Chr1,1100022,G,13,".,.,..TT.,...",=&;<<<;<<<<<<
2,Chr1,1100023,G,13,"...A...A..A,.",==<;;;<<<<<<<
0,Chr2,312000,G,4,.A.A,<<<;
1,Chr2,312001,T,3,..C,<;<
2,Chr2,312002,A,3,...,<<<


In [948]:
# toysnpeff2 has the snptable info for everything
toy_snpeff2

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,1100021,1100022,1100023,7,TGG,TGA,W,*
1,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,312000,312001,312002,2,GTA,ATA,V,I
2,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,312000,312001,312002,2,GTA,GCA,V,A


In [959]:
# Merging the two tables
mp = mp3
x = mergeTab(mp3, toy_snpeff2)
# checkRNADict returns (RNA-only positions or False, pileup allele dict);
# unpack both so `pos` is not a tuple when snpDict tests `p in pos`.
pos, alt_dict = checkRNADict(x, mp3)

# running the script
snp_df = x.apply(axis=1, func=snpDict)
snp_df = pd.concat(list(snp_df))
df = pd.merge(x, snp_df, on='snp_pos')

# Add the dictionary
df = df.apply(axis=1, func=translate)
df = df.apply(axis=1, func=altValidate)
df = df.apply(axis=1, func=codonVarify)
df = df.apply(axis=1, func=aaVarify)
df = df.drop(columns='pileup', axis=1)

df

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,...,pos1_pileup,pos2_pileup,pos3_pileup,varify_codon,varify_allele,comment,varify_aa,nt_VARified,codon_VARified,aa_VARified
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,...,"......,.,.,.",".,.,..TT.,...","...A...A..A,.",,A,SNP(s) {1100022: 'T'} only in RNA,,Yes,No,No
1,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,...,.A.A,..C,...,complex_codon,A,,,Yes,No,No
2,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,...,.A.A,..C,...,complex_codon,C,,,Yes,No,No


In [841]:
df = varify(x, mp3)
df.to_csv('../data/output/varify_output.csv', index=False)

In [960]:
varify(x, mp3)

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,...,alt_codon,ref_aa,alt_aa,varify_codon,varify_allele,varify_aa,nt_VARified,codon_VARified,aa_VARified,comment
0,Chr1,1100023,G,A,gene1,mrna1,prot1,+,stop_gain,21,...,TGA,W,*,,A,,Yes,No,No,SNP(s) {1100022: 'T'} only in RNA
1,Chr2,312000,G,A,gene2,mrna2,prot2,+,nonsynonymous,4,...,ATA,V,I,complex_codon,A,,Yes,No,No,
2,Chr2,312001,T,C,gene2,mrna2,prot2,+,nonsynonymous,5,...,GCA,V,A,complex_codon,C,,Yes,No,No,


---

# Real World Data

## Reading in the Mpileup

In [7]:
mp3 = readMP('../data/test/ENCFF283TLK_chr1.mpileup')
mp3.head()

Unnamed: 0,chr,snp_pos,ref,reads,pileup,5,6
149,chr1,10150.0,C,1.0,.,:,?
150,chr1,10151.0,T,1.0,.,:,?
170,chr1,10171.0,A,1.0,.,6,?
171,chr1,10172.0,C,1.0,.,9,?
172,chr1,10173.0,C,1.0,.,:,?


In [139]:
sub_m3 = mp3[mp3.snp_pos.isin(range(69265, 4712659))]
sub_m3.head()

Unnamed: 0,chr,snp_pos,ref,reads,pileup,5,6
37869,chr1,69265.0,C,1.0,.,9,7
37874,chr1,69270.0,A,1.0,*,C,Z
37875,chr1,69271.0,C,2.0,*.,C9,Z8
37876,chr1,69272.0,T,2.0,",.",C@,Z8
37877,chr1,69273.0,C,2.0,",.",G8,Z8


In [182]:
mp[mp.snp_pos==69451]

Unnamed: 0,chr,snp_pos,ref,reads,pileup,5,6
38055,chr1,69451.0,A,1.0,",",E,Z


## Reading in the SnpEff-like Table

In [9]:
snptable3 = pd.read_table('../data/test/ENCFF541HLI.snp_effects.chr1_only_10kgenic_snps.no_intron.no_utr.tsv')
snptable3.head()

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa
0,chr1,69270,A,G,gene-OR4F5,rna-NM_001005484.2,protein_id,+,synonymous,243,69268,69269,69270,81,TCA,TCG,S,S
1,chr1,69453,G,A,gene-OR4F5,rna-NM_001005484.2,protein_id,+,synonymous,426,69451,69452,69453,142,AAG,AAA,K,K
2,chr1,69511,A,G,gene-OR4F5,rna-NM_001005484.2,protein_id,+,nonsynonymous,484,69511,69512,69513,162,ACA,GCA,T,A
3,chr1,69897,T,C,gene-OR4F5,rna-NM_001005484.2,protein_id,+,synonymous,870,69895,69896,69897,290,TCT,TCC,S,S
4,chr1,942451,T,C,gene-SAMD11,rna-NM_001385640.1,protein_id,+,nonsynonymous,1519,942451,942452,942453,507,TGG,CGG,W,R


In [422]:
mp = sub_m3
x = mergeTab(sub_m3, snptable3)

In [None]:
pos, alt_dict = checkRNADict(x, mp)

In [None]:
df = varify(x[0:100], mp)
df.head()

In [None]:
df.to_csv('../data/output/varify_output_RWDsubset.csv', header=True)