# VARify


## Workflow


<img src="wf.png" width="600">

## Exploring Data

In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd

# mpileup 
mp = pd.read_table('../data/test/mpiletest-out.txt', header=None)
mp = mp.rename(columns={1:"snp_pos", 4:"pileup"})

# custom out file
cf = pd.read_table('../data/test/snp_out.txt')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# column names
# cols = ['chr_id', 'snp_pos', 'ref_allele', 
#         'alt_allele', 'gene_id', 'mrna_id',
#         'prot_id', 'strand', 'effect', 
#         'snp_cds_pos', 'codon1_genome_pos',
#         'codon2_genome_pos', 'codon3_genome_pos', 
#         'snp_aa_pos', 'ref_codon','alt_codon', 
#         'ref_aa', 'alt_aa']

sub_cols = ['chr_id', 'snp_pos', 'ref_allele', 
            'alt_allele', 'codon1_genome_pos',
            'codon2_genome_pos', 'codon3_genome_pos', 
            'ref_codon']

# subset of custom file, cf
sub_cf = cf[sub_cols]
sub_cf

## Merging the two dataframes together

In [12]:
sub_mp = mp[['snp_pos','pileup']]
merged_var = pd.merge(cf, sub_mp, on='snp_pos')
merged_var

Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,gene_id,mrna_id,prot_id,strand,effect,snp_cds_pos,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,snp_aa_pos,ref_codon,alt_codon,ref_aa,alt_aa,pileup
0,chr1,521,G,C,seq1,seq1.1,seq1.1,+,synonymous,265,519,520,521,89,AAG,AAC,K,N,".,......C.T.A,..,..,.,.,,^]."
1,chr1,523,A,T,seq1,seq1.1,seq1.1,+,synonymous,267,521,522,523,89,GTA,GTT,E,E,tTTTTTTTTCTTtTTtTCtTtTttC
2,chr1,524,C,G,seq1,seq1.1,seq1.1,+,synonymous,268,522,523,524,89,TAC,TAG,Y,X,",...G.......,.G,..,.,.,,A.."


## Split Dataframe

Splitting the snpeff table by codons, get genome position and alt/ref codon



In [31]:
# Utility File

# libraries
import warnings
import pandas as pd

# For every row in here, we're going to apply this uniqueSNP function

def mergeTab(mp, so):
    sub_mp = mp[['snp_pos', 'pileup']]
    mdf = pd.merge(so, sub_mp, on='snp_pos')
    return mdf


def varOnly(a):
    '''
    purpose: remove non-nt values in pileup
    input: string of mpileup 
    output: nt string
    '''
    x = [i for i in a if i.upper() in ['A','C','T','G']]
    
    return x

def uniqueSNP(x):
    l = str(x['pileup'])
    # make upper
    l = l.upper() # just to make simple
    l = varOnly(l)

    # Get list of unique
    a = [i for i in l]
    a_unique = list(set(a)) 
    
    # create a dictionary
    df = pd.DataFrame()
    temp = [i for i in x['ref_codon']]
    snp_dict = {}
    
    flag = False
    
    if len(a_unique) > 2: 
        warnings.warn("There are more than 2 SNPS at position"); print(x['snp_pos'])
        flag = True

    for snp in a_unique:
        snp_dict['snp_pos'] = int(x['snp_pos'])
        snp_dict['varify_alt'] = snp

        if flag:
            snp_dict['flag'] = '*'
        else:
            snp_dict['flag'] = ''
        
        for i in range(len(temp)):
            temp[i] = snp     
            snp_dict[f"varify_codon{i+1}"] = ''.join(temp)
            temp = [i for i in x['ref_codon']]

        df = df.append(snp_dict, ignore_index=True)
        
        snp_dict = {}

    df = pd.merge(so, df, on='snp_pos')
    return df


def varify(x):
    df = x.apply(axis=1, func=uniqueSNP)
    df = pd.concat(list(df))
    reorg = ['chr_id', 'snp_pos', 'ref_allele', 'alt_allele', 
             'varify_alt', 'codon1_genome_pos','codon2_genome_pos',
             'codon3_genome_pos', 'ref_codon', 'varify_codon1', 
             'varify_codon2', 'varify_codon3','flag']

    return df[reorg]

In [33]:
mdf = mergeTab(mp, so)
varify(merged_var)

521




Unnamed: 0,chr_id,snp_pos,ref_allele,alt_allele,varify_alt,codon1_genome_pos,codon2_genome_pos,codon3_genome_pos,ref_codon,varify_codon1,varify_codon2,varify_codon3,flag
0,chr1,521,G,C,A,519,520,521,AAG,AAG,AAG,AAA,*
1,chr1,521,G,C,T,519,520,521,AAG,TAG,ATG,AAT,*
2,chr1,521,G,C,C,519,520,521,AAG,CAG,ACG,AAC,*
0,chr1,523,A,T,T,521,522,523,GTA,TTA,GTA,GTT,
1,chr1,523,A,T,C,521,522,523,GTA,CTA,GCA,GTC,
0,chr1,524,C,G,A,522,523,524,TAC,AAC,TAC,TAA,
1,chr1,524,C,G,G,522,523,524,TAC,GAC,TGC,TAG,


## Running script

In [29]:
%reload_ext autoreload
%autoreload 2

%run '../script/main.py' --in_mp '../data/test/mpiletest-out.txt' --in_so '../data/test/snp_out.txt'

521
  chr_id  snp_pos ref_allele alt_allele varify_alt  codon1_genome_pos  \
0   chr1      521          G          C          A                519   
1   chr1      521          G          C          T                519   
2   chr1      521          G          C          C                519   
0   chr1      523          A          T          T                521   
1   chr1      523          A          T          C                521   
0   chr1      524          C          G          A                522   
1   chr1      524          C          G          G                522   

   codon2_genome_pos  codon3_genome_pos ref_codon varify_codon1 varify_codon2  \
0                520                521       AAG           AAG           AAG   
1                520                521       AAG           TAG           ATG   
2                520                521       AAG           CAG           ACG   
0                522                523       GTA           TTA           GTA   
1     

  


### To Do
* [x] output comparison
* [ ] Type Effect
* [x] Make into Function
* [ ] Build larger test data
* [ ] Nextflow with Assembly/Sam to mpileup


### Notes
**Test Data:**
* Take processed mRNA transcript (FASTA) —> VCF w/ 2-3 SNPS

**Process:** 
* RNA FASTA -align to genome using gmap->SAM/BAM—>BAM -variantcalling-> VCF —> Capture Variants
* DNA FASTA -align to genome using gmap->SAM/BAM—>BAM -variantcalling-> VCF —> Capture Variants |—> Compare RV and GV

In [None]:
top_snp = majorSNP(merged_var)
top_strand = strand(top_snp)

In [None]:
snp_dict = {'confirmed_snp' : top_snp, 'confirmed_strand' : strand}
merged_var['confirmed_snp'] = top_snp
merged_var['confirmed_strand'] = top_strand

# proposed final output
merged_var.drop(columns='pileup')

In [None]:
# majority snp
def split(word):
    return list(word)

def majorSNP(merged_var):
    n=0
    arr = split(merged_var['pileup'][n])
    
    snps = pd.Series(arr).drop_duplicates().tolist()
    
    count = dict.fromkeys(snps, 0)

    for snp_key in snps:
        for arr_key in arr: 
            if snp_key == arr_key: 
                count[snp_key] = count.get(snp_key, 0) + 1
                
    top_snp = sorted(count)[0]
    return top_snp

# strand
def strand(x):
    pos = ['A','C','T','G', 'N']
    rev = ['a','c','t','g', 'n']
    if x in pos:
        return '+'
    if x in rev:
        return '-'
    
# grab all snps
#def uniqueSNP():
    # get the list of SNPs
    
    # split into unique
    

def combine(x):
    return print(*x, sep='')

    