## The Plan

1) Generate a FASTA reference file. 

2) Create an index for the reference file.

3) Map simulated reads to the index file. 

4) Analyze the mapping results. 

## Coding

In [33]:
import csv
import os
import shlex
import subprocess
import sys
from glob import glob

import pandas as pd

# Paths to the external command-line tools. The site-specific install
# locations stay the defaults; set BWA_PATH / SAMTOOLS_PATH in the environment
# to run the notebook on a machine where the tools live elsewhere.
bwa = os.environ.get('BWA_PATH', '/frazer01/software/bwa-0.7.12/bwa')
samtools = os.environ.get('SAMTOOLS_PATH', '/frazer01/software/samtools-1.2/samtools')

In [2]:
proj_dir = '/frazer01/home/joreyna/repos/CSE-280a/Project/'

# LOCATING all the reference directories
ref_dir = os.path.join(proj_dir, 'output/', 'build_vntr_sequence/')
refs = glob(ref_dir + '*')
if not refs:
    # Report the searched location instead of a bare IndexError on refs[0].
    raise IOError('No reference directories found under {}'.format(ref_dir))
# NOTE(review): glob() returns entries in arbitrary order, so with more than one
# entry under ref_dir this choice is not reproducible, while later cells
# hard-code sequence_10000_case_6 -- consider selecting the sample by name.
ref = refs[0]

# EXTRACTING the sample name (the basename of the reference directory)
sample_name = os.path.basename(ref)

# PINPOINTING the location of the reference file
ref_fn = os.path.join(ref, '{}_reference.fa'.format(sample_name))

# PINPOINTING the location of the fastq file:
#   <proj_dir>/output/simulate_read_mapping/<sample>/<coverage>/<sample>_<coverage>.fq
coverage = 30
coverage_name = 'coverage_{}'.format(coverage)
fastq_dir = os.path.join(proj_dir, 'output/', 'simulate_read_mapping/',
                         sample_name, coverage_name)
# Built from its parts rather than by splitting fastq_dir back apart.
fastq_name = '{}_{}'.format(sample_name, coverage_name)
fastq_fn = os.path.join(fastq_dir, '{}.fq'.format(fastq_name))

# PRINTING parameters 
# print('sam: {}\n'.format(sample_name))
# print('sample_name: {}\n'.format(sample_name))
# print('ref_fn: {}, exists? {}\n'.format(ref_fn, os.path.exists(ref_fn)))
# print('fastq_fn: {}, exists? {}\n'.format(fastq_fn, os.path.exists(fastq_fn)))

# Running BWA thru python
# cmd = shlex.split('/frazer01/software/bwa-0.7.12/bwa mem {} {}'.format(ref_fn, fastq_fn))
# subprocess.check_output(cmd)

## Sequence versus Reference  

In [3]:
# Size in bytes (wc -c) of the sequence FASTA and of the matching *_reference.fa file.
!wc -c ../output/build_vntr_sequence/sequence_10000_case_6/sequence_10000_case_6.fa
!wc -c ../output/build_vntr_sequence/sequence_10000_case_6/sequence_10000_case_6_reference.fa

10733 ../output/build_vntr_sequence/sequence_10000_case_6/sequence_10000_case_6.fa
31005 ../output/build_vntr_sequence/sequence_10000_case_6/sequence_10000_case_6_reference.fa


* The original sequence and the reference differ by 530 characters.

* The difference corresponds to 8 VNTRs in the sequence.

## BWA Argument Testing 

Running multiple versions of BWA.
* Looking at the mapping generated by ART
* BWA standard version, which runs **bwa mem -t 4** on the FASTQ file
* **-a** argument, for outputting multiple mappings per read.

In [4]:
# Line count (wc -l counts every line, header lines included) of the sam file
# stored next to the simulated reads -- presumably written by ART; confirm.
!echo "ART alignment"
!wc -l ../output/simulate_read_mapping/sequence_10000_case_6/coverage_30/sequence_10000_case_6_coverage_30.sam

ART alignment
2103 ../output/simulate_read_mapping/sequence_10000_case_6/coverage_30/sequence_10000_case_6_coverage_30.sam


In [5]:
aln_fn = os.path.join(fastq_dir, 'set_std_arg.sam')
!echo  "BWA standard"
!$bwa mem $ref_fn $fastq_fn -t 4 2> /dev/null > $aln_fn
!wc -l $aln_fn

BWA standard
2104 /frazer01/home/joreyna/repos/CSE-280a/Project/output/simulate_read_mapping/sequence_10000_case_6/coverage_30/set_std_arg.sam


In [18]:
aln_fn = os.path.join(fastq_dir, 'set_a_arg.sam')
!echo "BWA multi-mappings" 
!$bwa mem $ref_fn $fastq_fn -t 4  -a 2> /dev/null > $aln_fn
!wc -l $aln_fn

BWA multi-mappings
6449 /frazer01/home/joreyna/repos/CSE-280a/Project/output/simulate_read_mapping/sequence_10000_case_6/coverage_30/set_a_arg.sam


### Investigating the differences between ART and Standard BWA

I will be looking at:
* number of mappings
* number of reads which were mapped multiple times 
* the different coordinate values/systems

In [19]:
def count_mappings(samtools, sam_fn):
    """
    Count the number of alignment records in a sam file.

    Runs ``samtools view -c``, which prints the number of alignment records
    (header lines are not counted) instead of the records themselves.

    Parameters
    ----------

    samtools : str
    Path to the samtools program.

    sam_fn : str
    Path to the sam file.

    Returns
    -------

    int
    Number of alignment records in the file.

    Raises
    ------

    subprocess.CalledProcessError
    If samtools exits with a non-zero status (e.g. unreadable file).
    """

    # An argument list (no shell) keeps paths containing spaces intact, and
    # without a `| wc -l` pipeline a samtools failure is reported instead of
    # being silently turned into a count of 0.
    cmd = [samtools, 'view', '-c', sam_fn]
    num_mappings = int(subprocess.check_output(cmd))

    return num_mappings

In [20]:
def get_multiple_mappings(samtools, sam_fn):
    """
    Find the reads that appear on more than one alignment record.

    Runs ``samtools view`` on the file and collects every query name (first
    whitespace-separated column) that occurs on two or more output lines.

    Parameters
    ----------

    samtools : str
    Path to the samtools program.

    sam_fn : str
    Path to the sam file.

    Returns
    -------

    str
    The repeated query names, one per line (each followed by a newline) and
    sorted by byte value; empty when no name is repeated. The value has the
    same string type that ``subprocess.check_output`` returns.

    Raises
    ------

    subprocess.CalledProcessError
    If samtools exits with a non-zero status.
    """

    # An argument list (no shell) keeps paths containing spaces intact and
    # lets a samtools failure raise instead of yielding an empty result.
    records = subprocess.check_output([samtools, 'view', sam_fn])

    # COLLECTING the names seen more than once (same result as sort | uniq -d)
    seen = set()
    repeated = set()
    for line in records.splitlines():
        fields = line.split(None, 1)
        qname = fields[0] if fields else b''
        if qname in seen:
            repeated.add(qname)
        else:
            seen.add(qname)

    mult_mappings = b''.join(qname + b'\n' for qname in sorted(repeated))
    return mult_mappings

In [21]:
# RESOLVING the ART sam file (given relative to the working directory) to an
# absolute path, then COUNTING its mappings
art_fn = os.path.abspath(
    '../output/simulate_read_mapping/sequence_10000_case_6/coverage_30/'
    'sequence_10000_case_6_coverage_30.sam'
)
art_mappings = count_mappings(samtools, art_fn)

In [22]:
# Number of mappings counted in the ART sam file.
art_mappings

2100

In [23]:
# Read names that occur on more than one record of the ART sam file.
art_mult = get_multiple_mappings(samtools, art_fn)

In [24]:
# An empty string means no read name is repeated in the ART sam file.
art_mult

''

In [25]:
# PINPOINTING the sam file written by the BWA standard-argument run
std_fn = (
    '/frazer01/home/joreyna/repos/CSE-280a/Project/output/'
    'simulate_read_mapping/sequence_10000_case_6/coverage_30/set_std_arg.sam'
)

In [26]:
# Counting the mappings in the BWA standard-argument sam file.
std_mappings = count_mappings(samtools, std_fn)

In [27]:
# Number of mappings counted in the BWA standard-argument sam file.
std_mappings

2100

In [28]:
# Read names that occur on more than one record of the BWA standard-argument sam file.
std_mult = get_multiple_mappings(samtools, std_fn)

In [29]:
# An empty string means no read name is repeated in the BWA standard-argument sam file.
std_mult

''

### Analyzing how many times each read mapped when setting -a 

In [32]:
# PINPOINTING the sam file written by the BWA -a run
set_a_fn = (
    '/frazer01/home/joreyna/repos/CSE-280a/Project/output/'
    'simulate_read_mapping/sequence_10000_case_6/coverage_30/set_a_arg.sam'
)

In [37]:
# Column names: the 11 mandatory SAM fields followed by the optional tags in
# the order they appear in this file (NM, MD, AS, XS).
columns = ['QNAME', 'FLAG', 'RNAME', 'POS',
           'MAPQ', 'CIGAR', 'RNEXT', 'PNEXT',
           'TLEN', 'SEQ', 'QUAL', 'NM', 'MD',
           'AS', 'XS']

# COUNTING the leading '@' header lines instead of hard-coding skiprows=4, so
# the cell keeps working when the header size changes.
num_header_lines = 0
with open(set_a_fn) as sam_fh:
    for line in sam_fh:
        if not line.startswith('@'):
            break
        num_header_lines += 1

# names= fixes the table at 15 columns even when a record lacks the trailing
# XS tag; QUOTE_NONE stops a '"' inside a QUAL string from being parsed as the
# start of a quoted field.
df = pd.read_table(set_a_fn, skiprows=num_header_lines, header=None,
                   names=columns, quoting=csv.QUOTE_NONE)

In [45]:
# Preview of the first records; records without an XS tag have no value in that column.
df.head()

Unnamed: 0,QNAME,FLAG,RNAME,POS,MAPQ,CIGAR,RNEXT,PNEXT,TLEN,SEQ,QUAL,NM,MD,AS,XS
0,seq1-2100,16,seq2,8121,0,150M,*,0,0,CTCGGCTTAACCTCTAGCAATACACGTATATGCTAGTAGTATTAGA...,CGCCCGGGCCCCCCGCGGCGGGGCGCGGCCCCCGCGCGCG=CGGC=...,NM:i:0,MD:Z:150,AS:i:150,XS:i:150
1,seq1-2100,272,seq1,8056,0,150M,*,0,0,*,*,NM:i:0,MD:Z:150,AS:i:150,
2,seq1-2100,272,seq3,8186,0,150M,*,0,0,*,*,NM:i:0,MD:Z:150,AS:i:150,
3,seq1-2099,0,seq1,4369,0,150M,*,0,0,TAACCAGTGCTGCTACACGCTATCACTGGCGTAAAAAATGGTGGAC...,CCCGGGGGGGGGGJJJGJCJJJJJJJJGJJJGJJ8JJJJGJJJGGG...,NM:i:0,MD:Z:150,AS:i:150,XS:i:150
4,seq1-2099,256,seq2,4369,0,150M,*,0,0,*,*,NM:i:0,MD:Z:150,AS:i:150,


In [53]:
# Number of records per read name. size() counts rows directly, whereas
# count().iloc[:, 0] counted the non-null FLAG values and would under-count
# any record with a missing FLAG.
query_mapping_counts = df.groupby('QNAME').size()

In [56]:
# Exploratory check of the result type; the analysis does not depend on this cell.
type(query_mapping_counts)

pandas.core.series.Series

In [64]:
# Group the read names by how many records each one has. Grouping on the
# Series' own values replaces the per-label lambda lookup and yields the same
# groups.
query_grp = query_mapping_counts.groupby(query_mapping_counts)

In [65]:
# Distinct numbers of records per read, as a sorted list (deterministic order
# and a plain list on both Python 2 and Python 3).
sorted(query_grp.groups)

[3, 4, 5]

What do the flags 272 and 256 mean?

256 means it's a secondary alignment in the forward orientation.
272 (256 + 16) means it's a secondary alignment that is reverse complemented.
