## Bring in Alignment for mapping

This program will map TFBS using the Biopython's motif package.

**Inputs**: 
1. before alignment (fasta) 
2. after alignment (fasta) 
3. TFBS Position Frequency Matrix.

In [103]:
from Bio import motifs
from Bio import SeqIO 
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, generic_dna, generic_protein
import re
from collections import defaultdict
import pandas as pd
import numpy as np
import os, sys

In [104]:
## Print full DF

def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

In [105]:
## Input 1 - Alignment Input 

## read in alignment as a list of sequences
alignment = list(SeqIO.parse("../data/fasta/output_ludwig_eve-striped-2.fa", "fasta"))

## Get alignment ID list for later
alignment_id = []
for seq in alignment:
    alignment_id.append(str(seq.id))
    
## Check
print("Found %i records in alignment file" % len(alignment))

## Sequence Length should be the same for all alignment sequences
for seq in alignment:
    print len(seq)

## Turn sequences into a list of strings
## Note: They are no longer bio.seq.seq objects
alignment_string_list = []
for seq in alignment:
    alignment_string_list.append(str(seq.seq))


Found 9 records in alignment file
1136
1136
1136
1136
1136
1136
1136
1136
1136


In [106]:
## Input 2 - Raw Sequences Input

raw_sequences = list(SeqIO.parse("../data/fasta/ludwig_eve-striped-2.fasta", "fasta"))
print("Found %i records in raw sequence file" % len(raw_sequences))

## make all IUPAC.IUPACUnambiguousDNA()
raw_sequences_2 = []

for seq in raw_sequences:
    raw_sequences_2.append(Seq(str(seq.seq), IUPAC.IUPACUnambiguousDNA()))
    print len(seq)

Found 9 records in raw sequence file
928
875
898
868
862
913
905
909
868


In [107]:
## Input 3 - Motif Input
## [ ] This is where I need to loop through all PWMs in a file

motif = motifs.read(open("../data/PWM/transpose_fm/bcd_FlyReg.fm"),"pfm")
print(motif.counts)
pwm = motif.counts.normalize(pseudocounts=0.0)
pssm = pwm.log_odds()
motif_length = len(motif) #for later

        0      1      2      3      4      5      6      7
A:   0.19   0.17   0.88   0.92   0.04   0.04   0.06   0.12
C:   0.37   0.08   0.04   0.02   0.02   0.87   0.52   0.25
G:   0.08   0.04   0.04   0.04   0.33   0.02   0.08   0.37
T:   0.37   0.71   0.04   0.02   0.62   0.08   0.35   0.27



In [108]:
## Searching the Motifs in Sequences
## Returns a list of arrays with a score for each position

## This give the score for each position
## If you print the length you get the length of the sequence minus 8. 

## Forward stand

pssm_list = [ ]
for seq in raw_sequences_2:
    pssm_list.append(pssm.calculate(seq))

for seq in pssm_list:
    print len(seq)
    
## Reverse strand
## rpssm = pssm.reverse_complement()
   
## Approximate calculation of appropriate thresholds for motif finding 
## Patser Threshold
## It selects such a threshold that the log(fpr)=-ic(M) 
## note: the actual patser software uses natural logarithms instead of log_2, so the numbers 
## are not directly comparable. 

distribution = pssm.distribution(background=bcd.background, precision=10**4)
patser_threshold = distribution.threshold_patser()

print("Patser Threshold is %5.3f" % patser_threshold) # Calculates Paster threshold. 

921
868
891
861
855
906
898
902
861
Patser Threshold is 3.262


In [109]:
###################################
## [x] Need to reiterate over raw_sequences_2
## [x] When reiterating over raw_sequences_2, attach id
#################################

position_list = []
for i in range(0,8):
    for position, score in pssm.search(raw_sequences_2[i], threshold = patser_threshold):
        positions = {'score':score, 'position':position, 'species': i}
        position_list.append(positions)
        
position_DF = pd.DataFrame(position_list)

## Check
## print position_DF

In [110]:
#############################
## Add strand and pos position information
##############################

## Change position to positive
position_list_pos = []
for x in position_DF['position']:
    if x < 0:
       position_list_pos.append(905 + x)
    else:
       position_list_pos.append(x)

## append to position_DF
position_DF['raw_position'] = position_list_pos
    
## Strand
strand = []
for x in position_DF['position']:
    if x < 0:
       strand.append("negative")
    else:
       strand.append("positive")
    
## append to position_DF
position_DF['strand'] = strand

## Attach Motifs found 
## motif_found 

## First turn into a list of strings
raw_sequences_2_list = []
for seq in raw_sequences_2:
    raw_sequences_2_list.append(str(seq))

## Now get all motifs found in sequences
motif_found = []
for x in position_DF['position']:
    motif_found.append(raw_sequences_2_list[i][x:x + motif_length])

## Check
## len(motif_found)    
## print(motif_found) 

## append to position_DF
position_DF['motif_found'] = motif_found

## Check    
## print(position_DF)

In [111]:
## get alignment position 

##################
### This becomes a little more difficult, since each species has a different position key.
##  Ideas could be subsetting, using species row as a qualifier.

## I use the raw and alignment sequences to get the values. So that an be done seperatley from what has
## already been done to create the DF. 
####################

remap_list = []
nuc_list = ['A', 'a', 'G', 'g', 'C', 'c', 'T', 't', 'N', 'n']


positions = {'score':score, 'position':position, 'species': i}
position_list.append(positions)


for i in range(0,8):
    counter = 0
    for xInd, x in enumerate(alignment_string_list[i]):    
        if x in nuc_list:
            remaps = {'raw_position': counter, 'align_position':xInd, 'species':i}
            counter += 1
            remap_list.append(remaps)
remap_DF = pd.DataFrame(remap_list)
            
## Check
## print_full(remap_DF)



In [112]:
## Merge both datasets

## Merge the two dataframes by two columns.
## [x] Something like: merge(postion_DF, remap_DF, by = species & raw_pos)


## Check 
print(position_DF.shape)
print(remap_DF.shape)

## Merge - all sites

TFBS_map_DF_all = pd.merge(position_DF, remap_DF, on=['species', 'raw_position'], how='outer')
TFBS_map_DF_all = TFBS_map_DF_all.sort_values(by=['species','align_position'], ascending=[True, True])

## Check
## print_full(TFBS_map_DF_all)
## print(TFBS_map_DF_all.shape)

# Merge - only signal 

TFBS_map_DF_only_signal = pd.merge(position_DF, remap_DF, on=['species', 'raw_position'], how='inner')
TFBS_map_DF_only_signal = TFBS_map_DF_only_signal.sort_values(by=['species','align_position'], ascending=[True, True])

## Check
print_full(TFBS_map_DF_only_signal)
print(TFBS_map_DF_only_signal.shape)

(234, 6)
(7098, 3)
     position      score  species  raw_position    strand motif_found  \
0          10   5.013668        0            10  positive    ataatttt   
2        -751   8.946094        0           154  negative    tcctcgcc   
1         157  10.457056        0           157  positive    ttcctcgc   
3        -684   5.243600        0           221  negative    tcgttccc   
4        -649   3.285077        0           256  negative    tattgccg   
5        -616   3.594098        0           289  negative    ttggtacc   
6        -598   6.417715        0           307  negative    ctacattt   
7         334   3.702168        0           334  positive    gaacggaa   
8         404   3.491528        0           404  positive    gcaaaagt   
9         451   3.794091        0           451  positive    gtttttgc   
10       -450   9.909568        0           455  negative    tgggatta   
12       -438   5.787577        0           467  negative    agggcttg   
11        481   4.094957        

In [113]:
## Other things to do

## [ ] Make Vector that attaches
## [x] Get actual motif
## [ ] Attach real species name
## [ ] Loop through all files in directory
##    [ ] Write file name in last column
##    [ ] Append to a file 
