In [1]:
import os
import os.path as osp
import pickle
import re
from collections import defaultdict

import scipy.special
import numpy as np
import praline
import mapraline
import mapraline.component
from praline.core import *

In [2]:
# Directory that is scanned for alignment files.
INPUT_DIR = 'alignments/praline'
# Prefix of the per-pattern annotation track ids (see FMT_TRACK_ID).
TRACK_ID_BASE = "mapraline.track.PrositePatternAnnotation"
# Id of the single per-sequence track that merges all per-pattern tracks.
TRACK_ID_CONSENSUS = "mapraline.track.PrositeConsensusAnnotation"
# Template for a per-pattern track id: {0} is TRACK_ID_BASE, {1} the pattern.
FMT_TRACK_ID = "{0}_{1}"
# NOTE(review): ALPHABET is not referenced by any of the code visible in this
# notebook; the consensus track is built with mapraline.ALPHABET_PROSITE directly.
ALPHABET = mapraline.ALPHABET_PROSITE
# Alignment file names look like "BBA0123_alpha_35.aln": group 1 is the family
# id (used as `balibase`), group 2 the integer alpha value. The dot is escaped
# and the pattern is anchored at the end so that look-alike names (for example
# "..._35xaln", or backup files such as "..._35.aln.bak") are not picked up by
# the directory scan.
RE_FILENAME = re.compile(r'(BBA\d{4})_alpha_(\d+)\.aln$')

In [3]:
def annotate_pattern(manager, alignment, pattern):
    """Attach a per-pattern annotation track to every sequence of an alignment.

    For each sequence in ``alignment.items`` a fresh ``Execution`` is created
    that runs the ``PrositePatternAnnotator`` component on the sequence's input
    track; the resulting ``prediction_track`` output is stored on the sequence
    under ``FMT_TRACK_ID.format(TRACK_ID_BASE, pattern)``.

    Parameters
    ----------
    manager : the manager object handed to each ``Execution``.
    alignment : object whose ``items`` are the sequences to annotate; they are
        modified in place via ``add_track``.
    pattern : the pattern passed to the annotator and embedded in the track id
        (presumably a PROSITE pattern string - confirm against the caller).

    Notes
    -----
    Reads the module-level ``env``, which must have been created before this
    function is called.
    """
    annotator = mapraline.component.PrositePatternAnnotator
    pattern_track_id = FMT_TRACK_ID.format(TRACK_ID_BASE, pattern)

    for sequence in alignment.items:
        run = Execution(manager, "__ROOT__")
        job = run.add_task(annotator)
        job.environment(env)
        job.inputs(sequence=sequence, pattern=pattern,
                   track_id=praline.container.TRACK_ID_INPUT)

        # The execution is driven by iterating over it; the yielded messages
        # themselves are not needed.
        for _ in run.run():
            pass

        sequence.add_track(pattern_track_id,
                           run.outputs[0]['prediction_track'])

def window(l, size=2):
    """Yield index tuples of a sliding window over ``l``.

    Each yielded tuple contains ``size`` consecutive indices into ``l``
    (indices, not the elements themselves), starting at 0 and advancing by one
    until the window would run past the end. Nothing is yielded when ``l`` has
    fewer than ``size`` elements.

    ``range`` is used instead of the Python-2-only ``xrange`` so the helper
    gives the same results on Python 2 and Python 3.
    """
    for start in range(len(l) - size + 1):
        yield tuple(range(start, start + size))
        
def count_motif_events(alignment):
    """Count motif matches and non-matches for every step of the alignment path.

    A step is a pair of consecutive rows of ``alignment.path``. For each step
    and each sequence (one column of ``path`` per sequence):

    * if the sequence's path value increased and the consensus track value at
      the newly reached position (``path`` value minus one) equals 1, the step
      gets one *match*;
    * otherwise (value is not 1, or the path did not advance, i.e. a gap) the
      step gets one *non-match*.

    Returns
    -------
    numpy.ndarray of shape ``(path.shape[0] - 1, 2)`` and dtype int64;
    column 0 holds the match counts, column 1 the non-match counts.
    """
    consensus_tracks = [seq.get_track(TRACK_ID_CONSENSUS)
                        for seq in alignment.items]

    path = alignment.path
    n_steps = path.shape[0] - 1
    counts = np.zeros((n_steps, 2), dtype=np.int64)

    for step in range(n_steps):
        advanced = (path[step + 1, :] - path[step, :]) > 0
        for col, moved in enumerate(advanced):
            # Gaps (no advance in this sequence) are treated as a non-match.
            is_match = bool(moved) and \
                consensus_tracks[col].values[path[step + 1, col] - 1] == 1
            if is_match:
                counts[step, 0] += 1
            else:
                counts[step, 1] += 1

    return counts

def reorder_seqs(l, according_to):
    """Return the items of ``l`` as a new list ordered like ``according_to``.

    Items are matched by their ``name`` attribute: an item of ``l`` is placed
    at the position its name has in ``according_to``. ``l`` itself is not
    modified. A ``KeyError`` is raised when an item of ``l`` has a name that
    does not occur in ``according_to``.
    """
    rank_by_name = {ref.name: pos for pos, ref in enumerate(according_to)}

    ordered = list(l)
    ordered.sort(key=lambda seq: rank_by_name[seq.name])
    return ordered

In [4]:
# Load the patterns that were pickled per family; the result is later indexed
# by family id. NOTE(review): pickle.load executes arbitrary code from the
# file, so only use a pickle produced by a trusted source.
with open('./family_patterns.pickle', 'rb') as pattern_file:
    patterns_per_family = pickle.load(pattern_file)

In [6]:
# Map (family id, alpha) -> alignment loaded from INPUT_DIR. Directory entries
# whose name does not match RE_FILENAME are ignored.
alignment_per_balibase_alpha = {}

for filename in os.listdir(INPUT_DIR):
    m = RE_FILENAME.match(filename)
    if m is None:
        continue

    # Group 1 is the family id, group 2 the alpha value as a decimal string.
    balibase, alpha = m.group(1), int(m.group(2))

    path = osp.join(INPUT_DIR, filename)
    aln = praline.load_alignment_fasta(path, praline.container.ALPHABET_AA)
    alignment_per_balibase_alpha[(balibase, alpha)] = aln

In [7]:
# Set up the PRALINE objects used by the annotation step further down:
# `env` is read as a global inside annotate_pattern(), and `manager` is passed
# to annotate_pattern() by the cell that annotates the alignments.
env = Environment(keys={})
index = TypeIndex()
# autoregister() is called on the index before it is handed to the Manager.
index.autoregister()
manager = Manager(index)

In [8]:
# Sanity check: for every alignment, compare the sequence names in alignment
# order with the names of the cached sequences after reordering, and report the
# first position at which they differ.
# NOTE(review): `seq_cache` is only assigned in a later cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError, as the recorded output
# shows. It can only work when run after the cell that builds `seq_cache`.
for ((balibase, alpha), aln) in alignment_per_balibase_alpha.iteritems():
    for s1, s2 in zip([s.name for s in aln.items], [s.name for s in reorder_seqs(seq_cache[balibase], aln.items)]):
        if s1 != s2:
            print "MISMATCH!", s1, s2
            print [s.name for s in aln.items]
            print [s.name for s in seq_cache[balibase]]
            # Only the inner (per-position) loop is left; the next alignment is still checked.
            break

NameError: name 'seq_cache' is not defined

In [None]:
# Replace each alignment's items with the cached sequences of its family,
# sorted into the alignment's own sequence order.
# NOTE(review): this cell has no execution count and reads `seq_cache`, which
# is only assigned in a later cell; that later cell already performs this same
# reordering for families that are in the cache. Presumably this cell is a
# leftover and can be removed - confirm before deleting.
for ((balibase, alpha), aln) in alignment_per_balibase_alpha.iteritems():
    aln.items = reorder_seqs(seq_cache[balibase], aln.items)

In [9]:
seq_cache = {}

for ((balibase, alpha), aln) in alignment_per_balibase_alpha.iteritems():
    print balibase, alpha
    if balibase in seq_cache:
        aln.items = reorder_seqs(seq_cache[balibase], aln.items)
    else: 
        for pattern in patterns_per_family[balibase]:
            annotate_pattern(manager, aln, pattern)
        seq_cache[balibase] = aln.items
            

BBA0077 35
BBA0197 50
BBA0025 100
BBA0071 20
BBA0098 30
BBA0127 45
BBA0006 25
BBA0209 20
BBA0102 50
BBA0023 50
BBA0008 35
BBA0192 20
BBA0135 50
BBA0057 100
BBA0068 15
BBA0160 35
BBA0116 15
BBA0022 20
BBA0009 0
BBA0176 50
BBA0069 40
BBA0118 20
BBA0199 5
BBA0031 25
BBA0151 35
BBA0215 50
BBA0055 25
BBA0147 40
BBA0022 40
BBA0083 50
BBA0146 15
BBA0020 30
BBA0212 40
BBA0084 10
BBA0204 30
BBA0007 15
BBA0030 40
BBA0106 15
BBA0016 20
BBA0057 30
BBA0013 35
BBA0098 25
BBA0036 5
BBA0194 45
BBA0048 100
BBA0109 45
BBA0056 5
BBA0012 0
BBA0070 35
BBA0157 10
BBA0022 35
BBA0152 5
BBA0069 25
BBA0136 45
BBA0038 0
BBA0160 45
BBA0116 25
BBA0064 0
BBA0126 20
BBA0123 35
BBA0137 10
BBA0089 35
BBA0016 45
BBA0196 10
BBA0071 35
BBA0107 25
BBA0122 0
BBA0055 15
BBA0002 50
BBA0047 10
BBA0119 5
BBA0041 20
BBA0059 35
BBA0174 30
BBA0096 15
BBA0062 10
BBA0068 100
BBA0139 100
BBA0001 25
BBA0126 45
BBA0032 35
BBA0051 50
BBA0024 40
BBA0013 45
BBA0149 100
BBA0029 45
BBA0152 100
BBA0194 35
BBA0162 20
BBA0181 10
BBA0200 15
BB

BBA0192 100
BBA0113 0
BBA0199 100
BBA0032 40
BBA0024 35
BBA0216 35
BBA0052 40
BBA0193 15
BBA0208 100
BBA0013 40
BBA0163 50
BBA0121 15
BBA0130 100
BBA0169 100
BBA0141 35
BBA0130 45
BBA0109 50
BBA0177 5
BBA0148 0
BBA0132 50
BBA0157 5
BBA0186 20
BBA0200 10
BBA0047 100
BBA0185 45
BBA0026 20
BBA0123 40
BBA0085 25
BBA0015 25
BBA0036 100
BBA0092 5
BBA0028 45
BBA0027 15
BBA0114 40
BBA0071 40
BBA0031 0
BBA0210 10
BBA0215 5
BBA0055 0
BBA0048 20
BBA0042 10
BBA0030 35
BBA0050 100
BBA0104 45
BBA0071 25
BBA0098 0
BBA0064 25
BBA0020 5
BBA0185 20
BBA0204 5
BBA0218 50
BBA0086 35
BBA0111 100
BBA0069 10
BBA0118 50
BBA0141 45
BBA0036 30
BBA0011 20
BBA0197 25
BBA0162 15
BBA0041 100
BBA0198 40
BBA0079 0
BBA0023 25
BBA0094 25
BBA0060 20
BBA0085 15
BBA0016 50
BBA0203 5
BBA0121 20
BBA0003 5
BBA0050 5
BBA0191 40
BBA0105 0
BBA0161 25
BBA0177 40
BBA0119 30
BBA0157 40
BBA0217 25
BBA0008 100
BBA0104 35
BBA0098 50
BBA0059 10
BBA0120 100
BBA0096 100
BBA0023 30
BBA0201 0
BBA0079 50
BBA0084 20
BBA0064 5
BBA0126 50
BBA0

BBA0161 100
BBA0183 40
BBA0142 15
BBA0173 0
BBA0062 25
BBA0153 0
BBA0125 35
BBA0065 100
BBA0094 10
BBA0158 0
BBA0027 30
BBA0074 30
BBA0213 0
BBA0087 50
BBA0050 10
BBA0191 45
BBA0048 5
BBA0185 100
BBA0056 100
BBA0177 45
BBA0119 35
BBA0181 25
BBA0157 45
BBA0030 50
BBA0217 20
BBA0096 45
BBA0064 10
BBA0180 10
BBA0104 50
BBA0185 5
BBA0124 5
BBA0201 5
BBA0038 35
BBA0170 35
BBA0107 20
BBA0130 20
BBA0040 45
BBA0142 35
BBA0114 0
BBA0208 10
BBA0103 20
BBA0031 40
BBA0210 50
BBA0124 40
BBA0105 30
BBA0194 5
BBA0074 100
BBA0033 30
BBA0167 25
BBA0088 25
BBA0098 40
BBA0198 25
BBA0020 45
BBA0079 40
BBA0049 100
BBA0060 5
BBA0114 15
BBA0205 10
BBA0122 40
BBA0141 5
BBA0215 100
BBA0161 10
BBA0115 40
BBA0119 45
BBA0070 40
BBA0030 0
BBA0088 0
BBA0214 15
BBA0054 0
BBA0033 100
BBA0028 100
BBA0079 35
BBA0149 45
BBA0175 45
BBA0176 5
BBA0170 10
BBA0111 5
BBA0202 30
BBA0074 5
BBA0001 15
BBA0087 45
BBA0018 35
BBA0090 25
BBA0191 0
BBA0170 40
BBA0073 40
BBA0093 40
BBA0214 20
BBA0010 20
BBA0049 5
BBA0155 10
BBA0037 40

In [10]:
# Give every cached sequence one consensus track: a position is 1 if any of
# the sequence's per-pattern tracks (ids starting with TRACK_ID_BASE) has the
# value 1 there, and 0 otherwise.
for items in seq_cache.itervalues():
    for seq in items:
        consensus_values = np.zeros((len(seq),), dtype=np.int32)

        pattern_tracks = [track for trid, track in seq.tracks
                          if trid.startswith(TRACK_ID_BASE)]
        for track in pattern_tracks:
            consensus_values[track.values == 1] = 1

        consensus_track = praline.container.PlainTrack(
            None, mapraline.ALPHABET_PROSITE, raw_indices=consensus_values)
        seq.add_track(TRACK_ID_CONSENSUS, consensus_track)

In [11]:
# alpha -> list of motif scores (one per alignment with that alpha)
motif_scores_per_alpha = defaultdict(list)
# (family id, alpha) -> motif score of that alignment
motif_scores_per_balibase_alpha = {}

for ((balibase, alpha), aln) in alignment_per_balibase_alpha.iteritems():
    counts = count_motif_events(aln)
    match_count = counts[:, 0]

    # Per step: number of sequence pairs that both match, relative to the
    # number of all sequence pairs in the alignment.
    all_pairs = scipy.special.comb(len(aln.items), 2)
    matched_pair_fraction = scipy.special.comb(match_count, 2) / all_pairs

    # Average over the steps that have at least one match. Steps with exactly
    # one match add 0 to the sum but are still counted here.
    # NOTE(review): if no step has a match this divides by zero and the score
    # becomes NaN, which would propagate into the per-alpha mean/std.
    n_motif_steps = np.nonzero(match_count)[0].shape[0]
    motif_score = np.sum(matched_pair_fraction) / n_motif_steps

    motif_scores_per_alpha[alpha].append(motif_score)
    motif_scores_per_balibase_alpha[balibase, alpha] = motif_score

In [12]:
for alpha, scores in sorted(motif_scores_per_alpha.iteritems()):
    a = np.array(scores)
    print alpha, a.mean(), a.std()

0 0.04773291411358243 0.053527524695604706
5 0.04943299772985678 0.05356628359953301
10 0.05226443572181958 0.0569575879014985
15 0.054764053056889306 0.05939302919001572
20 0.05727142591036778 0.0605153783081723
25 0.0598319267788965 0.0620452612335982
30 0.06235318300692802 0.06276862700770286
35 0.06508875259182043 0.06351814193324061
40 0.0686626001115406 0.06718977095885324
45 0.07157686058529743 0.06938371686615895
50 0.0747564770263148 0.07100766857233616
100 0.09283033513153256 0.07675297177613069


In [13]:
# Persist both score tables as a single pickled 2-tuple:
# (scores grouped per alpha, score per (family id, alpha)).
motif_scores = (motif_scores_per_alpha, motif_scores_per_balibase_alpha)
with open('./motif_scores.pickle', 'wb') as score_file:
    pickle.dump(motif_scores, score_file)