In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import division

import os
import os.path
import math
import pickle
import multiprocessing
import collections

import numpy as np
import Bio.ExPASy.Prosite as BEP
import praline
import praline.container
import mapraline.component
import mapraline.prosite
from praline.core import *

In [2]:
# Directory that is walked for '.aln' alignment files.
INPUT_DIR = 'output/'
# NOTE(review): the next three constants are not referenced by the cells
# visible in this notebook; kept unchanged in case other code relies on them.
TRACK_ID_BASE = "mapraline.track.PrositePatternAnnotation"
FMT_TRACK_ID = "{0}_{1}"
ALPHABET = mapraline.ALPHABET_PROSITE
# Location of the PROSITE pattern database. The default is the original
# machine-specific path; set the PROSITE_PATH environment variable to point
# the notebook at another copy without editing this cell.
PROSITE_PATH = os.environ.get('PROSITE_PATH', '/Users/maurits/Downloads/prosite.dat')

In [3]:
def read_patterns(path):
    """Collect the non-empty pattern strings from a PROSITE data file.

    Every record produced by ``BEP.parse`` is inspected; the trailing '.'
    characters are stripped from its ``pattern`` attribute and the result is
    kept only when something is left.

    Parameters:
        path: location of the PROSITE data file to read.

    Returns:
        A list of pattern strings, in the order the records were parsed.
    """
    patterns = []

    # `open` is the documented way to open a file; the `file` constructor
    # exists only in Python 2.
    with open(path, 'r') as fi:
        for record in BEP.parse(fi):
            pattern = record.pattern.rstrip('.')
            # Records whose pattern is empty after stripping are skipped.
            if pattern:
                patterns.append(pattern)

    return patterns

In [4]:
# Read all patterns from our local prosite db.
patterns = read_patterns(PROSITE_PATH)

# Read all HOMSTRAD alignments.
alignments = []
for dirpath, dirnames, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        if not filename.endswith('.aln'):
            continue

        # Join with the directory currently being walked, not with INPUT_DIR:
        # os.walk also descends into sub-directories, and files found there
        # would otherwise be looked up at the wrong location.
        aln_path = os.path.join(dirpath, filename)
        try:
            alignment = praline.load_alignment_fasta(aln_path, praline.container.ALPHABET_AA)
        except praline.AlphabetError:
            # Best effort: alignments that raise AlphabetError while loading
            # with the amino acid alphabet are left out of the analysis.
            continue

        alignments.append(alignment)

In [5]:
# Pair every PROSITE pattern with the matcher object built from it by
# mapraline.prosite.pattern_to_re. The converted object is deliberately not
# bound to the name `re`, so the standard library module of that name is not
# shadowed at notebook scope.
regexes = [(pattern, mapraline.prosite.pattern_to_re(pattern)) for pattern in patterns]

In [6]:
def get_sequences(alignment):
    """Return the input sequences of an alignment as plain strings.

    For every item in ``alignment.items`` the track registered under
    ``praline.container.TRACK_ID_INPUT`` is fetched, each index in the track's
    ``values`` is translated with the track alphabet's ``index_to_symbol`` and
    the resulting symbols are concatenated.

    Parameters:
        alignment: object exposing an ``items`` iterable of sequences, each
            of which provides ``get_track``.

    Returns:
        A list with one string per item, in the order of ``alignment.items``.
    """
    seqs = []
    for seq in alignment.items:
        track = seq.get_track(praline.container.TRACK_ID_INPUT)
        to_symbol = track.alphabet.index_to_symbol

        # Iterate over the indices directly instead of indexing through
        # xrange(shape[0]); this visits the same elements in the same order.
        seqs.append("".join(to_symbol(index) for index in track.values))

    return seqs
    

In [7]:
# One list of plain sequence strings per alignment, in the same order as
# `alignments`.
seq_sets = [get_sequences(aln) for aln in alignments]

In [8]:
# Scan every sequence of every alignment with every pattern.
# pattern_to_match_sets[pattern][alignment index][sequence index] is the list
# of (overlapping) matches of that pattern in that sequence.
pattern_to_match_sets = {}
for pattern, regex in regexes:
    pattern_to_match_sets[pattern] = [
        [list(regex.finditer(seq, overlapped=True)) for seq in seq_set]
        for seq_set in seq_sets
    ]

In [9]:
# Overall totals: sequences and alignments that were loaded, and motif
# matches summed over all patterns, alignments and sequences.
count_seqs = sum(len(aln.items) for aln in alignments)
count_alns = len(alignments)
count_motifs = sum(
    len(matches)
    for match_sets in pattern_to_match_sets.values()
    for match_set in match_sets
    for matches in match_set
)

In [10]:
# Totals: motif matches over all patterns, sequences over all alignments,
# and number of alignments loaded.
count_motifs, count_seqs, count_alns

(34568, 3102, 974)

In [11]:
# Accumulate the number of motif matches, summed over all patterns, per
# alignment index and per (alignment index, sequence index) pair.
count_per_aln = collections.defaultdict(int)
count_per_seq = collections.defaultdict(int)
for match_sets in pattern_to_match_sets.values():
    for aln_idx, match_set in enumerate(match_sets):
        for seq_idx, matches in enumerate(match_set):
            n_matches = len(matches)
            count_per_aln[aln_idx] += n_matches
            count_per_seq[aln_idx, seq_idx] += n_matches

In [12]:
# Match counts as arrays for summary statistics:
# `a` holds one count per alignment, `b` one count per sequence.
per_aln_counts = list(count_per_aln.values())
per_seq_counts = list(count_per_seq.values())
a = np.array(per_aln_counts)
b = np.array(per_seq_counts)

In [13]:
# Min / max / mean of the per-alignment motif match counts held in `a`.
a.min(), a.max(), a.mean()

(0, 619, 35.49075975359343)

In [14]:
# Min / max / mean of the per-sequence motif match counts held in `b`.
b.min(), b.max(), b.mean()

(0, 69, 11.143778207607994)

In [16]:
def find_repeats(matches, adjacency_window=1):
    """Group consecutive matches into runs of adjacent ("repeated") matches.

    A match joins the current run when its start lies at or before the end
    of the previous match plus ``adjacency_window``; overlapping matches
    therefore always end up in the same run. Otherwise the current run is
    closed and the match starts a new one.

    Only the end of the immediately preceding match is compared, so the
    input is expected to be ordered by start position.

    Parameters:
        matches: iterable of match objects providing ``start()`` and
            ``end()``.
        adjacency_window: largest allowed distance between the end of one
            match and the start of the next for both to share a run.

    Returns:
        A list of runs, each a non-empty list of matches in input order.
        Empty input yields an empty list.
    """
    repeat_sets = []
    cur_repeat_set = []

    for match in matches:
        # Close the current run once this match starts beyond the window that
        # follows the previous match.
        if cur_repeat_set and cur_repeat_set[-1].end() + adjacency_window < match.start():
            repeat_sets.append(cur_repeat_set)
            cur_repeat_set = []
        cur_repeat_set.append(match)

    # Flush the run that was still open when the input ended.
    if cur_repeat_set:
        repeat_sets.append(cur_repeat_set)

    return repeat_sets

In [50]:
# Split the matches of every (pattern, alignment index, sequence index)
# combination into runs of adjacent matches, using an adjacency window of 5.
pat_aln_seq_to_repeat_matches = {
    (pattern, aln_idx, seq_idx): find_repeats(matches, adjacency_window=5)
    for pattern, match_sets in pattern_to_match_sets.items()
    for aln_idx, match_set in enumerate(match_sets)
    for seq_idx, matches in enumerate(match_set)
}

In [51]:
repeat_counts = []
for (pat, i, j), repeat_set in pat_aln_seq_to_repeat_matches.iteritems():
    repeats = [len(x) for x in repeat_set if len(x) > 1]
    if repeats:
        print pat, repeats
        repeat_counts.extend(repeats)

N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2, 2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2, 2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2, 3]
N-{P}-[ST]-{P} [2, 2, 2]
N-{P}-[ST]-{P} [2, 2, 2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [3]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [3]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2]
N-{P}-[ST]-{P} [2, 2]
N-{P}-[ST]-{P} [2]
N-{P}-[

[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [3]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [3]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [3]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [3]
[ST]-x(2)-[DE] [3, 2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]-x(2)-[DE] [2]
[ST]

[KRQ]-[LIVMA]-x(2)-[GSTALIV]-{FYWPGDN}-x(2)-[LIVMSA]-x(4,9)-[LIVMF]-x-{PLH}-[LIVMSTA]-[GSTACIL]-{GPK}-{F}-x-[GANQRF]-[LIVMFY]-x(4,5)-[LFY]-x(3)-[FYIVA]-{FYWHCM}-{PGVI}-x(2)-[GSADENQKR]-x-[NSTAPKL]-[PARL] [2]
R-x(2)-[LIVMT]-x(2,3)-[FWY]-[QNYDI]-x(8,13)-[LVESI]-x-P-C-[HAVMLC]-x(3)-[QMTLHD]-[FYWL]-x(0,1)-[LV] [2]
R-x(2)-[LIVMT]-x(2,3)-[FWY]-[QNYDI]-x(8,13)-[LVESI]-x-P-C-[HAVMLC]-x(3)-[QMTLHD]-[FYWL]-x(0,1)-[LV] [2]
R-x(2)-[LIVMT]-x(2,3)-[FWY]-[QNYDI]-x(8,13)-[LVESI]-x-P-C-[HAVMLC]-x(3)-[QMTLHD]-[FYWL]-x(0,1)-[LV] [2]
R-x(2)-[LIVMT]-x(2,3)-[FWY]-[QNYDI]-x(8,13)-[LVESI]-x-P-C-[HAVMLC]-x(3)-[QMTLHD]-[FYWL]-x(0,1)-[LV] [2]
R-x(2)-[LIVMT]-x(2,3)-[FWY]-[QNYDI]-x(8,13)-[LVESI]-x-P-C-[HAVMLC]-x(3)-[QMTLHD]-[FYWL]-x(0,1)-[LV] [2]
P-x(0,2)-[GSTAN]-[DENQGAPK]-x-[LIVMFP]-[HT]-[LIVMYAC]-G-[HNTG]-[LIVMFYSTAGPC] [2]
P-x(0,2)-[GSTAN]-[DENQGAPK]-x-[LIVMFP]-[HT]-[LIVMYAC]-G-[HNTG]-[LIVMFYSTAGPC] [2]
[GA]-x(0,2)-[YSA]-x(0,1)-[VFY]-{SEDT}-C-x(1,2)-[PG]-x(0,1)-H-x(2,4)-[MQ] [2]
[GA]-x(0,2)-[YSA]-x(0,1)-[VFY]-

In [45]:
# NOTE: this rebinds `a`, which earlier held the per-alignment match counts;
# from here on it holds the lengths of all repeat runs with more than one match.
a = np.array(repeat_counts)

In [46]:
# Min / max / mean length of the repeat runs held in `a`.
# NOTE(review): this cell's execution count (46) is lower than that of the
# cell that builds repeat_counts (51), so the displayed output may be stale;
# re-run the notebook top to bottom to confirm.
a.min(), a.max(), a.mean()

(2, 27, 4.094465372355839)

In [47]:
# Sum of the run lengths in `a`, i.e. the total number of matches that belong
# to a run with more than one match.
a.sum()

28260