In [9]:
%load_ext autoreload
%autoreload 2

from __future__ import division

import os
import os.path
import math
import pickle
import multiprocessing
import collections

import numpy as np
import Bio.ExPASy.Prosite as BEP
import praline
import praline.container
import mapraline.component
from praline.core import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Directory that is walked for the ``.aln`` alignment files.
INPUT_DIR = 'output/'
# Prefix of the per-pattern annotation track id attached to each sequence.
TRACK_ID_BASE = "mapraline.track.PrositePatternAnnotation"
# Track id template: {0} is TRACK_ID_BASE, {1} is the PROSITE pattern string.
FMT_TRACK_ID = "{0}_{1}"
# Alphabet used to translate annotation track values back into symbols.
ALPHABET = mapraline.ALPHABET_PROSITE
# Location of the local PROSITE data file. The historical default is a
# machine-specific absolute path, so it can be overridden through the
# PROSITE_PATH environment variable without editing the notebook.
PROSITE_PATH = os.environ.get('PROSITE_PATH',
                              '/Users/maurits/Downloads/prosite.dat')

In [3]:
def read_patterns(path):
    """Return the non-empty PROSITE patterns stored in the file at ``path``.

    The file is parsed with ``BEP.parse`` (``Bio.ExPASy.Prosite``). Trailing
    ``.`` characters are stripped from every record's ``pattern`` string, and
    records whose pattern is empty after stripping are skipped.

    :param path: path of the PROSITE data file to read.
    :returns: list of pattern strings in the order the records are parsed.
    """
    patterns = []

    # ``open`` instead of the ``file`` builtin: ``file`` only exists in
    # Python 2, where ``open`` is the recommended way to open files anyway.
    with open(path, 'r') as fi:
        for record in BEP.parse(fi):
            pattern = record.pattern.rstrip('.')
            if pattern:
                patterns.append(pattern)

    return patterns

def annotate_pattern(manager, alignment, pattern):
    """Add a PROSITE pattern annotation track to every sequence of ``alignment``.

    For each sequence a new ``Execution`` is created that runs the
    ``PrositePatternAnnotator`` component on the sequence's input track. The
    ``prediction_track`` output is stored on the sequence under the id built
    from ``FMT_TRACK_ID``, ``TRACK_ID_BASE`` and ``pattern``.

    Note: reads the module-level ``env``, which is created in the PRALINE
    setup cell, so that cell has to run before this function is called.

    :param manager: PRALINE manager handed to each ``Execution``.
    :param alignment: alignment whose ``items`` are annotated in place.
    :param pattern: PROSITE pattern string to annotate.
    """
    annotator = mapraline.component.PrositePatternAnnotator
    track_id = FMT_TRACK_ID.format(TRACK_ID_BASE, pattern)

    for sequence in alignment.items:
        run = Execution(manager, "__ROOT__")
        job = run.add_task(annotator)
        job.environment(env)
        job.inputs(sequence=sequence, pattern=pattern,
                   track_id=praline.container.TRACK_ID_INPUT)

        # Consume everything run() yields; the yielded messages themselves
        # are not used (presumably iterating is what drives the execution).
        for _ in run.run():
            pass

        prediction = run.outputs[0]['prediction_track']
        sequence.add_track(track_id, prediction)

def remove_pattern(alignment, pattern):
    """Delete the annotation track for ``pattern`` from every sequence.

    This undoes ``annotate_pattern`` for the same alignment and pattern.

    :param alignment: alignment whose ``items`` are modified in place.
    :param pattern: PROSITE pattern string whose track is removed.
    """
    track_id = FMT_TRACK_ID.format(TRACK_ID_BASE, pattern)
    for sequence in alignment.items:
        sequence.del_track(track_id)
            
def has_annotation(alignment, pattern):
    """Tell whether any sequence in ``alignment`` is annotated for ``pattern``.

    Each sequence's annotation track for ``pattern`` is translated to symbols
    through ``ALPHABET``; the alignment counts as annotated as soon as one
    symbol is ``'M'`` or ``'S'``.

    :param alignment: alignment whose sequences already carry the track
        (see ``annotate_pattern``).
    :param pattern: PROSITE pattern string identifying the track.
    :returns: ``True`` if at least one ``'M'``/``'S'`` symbol is found,
        ``False`` otherwise (including for an alignment without sequences).
    """
    track_id = FMT_TRACK_ID.format(TRACK_ID_BASE, pattern)

    for sequence in alignment.items:
        track = sequence.get_track(track_id)
        for value in track.values:
            # Stop at the first hit; the answer cannot change afterwards.
            if ALPHABET.index_to_symbol(value) in ('M', 'S'):
                return True

    return False


def window(l, size = 2):
    """Yield tuples of ``size`` consecutive indices into ``l``.

    Only ``len(l)`` is used: the tuples hold positions, not elements. For a
    sequence of length 4 and ``size=2`` this yields ``(0, 1)``, ``(1, 2)``,
    ``(2, 3)``. Nothing is yielded when ``l`` is shorter than ``size``.

    :param l: any object supporting ``len()``.
    :param size: number of consecutive indices per tuple.
    """
    # ``range`` rather than ``xrange``: it behaves the same here and exists
    # in both Python 2 and Python 3.
    for start in range(len(l) - size + 1):
        yield tuple(range(start, start + size))

        
def count_motif_events(alignment, pattern):
    """Count match / non-match events per alignment step for ``pattern``.

    ``alignment.path`` is indexed as a 2-D array: consecutive rows are
    compared, and each column index selects the sequence (and its annotation
    track) at the same position in ``alignment.items``. A column whose path
    value increases between two rows consumed a sequence position in that
    step; a column that does not increase is treated as a gap.

    :param alignment: alignment whose sequences carry the track for ``pattern``.
    :param pattern: PROSITE pattern string identifying the track.
    :returns: ``int64`` array of shape ``(path.shape[0] - 1, 2)``. Column 0
        counts sequences whose track value at the consumed position equals 1,
        column 1 counts everything else, gaps included.
    """
    track_id = FMT_TRACK_ID.format(TRACK_ID_BASE, pattern)
    tracks = [sequence.get_track(track_id) for sequence in alignment.items]

    path = alignment.path
    n_rows = path.shape[0]
    counts = np.zeros((n_rows - 1, 2), dtype=np.int64)

    for step, step_next in window(range(n_rows)):
        advanced = (path[step_next, :] - path[step, :]) > 0
        for col, has_residue in enumerate(advanced):
            if not has_residue:
                # Treat gaps in a column as a non-match.
                counts[step, 1] += 1
                continue

            # Path values are used as 1-based positions into the track.
            position = path[step_next, col]
            # NOTE(review): the raw track value is compared with 1; an earlier
            # variant translated it with ALPHABET.index_to_symbol and compared
            # with "M" - confirm that index 1 is the "M" symbol.
            if tracks[col].values[position - 1] == 1:
                counts[step, 0] += 1
            else:
                counts[step, 1] += 1

    return counts


def count_motifs(pattern, alignments, skip_no_match=True):
    """Collect the per-step count arrays of ``pattern`` for several alignments.

    :param pattern: PROSITE pattern string identifying the annotation track.
    :param alignments: iterable of alignments already annotated for ``pattern``.
    :param skip_no_match: when true, alignments whose match column (column 0
        of the count array) sums to zero are left out.
    :returns: list of arrays as returned by ``count_motif_events``.
    """
    kept = []

    for alignment in alignments:
        events = count_motif_events(alignment, pattern)
        # Keep everything unless skipping is requested and nothing matched.
        if not skip_no_match or events[:, 0].sum() != 0:
            kept.append(events)

    return kept


def calculate_pairs(one, two, heterogeneous = False):
    """Element-wise number of ordered pairs between two count arrays.

    :param one: array of counts for the first member of each pair.
    :param two: array of counts for the second member of each pair.
    :param heterogeneous: when true the two members are of different kinds and
        the result is ``one * two``; otherwise one is subtracted from ``two``
        first (a member cannot pair with itself), giving ``one * (two - 1)``.
    :returns: new array of pair counts; negative products are replaced by 0.
        The inputs are not modified.
    """
    partners = two if heterogeneous else two - 1
    pairs = one * partners

    # A zero count in ``two`` makes ``two - 1`` negative; clamp those to zero.
    pairs[pairs < 0] = 0

    return pairs


def get_statistics(count_arrays):
    """Summarise every count array into one row of totals and pair counts.

    :param count_arrays: list of 2-column arrays (column 0 = match counts,
        column 1 = non-match counts), as produced by ``count_motifs``.
    :returns: list with one tuple per input array:
        ``(m_total, n_total, all_total, m_m_pairs, n_n_pairs,
        n_m_pairs + m_n_pairs, all_pairs)``.
    """
    def _summarise(count_array):
        # Per-column counts.
        matches = count_array[:, 0]
        others = count_array[:, 1]
        combined = matches + others

        # Pair counts, summed over all columns. The mixed pairs are counted
        # in both orders so that they add up with the homogeneous ones.
        mixed_pairs = (calculate_pairs(others, matches, True).sum()
                       + calculate_pairs(matches, others, True).sum())

        return (
            matches.sum(),
            others.sum(),
            combined.sum(),
            calculate_pairs(matches, matches).sum(),
            calculate_pairs(others, others).sum(),
            mixed_pairs,
            calculate_pairs(combined, combined).sum(),
        )

    return [_summarise(count_array) for count_array in count_arrays]


def calculate_log_odds(statistics):
    """Pool the per-alignment statistics and derive log-odds scores.

    :param statistics: iterable of 7-tuples as produced by ``get_statistics``:
        ``(m_count, n_count, all_count, m_m_pairs, n_n_pairs, n_m_pairs,
        all_pairs)``.
    :returns: ``(q_m, sum_m_count, sum_n_count, s_m_m, s_n_n, s_n_m)`` where
        ``q_m`` is the pooled match frequency and each ``s_*`` is
        ``2 * log2(observed pair frequency / expected pair frequency)``, or
        ``None`` when ``math.log`` rejects the ratio with a ``ValueError``
        (for example a ratio of zero).
    :raises ZeroDivisionError: propagated from the plain divisions, e.g. when
        ``statistics`` is empty and every total is the integer 0.
    """
    # Running totals, in the same field order as the input rows.
    totals = [0, 0, 0, 0, 0, 0, 0]
    for m_count, n_count, all_count, m_m, n_n, n_m, all_pairs in statistics:
        fields = (m_count, n_count, all_count, m_m, n_n, n_m, all_pairs)
        for position, value in enumerate(fields):
            totals[position] += value

    (sum_m_count, sum_n_count, sum_all_count,
     sum_m_m_pairs, sum_n_n_pairs, sum_n_m_pairs, sum_all_pairs) = totals

    # Background frequencies and observed pair frequencies.
    q_m = sum_m_count / sum_all_count
    q_n = sum_n_count / sum_all_count
    p_m_m = sum_m_m_pairs / sum_all_pairs
    p_n_n = sum_n_n_pairs / sum_all_pairs
    p_n_m = sum_n_m_pairs / sum_all_pairs

    def _score(observed, expected):
        # Only ValueError (raised by math.log) is turned into None; a
        # ZeroDivisionError from the ratio still propagates to the caller.
        try:
            return 2 * math.log(observed / expected, 2)
        except ValueError:
            return None

    s_m_m = _score(p_m_m, q_m * q_m)
    s_n_n = _score(p_n_n, q_n * q_n)
    # Factor 2: the mixed pair count covers both orders.
    s_n_m = _score(p_n_m, q_n * q_m * 2)

    return (q_m, sum_m_count, sum_n_count, s_m_m, s_n_n, s_n_m)

In [5]:
# Read all patterns from our local PROSITE database file (PROSITE_PATH).
# read_patterns() skips records whose pattern is empty.
patterns = read_patterns(PROSITE_PATH)

['N-{P}-[ST]-{P}', '[RK](2)-x-[ST]', '[ST]-x-[RK]', '[ST]-x(2)-[DE]', '[RK]-x(2,3)-[DE]-x(2,3)-Y', 'G-{EDRKHPFYW}-x(2)-[STAGCN]-{P}', 'x-G-[RK]-[RK]', 'C-x-[DN]-x(4)-[FY]-x-C-x-C', 'E-x(2)-[ERK]-E-x-C-x(6)-[EDR]-x(10,11)-[FYA]-[YW]', '[DEQGSTALMKRH]-[LIVMFYSTAC]-[GNQ]-[LIVMFYAG]-[DNEKHS]-S-[LIVMST]-{PCFY}-[STAGCPQLIVMF]-[LIVMATN]-[DENQGTAKRHLM]-[LIVMWSTA]-[LIVGSTACR]-{LPIY}-{VY}-[LIVMFA]', '[KRHQSA]-[DENQ]-E-L>', 'R-G-D', '[AG]-x(4)-G-K-[ST]', 'D-{W}-[DNS]-{ILVFYW}-[DENSTG]-[DNQGHRK]-{GP}-[LIVMC]-[DENQSTAGC]-x(2)-[DE]-[LIVMFYW]', '[EQ]-{LNYH}-x-[ATV]-[FY]-{LDAM}-{T}-W-{PG}-N', '[LIVM]-x-[SGNL]-[LIVMN]-[DAGHENRS]-[SAGPNVT]-x-[DNEAG]-[LIVM]-x-[DEAGQ]-x(4)-[LIVM]-x-[LM]-[SAG]-[LIVM]-[LIVMT]-[WS]-x(0,1)-[LIVM](2)', '[FY]-C-[RH]-[NS]-x(7,8)-[WY]-C', 'C-x-C-x(2)-{V}-x(2)-G-{C}-x-C', 'C-x(2)-P-F-x-[FYWIV]-x(7)-C-x(8,10)-W-C-x(4)-[DNSR]-[FYW]-x(3,5)-[FYW]-x-[FYWI]-C', '[LIFAT]-{IL}-x(2)-W-x(2,3)-[PE]-x-{VF}-[LIVMFY]-[DENQS]-[STA]-[AV]-[LIVMFY]', '[KRH]-x(2)-C-x-[FYPSTV]-x(3,4)-[ST]-x(3)-C-x(4)

In [4]:
# Setup PRALINE for later. ``env`` is read as a global by annotate_pattern()
# and ``manager`` is passed to it explicitly.
env = Environment(keys={})
index = TypeIndex()
index.autoregister()
manager = Manager(index)

# Read all HOMSTRAD alignments (every ``.aln`` file below INPUT_DIR).
alignments = []
for dirpath, dirnames, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        if not filename.endswith('.aln'):
            continue

        # Join with the directory currently being walked, not INPUT_DIR:
        # os.walk also descends into subdirectories, and files found there
        # would otherwise be looked up at the wrong location.
        path = os.path.join(dirpath, filename)
        try:
            alignment = praline.load_alignment_fasta(path, praline.container.ALPHABET_AA)
        except praline.AlphabetError:
            # Skip files whose loading fails with an alphabet error.
            continue

        alignments.append(alignment)

In [5]:
def get_scores(pattern):
    """Compute the log-odds scores of one PROSITE pattern over all alignments.

    The global ``alignments`` are annotated for ``pattern`` (using the global
    ``manager``), the match statistics are pooled, and the annotation tracks
    are removed again before returning.

    :param pattern: PROSITE pattern string.
    :returns: ``(pattern, q_m, m_count, n_count, s_m_m, s_n_n, s_n_m,
        count_arrays)``, or ``None`` when ``calculate_log_odds`` raises
        ``ZeroDivisionError`` (e.g. no alignment matched the pattern).
    """
    for alignment in alignments:
        annotate_pattern(manager, alignment, pattern)

    # Single-argument form: prints the same text whether ``print`` is a
    # statement or a function.
    print("Statistics for pattern {0}:".format(pattern))
    count_arrays = count_motifs(pattern, alignments, skip_no_match=True)
    statistics = get_statistics(count_arrays)

    try:
        log_odds = calculate_log_odds(statistics)
    except ZeroDivisionError:
        # Nothing to divide by: the pattern is skipped for lack of matches.
        row = None
    else:
        q_m, m_count, n_count, s_m_m, s_n_n, s_n_m = log_odds
        row = (pattern, q_m, m_count, n_count, s_m_m, s_n_n, s_n_m, count_arrays)

    # Drop the temporary tracks so the shared alignments can be reused.
    for alignment in alignments:
        remove_pattern(alignment, pattern)

    return row

In [None]:
def _init(mgr, alns):
    """Pool initializer: install the shared PRALINE state in a worker process.

    Rebinds the module-level ``manager`` and ``alignments`` that
    ``get_scores`` reads as globals.

    :param mgr: PRALINE manager to use in the worker.
    :param alns: list of alignments to score in the worker.
    """
    # Presumably imported here so both packages are loaded inside each
    # worker process - the names themselves are not used in this function.
    import praline
    import mapraline

    global manager, alignments

    manager=mgr
    alignments=alns

# Score every pattern in parallel; each worker gets its own copy of the
# manager and the alignments through the initializer.
pool_ = multiprocessing.Pool(initializer=_init, initargs=[manager, alignments])
try:
    scores = pool_.map(get_scores, patterns)
finally:
    # Always release the worker processes, also when map() raises.
    pool_.close()
    pool_.join()

Statistics for pattern <M-R-[DE]-[IL]:
Statistics for pattern N-{P}-[ST]-{P}:
Statistics for pattern [LIVMF]-T-S-P-P-[FY]:
Statistics for pattern C-x-C-x(4)-D-x(2)-C-x(2)-[FY]-C:
Statistics for pattern [LIVM]-[ST]-A-[STAG]-H-C:
Statistics for pattern I-I-x-[GAC]-V-M-A-G-[LIVM](2):
Statistics for pattern [FYWL]-D-G-S-S-x(6,8)-[DENQSTAK]-[SA]-[DE]-x(2)-[LIVMFY]:
Statistics for pattern [DENSK]-x-[LIVMDET]-x(3)-[LIVMFTA](2)-x(6)-G-K-[KR]-x(5)-[LIVMF]-[LIVMFC]-x(2)-[STAC]:
Statistics for pattern [RK](2)-x-[ST]:
Statistics for pattern G-S-x(2)-N-x(2)-H-x-[PA]-[AG]-G(2):
Statistics for pattern C-C-x(5)-R-x(2)-[FY]-x(2)-C:
Statistics for pattern [DENKS]-x-[FLIV]-x(2)-[GSTC]-x-P-C-x-{V}-[FYWLIM]-S:
Statistics for pattern [IVM]-x-G-Q-D-x-V-K-x(5)-[KN]-G-x(3)-[STLV]:
Statistics for pattern K-P-[LIVMFYA]-x(3,5)-[NPAT]-[GA]-[GSTAN]-[GA]-x-H-x(3)-S:
Statistics for pattern [DNSTAGC]-[GSTAPIMVQH]-x(2)-G-[DE]-S-G-[GS]-[SAPHV]-[LIVMFYWH]-[LIVMFYSTANQH]:
Statistics for pattern [GE]-x(2)-[LIV](2)-[STY]-[S

Statistics for pattern [LIVMA]-G-[EQ]-H-G-[DN]-[ST]:
Statistics for pattern [GA]-x(0,2)-[YSA]-x(0,1)-[VFY]-{SEDT}-C-x(1,2)-[PG]-x(0,1)-H-x(2,4)-[MQ]:
Statistics for pattern G-[YV]-x-[ST]-x(2)-[IVAS]-G-K-x(0,1)-[FYWMK]-[HL]:
Statistics for pattern G-R-x-N-[LIV]-I-G-[DE]-H-x-D-Y:
Statistics for pattern H-R-H-R-G-H-x(2)-[DE](7):
Statistics for pattern [LIVMFY]-{G}-[LIVMFYAC]-[DNQ]-[RKHQS]-[PST]-F-[LIVMFY]-[LIVMFYC]-x-[LIVMFAH]:
Statistics for pattern C-x-[GNQ]-x(1,3)-G-x-C-x-C-x(2)-C-x-C:
Statistics for pattern [EQ]-{LNYH}-x-[ATV]-[FY]-{LDAM}-{T}-W-{PG}-N:
Statistics for pattern [LIVMA]-[AG]-[IVT]-[LIVMFY]-[AG]-x-G-[NHKRQGSAC]-[LIV]-G-x(13,14)-[LIVMFT]-{A}-x-[FYWCTH]-[DNSTK]:
Statistics for pattern C-{C}-{C}-[GA]-{C}-C-[GAST]-{CPDEKRHFYW}-C:
Statistics for pattern [LIV]-x-G-x-V-Q-[GH]-V-x-[FM]-R:
Statistics for pattern [LIVMF]-[LIVMFY]-[DN]-[LIVMFS]-G-[GSH]-[GS]-[AST]-x(3)-[ST]-[LIVM]-[LIVMFC]:
Statistics for pattern [FYW]-P-[EQH]-[LIV](2)-G-x(2)-[STAGV]-x(2)-A:
Statistics for pattern [LI

Statistics for pattern [IV]-D-L-G-T-[ST]-x-[SC]:
Statistics for pattern L-[FYW]-[QEDH]-F-[LI]-[LVQK]-{N}-[LI]-L:
Statistics for pattern [LIVMA]-C-{LIVMFYWPCST}-C-D-{GS}-{G}-{N}-x-{QS}-C:
Statistics for pattern [FY]-x(6)-C-C-x(2)-{C}-x(4)-C-[LFY]-x(6)-[LIVMFYW]:
Statistics for pattern V-x-H-x(33,40)-C-x(3)-C-x(3)-H-x(2)-M:
Statistics for pattern L-M-A-[EQ]-G-L-Y-N:
Statistics for pattern G-S-x(2)-M-x-{RS}-K-x-N:
Statistics for pattern W-A-x-G-[SH]-[LF]-M:
Statistics for pattern Y-x-[NQHD]-[KHR]-[DE]-[IVA]-F-[LM]-R-[ED]:
Statistics for pattern [RKHN]-x(2)-M-x-Y-[DENQ]-x-[LIVM]-[STAG]-R-[STAG]-[LI]-R-x-Y:
Statistics for pattern [LIV]-{KG}-[LIVFY]-[LIVMST]-G-[HYWV]-S-{YAG}-G-[GSTAC]:
Statistics for pattern R-P-C-x(11)-C-V-S:
Statistics for pattern [DENG]-{A}-[DENQGSTARK]-x(0,2)-[DENQARK]-[LIVFY]-{CP}-G-{C}-W-[FYWLRH]-{D}-[LIVMTA]:
Statistics for pattern G-x-[FYW]-x-[LIVMFYW]-x-[CST]-x-{PR}-{K}-x(2)-{S}-x-{LFH}-G-[LM]-x(3)-[LIVMFYW]:
Statistics for pattern [LIVTMS]-[LIVP]-[LIV]-[KQ]-x-[ND]-

Statistics for pattern [RK](2)-[AM]-[IVFYT]-[IV]-[RKT]-L-[STANEQK]-x(7)-[LIVMFT]:
Statistics for pattern [LIVMA]-{R}-E-G-[DN]-S-A-{F}-[STAG]:
Statistics for pattern C-x(2)-D-x(3,4)-[LIVM](2)-P-[LIVM]-x-[LIVM]-G-x(2)-[LIVM]-x-G-[LIVM](2)-x-[LIVM](4)-A-[FY]-x-[LIVM]-x(2)-[KR]-[RH]-x(1,2)-[STAG](2)-Y-[EQ]:
Statistics for pattern [LIVM]-x(2)-[LIVM]-[STAVC]-[GE]-[QV]-x(2)-[LIVMA]-x-[STC]-x-[STAG]-[KRH]-x-[STA]:
Statistics for pattern [RKN]-x-[LIVM]-x-G-[ST]-x(2)-[SNQ]-[LIVM]-G-x-{M}-[LIVM]-x(0,1)-[DENG]:
Statistics for pattern [SAG]-G-G-T-G-[SA]-G:
Statistics for pattern R-x(2)-[LIVMT]-x(2,3)-[FWY]-[QNYDI]-x(8,13)-[LVESI]-x-P-C-[HAVMLC]-x(3)-[QMTLHD]-[FYWL]-x(0,1)-[LV]:
Statistics for pattern H-[STAG]-{ADNV}-{VGFI}-{YAR}-[LIVME]-{SDEP}-x-[LIVMFYW]-P-[FYW]:
Statistics for pattern K-x-[NQEK]-[GT]-G-[DQ]-x-[LIVM]-x(3)-Q-S:
Statistics for pattern [KRM]-[PTKS]-x(3)-[LIVMFG]-x(2)-[NHS]-x(3)-R-[DNHY]-W-R-[RS]:
Statistics for pattern P-x(0,2)-[GSTAN]-[DENQGAPK]-x-[LIVMFP]-[HT]-[LIVMYAC]-G-[HNTG]-[L

Statistics for pattern G-[LIVMFYKRSAQT]-[LIVMAGPF]-[QAM]-x-[LIVMFYCA]-x-D-[AGIM]-[LIVMFTA]-[KS]-[LVMYSTI]-[LIVMFYGA]-x-[KRE]-[EQG]:
Statistics for pattern [IV]-x-G-[STAD]-[LIVT]-D-[FYI]-[IV]-[FSN]-G:
Statistics for pattern [DNSK]-[PSTV]-x-[SAG](2)-[GD]-D-x(3)-[SAGV]-[AG]-[LIVMFYA]-[LIVMSTAP]:
Statistics for pattern [LIVMFA]-[STAGC](2)-G-x-{TAV}-H-[STAGLI]-[LIVMFA]-{KI}-[LIVM]:
Statistics for pattern G-x(2)-[LIVMFA]-[LIVMF](2)-H-[LIVMF]-G-[LIVMF]-x-T-[LIVA]:
Statistics for pattern G-H-A-H-[SA]-G-M-G-K-[IV]-K:
Statistics for pattern [LIVM]-x-[DG]-x(2)-[GAEHS]-[NQSD]-[KS]-G-[TE]-G-x-W:
Statistics for pattern R-I-A-R-N-[TQ]-x(2)-[LIVMFY](2)-x-[EQH]-E-x(4)-[KRN]-x(2)-D-P-x-[GSA]-G-S:
Statistics for pattern P-F-D-[LIVMFYQN]-[STAGPVMI]-E-[GACS]-E-x(0,2)-[EQLN]-[LIVMS]-x(1,2)-G:
Statistics for pattern R-[ST]-H-[ST]-x(2)-A-x-G-G:
Statistics for pattern [LIVM]-[PK]-x-[GSTA]-x(0,1)-G-[LM]-[GS]-S-S-[GSA]-[GSTAC]:
Statistics for pattern [DENQ]-x(6)-[LIVMF]-[GA]-x(2)-[LIVM]-A-[LIVM]-P-H-[GAC]:
Stati

Statistics for pattern [NV]-x(5)-[GTR]-[LIVMA]-x-P-[PTLIVME]-x-G-[LIVM]-x(3)-[LIVMFW](2)-S-[YSAQ]-G-G-[STN]-[SA]:
Statistics for pattern [GSA]-Q-x-K-S-[FY]-x-Q-x-K-[SA]:
Statistics for pattern [ST]-x(3)-G-[DY]-G-[KR]-[IV]-[FW]-[LIVM]-x(2)-[LIVM]:
Statistics for pattern [LIVFYCHT]-[DGH]-[LIVMFYAC]-[LIVMFYA]-x(2)-[GSTAC]-[GSTA]-[HQR]-K-x(4,6)-G-x-[GSAT]-x-[LIVMFYSAC]:
Statistics for pattern C-C-[LIFYTRQ]-x(5,8)-[LIR]-x(4)-[LIVMFA]-x(2)-[FYWECI]-x(5,8)-C-x(3,4)-[SAG]-[LIVM](2)-[FL]-x(7,9)-C-[STAV]:
Statistics for pattern [FYW]-x-[PSTA]-x(7)-G-x-[LIVM]-x-[LIVM]-x-[FYWI]-x(2)-D-x(5)-P:
Statistics for pattern [LIVMF]-H-C-x(2)-G-x(2)-R-[STC]-[STAGP]:
Statistics for pattern G-x-[SA]-G-E-[LIVM]-R-Y-P-S-Y:
Statistics for pattern A-R-P-x(3)-K-x-S-x-T-N-A-Y-N-V-T-T-x(2)-[DN]-G-x(3)-Y-G:
Statistics for pattern G-[IV]-[GK]-x-W-[ST]-[AVI]-x-[LIVMFY](2)-x-[LIVM]-x(8)-[MF]-x(2)-[ED]-D:
Statistics for pattern [LIVMGSTAN]-{IEVK}-H-[GSACE]-[LIVM]-{GPSI}-[LIVMAT](2)-G-{SLAG}-[GSADNH]:
Statistics for patter

Statistics for pattern R-[FYW]-x-[DA]-[KA]-x(0,1)-[LIVMFY]-x-[LIVMFY](2)-x(3)-[DNS]-[GSA]-x(6)-[DE]-[HS]-x(3)-[DE]-[GAC]:
Statistics for pattern H-x-H-L-D-H-[LIVM]-x-[GS]-[LIVMA]-[LIVM](2)-x-S-[AP]:
Statistics for pattern [GSAH]-x-[LIVMF](3)-D-E-[ALIV]-H-[NECR]:
Statistics for pattern [FYWHPVAS]-x(3)-C-x(3,4)-[SG]-x-[FYW]-x(3)-Q-x(5,12)-[FYW]-C-[VA]-x(3,4)-[SG]:
Statistics for pattern K-[LIVM]-x-R-D-x(3)-R-G-x-[ST]-x-E:
Statistics for pattern [QL]-G-[LMFCAV]-[LIVMFTA]-[LIV]-x-[LIVFSTM]-[LIFHV]-[VFYHLG]-C-[LFYAVI]-x-[NKRQDS]-x(2)-[VAI]:
Statistics for pattern [EQ]-x-L-Y-[DEQSTLM]-x(3,12)-[LIVST]-[ST]-Y-x-R-[ST]-[DEQSN]:
Statistics for pattern H-x-K-R-[LIVMF]-[SANK]-x-P-x(2)-[WY]-x-[LIVM]-x-[KRP]:
Statistics for pattern R-[LIVMFYS]-x-[LIVM]-x-[QHG]-x-G-C-[FYNA]-[GAPV]-G-[GAC]-[STAVK]-x-[LIVMF]-[RAL]:
Statistics for pattern [DENQLF]-[KRVW]-N-[HRY]-[STAPV]-[SAC]-[LIVMFS]-[LIVMFSA]-[LIVMFS]-W-[GSV]-x(2,3)-N-E:
Statistics for pattern [SA]-[LIVM]-[NGS]-[STA]-D-D-P:
Statistics for pattern [DN]

Statistics for pattern [GRH]-[DEQKG]-[STVM]-[LIVMA](3)-[GA]-G-[LIVMFY]-x(11)-[LIVM]-P-[LIVMFYWGS]-[LIVMF]-[GSAE]-x-[LIVMS]-P-[LIVMFYW]-[LIVMFYWS]-x(2,3)-[LV]-[FK]:
Statistics for pattern G-S-x-[AG]-[KRN]-x-T-x-L-[KRN]-x(3)-[DE]-x-[DET]-[LM]-[VI]-x-F:
Statistics for pattern C-x(2)-[DE]-G-[DEQKRG]-W-x(2,3)-[PAQ]-[LIVMT]-[GT]-x-C-x-C-x(2)-G-[HFY]-[EQ]:
Statistics for pattern G-[AVP]-[DT]-[LIVMTAS]-[CG]-G-[FY]-x(3)-[STP]-x(3)-L-[CL]-x-R-W-x(2)-[LVMI]-[GSA]-[SA]-[FY]-x-P-[FY]-x-R-[DNA]:
Statistics for pattern G-x(2)-[LIVMFY](2)-x-[IF]-x-E-x(2)-[LIVM]-x-G-Y-P:
Statistics for pattern H-x(3)-[GA]-[LIVMT]-R-[HF]-[LIVMF]-x-[FYWM]-D-x-[GVA]:
Statistics for pattern Y-x(2)-[HP]-W-[FYH]-[APS]-[DE]-x-P-x-K-G-x-[GA]-[FY]-R-C-[IV]-[RH]-[IV]:
Statistics for pattern [LIVM]-[TS]-[NK]-[DN]-[GA]-[AVNHK]-[TAVC]-[LIVM](2)-x(2)-[LIVMA]-x-[LIVM]-x-[SNH]-[PQHA]:
Statistics for pattern G-G-S-[AN]-[GA]-Q-S-S-x(2)-Q:
Statistics for pattern R-P-[VI]-I-L-D-P-x-[DE]-P-T:
Statistics for pattern [LIVM]-x-[AG]-[LIVMF](2)

Statistics for pattern [QYR]-[GH]-[DNEAR]-x-[LIV]-[KR]-x(2)-K-x(2)-[KRNG]-[AS]-x(4)-[LIV]-[DENKA]-x(2)-[IV]-x(2)-L-x(3)-K:
Statistics for pattern [LIVMFY](2)-[EK]-x-G-[LIVM]-[GA]-G-x(2)-D-x-[GST]-x-[LIVM](2):
Statistics for pattern [LIVM]-[VIC]-x-{H}-G-[DENQTA]-x-[GAC]-{L}-x-[LIVMFY](4)-x(2)-G:
Statistics for pattern [KRHN]-x-[DEQN]-[DEQNK]-x(3)-C-G-G-[AG]-[FY]-[LIVM]-[KN]-[LIVMFY](2):
Statistics for pattern N-x-[LIVMFYWD]-R-[STACN](2)-H-Y-P-x(4)-[LIVMFYWS](2)-x(3)-[DN]-x(2)-G-[LIVMFYW](4):
Statistics for pattern A-x(3)-[GDTN]-[IF]-x-[DNQTKEH]-x-[DEAQ]-x-[LIVM]-x-[LIVMC]-x-[NS]-x(2)-[GS]-x(4,5)-[AV]-x-[LIVMEF]-[STY]:
Statistics for pattern [LIVAMSFT]-x(3)-[GAHDVSI]-x-[GSAIVCT]-R-[LIVMCAFST]-[DE]-[LIVMFAYGT]-[LIVMFAR]-x(7,12)-[LIVWCAF]-x-[EK]-[LIVAPMT]-N-[STPA]-x-P-[GA]:
Statistics for pattern G-[LIVMFY]-x(1,3)-[AGCY]-[NASMQG]-x-C-[FYWC]-[LIVMFCA]-[NSTAD]-[SACV]-x-[LIVMSF]-[QF]:
Statistics for pattern [LIV]-[AGD]-F-P-[CS]-[NG]-Q-F:
Statistics for pattern E-[KQ]-x-[SC]-H-[HR]-[PG]-[PL]-x

Statistics for pattern N-x(3)-[DEH]-x(2)-[LIMFYT]-D-x(2)-[VM]-x-R-[ST]-x(2)-R-x(4)-[GYNKR]:
Statistics for pattern [LIVMFW]-x(2)-H-x-H-[DN]-D-x-G-x-[GAS]-x-[GASLI]:
Statistics for pattern [LIVMFW]-H-x-N-[DEG]-[SA]-x(4)-[GNAQ]-x(3)-D-x-H:
Statistics for pattern [GSTNAD]-x(2)-[GAS]-x-G-[GC]-[IM]-x-[STAG]-K-[LIVMCT]-x-[SAI]-[TCAGFS]-x(2)-[GALVCMI]:
Statistics for pattern [GTND]-[FPMI]-x-[LIVMH]-x-[DEAT]-x(2)-[GA]-x-[GTAM]-[STA]-x-G-H-x-[LIVM]-[GAS]:
Statistics for pattern C-F-W-K-Y-C:
Statistics for pattern [LIVM](2)-[KR]-x-[EQKRD]-x(4)-G-[LIVMFTC]-[LIVT]-[LIVMF]-[ST]-D-x(2)-[SGADNIT]:
Statistics for pattern [DENGQST]-[LIVMPF]-[LIVM]-x(1,2)-[KRNQELD]-[DENKGS]-[LIVM]-x(3)-[STG]-x-C-[EP]-H-H:
Statistics for pattern Y-S-x-[KR]-Y-x-[DE](2)-x-[FY]-E-Y-R-H-V-x-[LV]-[PT]-[KRP]:
Statistics for pattern P-x(4)-C-D-x-R-[LIVM](2)-x-[KR]-x(14)-C:
Statistics for pattern [LIVMT]-x-[LIVM]-[KR]-L-[STAK]-R-{E}-G-[AKR]:
Statistics for pattern [CH]-[AGV]-E-x(2)-[LIVMFGAT]-[LIVM]-x(17,33)-P-C-x(2,8)-C-x(3)-[L

Statistics for pattern P-[LIVM]-x-[LIVM]-H-x-R-x-[TA]-x-[DE]:
Statistics for pattern [LV]-Y-[IVC]-P-R-K-C-S-[SAT]:
Statistics for pattern [DNHKR]-[LIVMF]-x-[LIVMF](2)-[VSTAC]-[STAC]-G-x-G-[GKN]-G-T-G-[ST]-G-[GSARC]-[STA]-P-[LIVMFT]-[LIVMF]-[SGAV]:
Statistics for pattern [GLES]-x-[LIVM]-x(2)-L-[KR]-[KRHNS]-x-K-x(5)-[LIVM]-x(2)-[GNKADS]-x-[DEN]-[CRG]-[GI]:
Statistics for pattern [GN]-[DNQPSA]-x-C-[GSTANK]-[GSTADNQ]-[STNQI]-[PTIV]-x-C-C-[DENQKPST]:
Statistics for pattern A-E-[KR]-R-E-H-E-[KR]-E-V:
Statistics for pattern [LIVMFAC]-K-x(1,3)-[DEA]-[DE]-[LIVMCP]-R-Q-[DE]-x(4)-Q:
Statistics for pattern [LIVNS]-x-{L}-[LIVMFA]-x-C-x-[STAGCDNH]-C-x(3)-[LIVFG]-{LV}-x(2)-[LIV]-x(9,11)-[IVA]-x-[LVFYS]:
Statistics for pattern [LVSAT]-[LIVA]-x(2)-[LIVMT]-[PSD]-x(3)-[LI]-[LIVMT]-[LIVMST]-E-T-D-x-P:
Statistics for pattern L-C-C-x-[KR]-C-x(4)-[DE]-x-N-x(4)-C-x-C-R-V-P:
Statistics for pattern [LIVM]-[DNG]-[LIVMF]-N-x-G-C-[PS]-x(3,4)-[LIVMASQ]-x(5,6)-G-[SACY]:
Statistics for pattern [CSH]-C-x(2)-[GAP]-x(7,

Statistics for pattern [LIVM]-[SADN]-x(2)-C-x-R-[LIVM]-x(4)-[GSC]-H-[STA]:
Statistics for pattern P-[LIVMF]-K-[LIVMF](5)-x-[LIVMA]-[DNGS]-G-W:
Statistics for pattern [LMFYCVI]-[DN]-R-x(3)-[PGA]-L-[LIVMCA]-E-[LIVMT]-x-[STL]-x-[PA]:
Statistics for pattern E-[DNQ]-x(8,17)-Y-x(7)-D-x-[RD]-[GP]-x-[TS]-x(3)-[AIVFLY]-G-x(5,11)-D:
Statistics for pattern [GN]-x-[DE]-[KRHST]-[LIVMFA]-[LIVMF]-P-[IV]-D-[LIVMFYWA]-[LIVMFYWK]-x-P-x-C-P-[PT]:
Statistics for pattern A-[LIVM]-x-[STAN]-x(2)-[LI]-x-[KRNQ]-[GSA]-H-[LM]-x-[FYLH]:
Statistics for pattern [LIVMF]-[QKRHSA]-{E}-x-[LIVMAC]-x(5,6)-[LIVMW]-[RKAYF]-x-[STACIVMF]-[PV]-{LG}-[LIVMF]-x-[FYI]-x(2)-D:
Statistics for pattern E-x-F-x(2)-G-[SA]-[LIVM]-C-x(4)-G-x-C-x-[LIVM]-S:
Statistics for pattern S-V-A-G-L-G-G-C-P-Y:
Statistics for pattern [KR]-x-G-[KR]-G-F-[ST]-[LVF]-x-E-[LVI]-x(3)-G:
Statistics for pattern [GR]-C-[IV]-G-R-[ILS]-x-W:
Statistics for pattern [LV]-P-[VI]-[VTPI]-[NQLHT]-[FL]-[ATVS]-[AS]-G-G-[LIV]-[AT]-T-P-[AQS]-D-[AGVS]-[AS]-[LM]:
Statistics 

Statistics for pattern P-x(8,10)-[LM]-R-x-[GE]-[LIVP]-x-G-C:
Statistics for pattern [FY]-x-[GSHE]-x(2)-[IVLF]-x-P-[GA]-x-G-x(2)-[FYV]-x-[KRHE]-x-D:
Statistics for pattern [LIVMY]-[VI]-H-[GA]-D-[LF]-[SN]-E-[FY]-N-x-[LIVM]:
Statistics for pattern C-C-[SHYN]-x(0,1)-[PRG]-[RPATV]-C-[ARMFTNHG]-x(0,4)-[QWHDGENLFYVP]-[RIVYLGSDW]-C:
Statistics for pattern R-P-L-[IV]-x-[NS]-F-G-S-[CA]-[TS]-[CU]-P-x-F:
Statistics for pattern [LIVM]-x(3)-[GNH]-x(0,1)-[LITCRV]-x-[LIVWF]-x-[LIVMF]-x-[GS]-[LIVM]-G-x-[DENV]-G-[HN]:
Statistics for pattern [LIV]-[LIVMGSTC]-[DET]-[RH]-[FYHCS]-x(2)-S-[GSTNP]-x-[AVC]-[FY]-[STANQ]:
Statistics for pattern [STNAQ]-[LIAMV]-x(0,1)-[RNGSYKE]-x(4,5)-[LM]-[EIVLA]-x(2)-[GESD]-[LFYWHA]-[LIVC]-x(7)-[DNS]-[RKQG]-[RK]-x(6)-[TS]-x(2)-[GAS]:
Statistics for pattern [RH]-G-x(2)-P-x-G(3)-x-[LIV]:


In [44]:
# Hand-picked PROSITE patterns for manual inspection. The criteria behind the
# outlier / normal grouping are not shown in this notebook.
OUTLIER_PATTERNS = [
    "[GSTALIVN]-{PCHR}-{KND}-H-E-[LIVMFYW]-{DEHRKP}-H-{EKPC}-[LIVMFYWGSPQ]",
    "G-[GA]-x-[STN]-x-H-[STA]-[STAV]-[LIVM](2)-[STAV]-[RG]",
    "[GSTADE]-[KREQSTIV]-x-{EPRK}-{VPGL}-x-[KRDN]-S-[LIVMF](2)-{EVPL}-[LIVM]-{EATN}-x-[LIVM]-[GADE]"
]

# Not referenced by the code below; swap into ``pattern`` to compare.
NORMAL_PATTERNS = [
    "[LIVMFYC]-{A}-[HY]-x-D-[LIVMFY]-[RSTAC]-{D}-{PF}-N-[LIVMFYC](3)",
    "[LIVM](2)-H-[NHA]-Y-G-x-[GSA](2)-x-G-x(5)-G-x-A",
]

# The pattern under inspection.
pattern = OUTLIER_PATTERNS[0]

# Same steps as get_scores(), run inline so that the intermediate
# ``count_arrays`` and ``statistics`` stay available as globals for the
# inspection cells that follow.
for alignment in alignments:
    annotate_pattern(manager, alignment, pattern)

print "Statistics for pattern {0}:".format(pattern)
count_arrays = count_motifs(pattern, alignments, skip_no_match=True)
statistics = get_statistics(count_arrays)

# Unlike get_scores(), a ZeroDivisionError is not caught here.
q_m, m_count, n_count, s_m_m, s_n_n, s_n_m = calculate_log_odds(statistics)

# A score printed as None means math.log rejected the ratio (ValueError).
print "\tq_m =", q_m, "m_count =", m_count, "n_count=", n_count
print "\ts(M, M) =", s_m_m
print "\ts(*, *) =", s_n_n
print "\ts(*, M) =", s_n_m

# Remove the temporary annotation tracks again.
for alignment in alignments:
    remove_pattern(alignment, pattern)


Statistics for pattern [GSTALIVN]-{PCHR}-{KND}-H-E-[LIVMFYW]-{DEHRKP}-H-{EKPC}-[LIVMFYWGSPQ]:
	q_m = 0.02176541717049577 m_count = 90 n_count= 4045
	s(M, M) = 11.7518802698
	s(*, *) = 0.0455791167388
	s(*, M) = None


In [45]:
# Debug dump: print every [match, non-match] row of every count array left
# behind by the inspection cell above. NOTE(review): this prints one line per
# alignment step, which produces very long output.
for arr in count_arrays:
    print "=== ARR MATEY ==="
    for row in arr:
        print row
        

=== ARR MATEY ===
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2]
[0 2

In [46]:
# Manual re-computation of s(M, M) from the global ``statistics`` as a
# cross-check of calculate_log_odds(). Totals start as floats, so the
# divisions below are true divisions.
tot_m_count = 0.0
tot_n_count = 0.0
tot_all_count = 0.0
# Sum of per-alignment match frequencies; only used by the commented-out
# alternative definition of q_m further down.
tot_q_m = 0.0

tot_n_m_pairs = 0.0
tot_n_n_pairs = 0.0
tot_m_m_pairs = 0.0
tot_all_pairs = 0.0
for row in statistics:
    m_count, n_count, all_count, m_m_pairs, n_n_pairs, n_m_pairs, all_pairs = row
    
    tot_m_count += m_count
    tot_n_count += n_count
    tot_all_count += all_count
    # Relies on ``from __future__ import division`` from the first cell when
    # the counts are integers.
    tot_q_m += (m_count / all_count)
    
    tot_m_m_pairs += m_m_pairs
    tot_n_m_pairs += n_m_pairs
    tot_n_n_pairs += n_n_pairs
    tot_all_pairs += all_pairs

    #print m_count, all_count
    #print m_m_pairs, all_pairs
    
# Pooled pair frequency and pooled background frequency of matches.
p_m_m = tot_m_m_pairs / tot_all_pairs
q_m = tot_m_count / tot_all_count
# Alternative: average of the per-alignment frequencies instead of pooling.
#q_m = tot_q_m / len(statistics)
# Same formula as s_m_m in calculate_log_odds().
print 2 * math.log(p_m_m / (q_m * q_m), 2)
print p_m_m, q_m
#print tot_m_m_pairs + tot_n_m_pairs + tot_n_n_pairs, tot_n_m_pairs, tot_all_pairs

11.7518802698
0.0278207109737 0.0217654171705


In [9]:
# Keep only the patterns that have a usable s(M, M) score, reduced to plain
# Python types so the result can be pickled without numpy scalars.
filtered_scores = []
for row in scores:
    # get_scores() returns None for patterns it had to skip.
    if row is None:
        continue

    # Rows from get_scores() carry further fields after s_m_m (s_n_n, s_n_m,
    # count_arrays) that are not stored; slicing accepts rows with or without
    # those trailing fields.
    pattern, q_m, m_count, n_count, s_m_m = row[:5]
    if s_m_m is None or np.isnan(s_m_m):
        continue

    filtered_scores.append((str(pattern), float(q_m), int(m_count), int(n_count), float(s_m_m)))

In [11]:
# Persist the filtered per-pattern scores. ``open`` replaces the ``file``
# builtin, which only exists in Python 2.
with open('./output/alpha_per_pattern.pickle', 'wb') as fo:
    pickle.dump(filtered_scores, fo)