In [15]:
def trim_ppm(ppm, t=0.45):
    """Trim low-confidence flanks from a position probability matrix.

    Rows are dropped from the left up to the first row whose largest entry is
    >= t, and from the right back to the last such row. Rows in between are
    kept unchanged, even if their own maximum is below t.

    Args:
        ppm: matrix whose rows are positions; the maximum is taken over the
            last axis.
        t: minimum per-row maximum for a row to count as an anchor.

    Returns:
        The slice of ppm from the first to the last qualifying row
        (inclusive). If no row qualifies, a zero-row slice is returned
        instead of raising IndexError.
    """
    row_max = np.max(ppm, -1)
    keep = np.where(row_max >= t)[0]
    # No position reaches the threshold: everything is flank, so nothing is left.
    if keep.size == 0:
        return ppm[:0]
    return ppm[keep[0]:keep[-1] + 1]
        
def write_meme_file(ppm, bg, fname):
    """Write a single motif to a text file in MEME version 4 format.

    The file declares the alphabet ACGT, both strands, the given background
    frequencies, and one motif named "1 TEMP" whose width is ppm.shape[0].

    Args:
        ppm: matrix with a .shape attribute; each row must hold exactly four
            values, written with five decimals.
        bg: four background frequencies, written with three decimals in the
            order A, C, G, T.
        fname: path of the file to create (an existing file is overwritten).
    """
    # The context manager closes the handle even if formatting a row raises
    # (e.g. a row with the wrong number of entries).
    with open(fname, 'w') as f:
        f.write('MEME version 4\n\n')
        f.write('ALPHABET= ACGT\n\n')
        f.write('strands: + -\n\n')
        f.write('Background letter frequencies (from unknown source):\n')
        f.write('A %.3f C %.3f G %.3f T %.3f\n\n' % tuple(bg))
        f.write('MOTIF 1 TEMP\n\n')
        f.write('letter-probability matrix: alength= 4 w= %d nsites= 1 E= 0e+0\n' % ppm.shape[0])
        for row in ppm:
            f.write('%.5f %.5f %.5f %.5f\n' % tuple(row))

def fetch_tomtom_matches(ppm, background=[0.25, 0.25, 0.25, 0.25], tomtom_exec_path='tomtom', motifs_db='HOCOMOCOv11_core_HUMAN_mono_meme_format.meme' , n=5, temp_dir='./', trim_threshold=0.45):
    """Fetches top matches from a motifs database using TomTom.

    The ppm is trimmed, written to a temporary MEME file in temp_dir, and
    compared against motifs_db by running the TomTom executable in text mode.

    Args:
        ppm: position probability matrix- numpy matrix of dimension (N,4)
        background: list with ACGT background probabilities
        tomtom_exec_path: path to TomTom executable
        motifs_db: path to motifs database in meme format
        n: maximum number of matches to return; the first n result rows of
            TomTom's output are used, in the order TomTom reports them
        temp_dir: directory for storing temp files
        trim_threshold: the ppm is trimmed from left till first position for which
            probability for any base pair >= trim_threshold. Similarly from right.

    Returns:
        list: a list of up to n results returned by tomtom, each entry is a
            dictionary with keys 'Target_ID', 'p-value', 'E-value', 'q-value'.
            An empty list is returned when no position of the ppm reaches
            trim_threshold (nothing is left to query) or when TomTom prints
            no output.

    Raises:
        subprocess.CalledProcessError: if the TomTom command exits non-zero.
    """
    # Apply the documented trimming. The emptiness check is done here so the
    # function does not depend on how trim_ppm reacts to an all-flank matrix.
    ppm = np.asarray(ppm)
    if not (np.max(ppm, -1) >= trim_threshold).any():
        return []
    trimmed = trim_ppm(ppm, t=trim_threshold)

    fname = os.path.join(temp_dir, 'query_file')
    try:
        # prepare meme file
        write_meme_file(trimmed, background, fname)

        # run tomtom
        # NOTE(review): the command is interpolated into a shell string, so
        # paths containing spaces or shell metacharacters will break it.
        cmd = '%s -no-ssc -oc . -verbosity 1 -text -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 %s %s' % (tomtom_exec_path, fname, motifs_db)
        out = subprocess.check_output(cmd, shell=True)
    finally:
        # Remove the query file even when TomTom fails, without shelling out.
        if os.path.exists(fname):
            os.remove(fname)

    # prepare output: decode the bytes instead of parsing their repr, and drop
    # blank lines so a short result set cannot yield malformed rows. Lines
    # starting with '#' are skipped as well -- presumably some TomTom versions
    # append a commented footer (TODO confirm).
    lines = [ln for ln in out.decode().splitlines()
             if ln.strip() and not ln.startswith('#')]
    if not lines:
        return []

    schema = lines[0].split('\t')
    tget_idx = schema.index('Target_ID')
    pval_idx = schema.index('p-value')
    eval_idx = schema.index('E-value')
    qval_idx = schema.index('q-value')

    matches = []
    for line in lines[1:1 + n]:
        fields = line.split('\t')
        matches.append({
            'Target_ID': fields[tget_idx],
            'p-value': float(fields[pval_idx]),
            'E-value': float(fields[eval_idx]),
            'q-value': float(fields[qval_idx]),
        })
    return matches

In [16]:
import subprocess

In [17]:
# Query TomTom with a one-hot PPM spelling TGACTCA (columns are A, C, G, T).
fetch_tomtom_matches(
    np.array([
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
    ]),
    motifs_db='/mnt/lab_data3/soumyak/adpd/HOCOMOCOv11_core_HUMAN_mono_meme_format.meme',
)

[{'Target_ID': 'FOS_HUMAN.H11MO.0.A',
  'p-value': 3.80182e-06,
  'E-value': 0.00152453,
  'q-value': 0.000852542},
 {'Target_ID': 'JUNB_HUMAN.H11MO.0.A',
  'p-value': 6.33636e-06,
  'E-value': 0.00254088,
  'q-value': 0.000852542},
 {'Target_ID': 'JUND_HUMAN.H11MO.0.A',
  'p-value': 6.33636e-06,
  'E-value': 0.00254088,
  'q-value': 0.000852542},
 {'Target_ID': 'JUN_HUMAN.H11MO.0.A',
  'p-value': 6.33636e-06,
  'E-value': 0.00254088,
  'q-value': 0.000852542},
 {'Target_ID': 'FOSL1_HUMAN.H11MO.0.A',
  'p-value': 7.60363e-06,
  'E-value': 0.00304906,
  'q-value': 0.000852542}]

In [18]:
# NOTE(review): in the cells shown here `out` is only assigned inside
# fetch_tomtom_matches, so this cell presumably relied on leftover kernel
# state from interactive debugging -- confirm `out` exists at notebook scope
# before re-running.
out

b'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation\n1\tFOS_HUMAN.H11MO.0.A\t1\t3.80182e-06\t0.00152453\t0.000852542\t7\tTGACTCA\tGTGACTCAC\t-\n1\tJUNB_HUMAN.H11MO.0.A\t2\t6.33636e-06\t0.00254088\t0.000852542\t7\tTGACTCA\tGGTGACTCAGA\t-\n1\tJUND_HUMAN.H11MO.0.A\t2\t6.33636e-06\t0.00254088\t0.000852542\t7\tTGACTCA\tGATGACTCATC\t-\n1\tJUN_HUMAN.H11MO.0.A\t2\t6.33636e-06\t0.00254088\t0.000852542\t7\tTGACTCA\tGATGACTCATC\t-\n1\tFOSL1_HUMAN.H11MO.0.A\t2\t7.60363e-06\t0.00304906\t0.000852542\t7\tTGACTCA\tGGTGACTCATCC\t-\n1\tFOSL2_HUMAN.H11MO.0.A\t2\t7.60363e-06\t0.00304906\t0.000852542\t7\tTGACTCA\tGATGACTCATCC\t-\n1\tFOSB_HUMAN.H11MO.0.A\t1\t8.27176e-06\t0.00331698\t0.000852542\t7\tTGACTCA\tATGACTCAT\t-\n1\tNFE2_HUMAN.H11MO.0.A\t3\t8.8709e-06\t0.00355723\t0.000852542\t7\tTGACTCA\tGCATGACTCAGCA\t+\n1\tNF2L2_HUMAN.H11MO.0.A\t1\t1.01382e-05\t0.0040654\t0.000866074\t7\tTGACTCA\tATGACTCAGCAGTT\t-\n1\tBACH2_HUMAN.H11MO.0.A\t1\t