## Preparation

In [23]:
import subprocess
import os
import pandas as pd
from Bio import motifs
from Bio.motifs import meme
import re

# ===== MOTIFS ELICITATION CONFIGURATION =====
# Class labels; each one is combined with a "positive"/"negative" bucket to
# build the FASTA file names read by the MEME cells.
label_names = [
    "chromatin", "cytoplasm", "cytosol", "ER", "extracellular",
    "membrane", "mitochondrion", "nucleolus", "nucleoplasm", "nucleus", "ribosome"
]

# Paths
# Executables are referenced by absolute path inside a specific conda env.
meme_path = "/home/jingqi/miniconda3/envs/rnalocate_old/bin/meme"
tomtom_path = "/home/jingqi/miniconda3/envs/rnalocate_old/bin/tomtom"
# Input FASTA directory and per-run output roots (relative to the working directory).
input_dir = "meme_input_split"
meme_output_dir_short = "meme_results_short"
meme_output_dir_long = "meme_results_long"
tomtom_output_dir_short = "tomtom_results_short"
tomtom_output_dir_long = "tomtom_results_long"

# Databases
# "~" is expanded here: subprocess.run with an argument list does not go
# through a shell, so a literal "~/..." would be handed to the tool unchanged
# and fail to resolve. expanduser is a no-op on an already-expanded path.
databases = {
    "CISBP_RNA": os.path.expanduser(
        "~/meme_db/motif_databases/CISBP-RNA/Mus_musculus.dna_encoded.meme"
    ),
    "RNA": os.path.expanduser(
        "~/meme_db/motif_databases/RNA/Ray2013_rbp_Mus_musculus.dna_encoded.meme"
    ),
    "miRBase": os.path.expanduser(
        "~/meme_db/motif_databases/MIRBASE/22/Mus_musculus_mmu.dna_encoded.meme"
    ),
}


# Create output directories
# exist_ok=True keeps this cell safe to re-run.
for out_dir in (
    meme_output_dir_short,
    meme_output_dir_long,
    tomtom_output_dir_short,
    tomtom_output_dir_long,
):
    os.makedirs(out_dir, exist_ok=True)

# One sub-directory per database under each tomtom output root.
for db_name in databases:
    os.makedirs(f"{tomtom_output_dir_short}/{db_name}", exist_ok=True)
    os.makedirs(f"{tomtom_output_dir_long}/{db_name}", exist_ok=True)

# count sequences in FASTA file
def count_fasta_sequences(fasta_path):
    """Return how many header lines a FASTA file contains.

    Args:
        fasta_path: Path of the FASTA file to read (opened in text mode).

    Returns:
        The number of lines that begin with '>'; 0 when there are none.
    """
    with open(fasta_path, 'r') as handle:
        # Only a '>' in the first column counts; '>' elsewhere in a line is ignored.
        return sum(1 for record_line in handle if record_line.startswith('>'))

# Progress marker: reached only after the configuration statements above have run.
print(" Configuration loaded")


# ===== FILTERING CONFIGURATION =====
# NOTE(review): none of these thresholds is used in the cells visible here; the
# meanings below are inferred from the names only - confirm against the cell
# that applies them.
MIN_SITE_PERCENT = 0.02  # presumably a minimum fraction (2%) of sequences carrying a motif site - TODO confirm
E_VALUE_THRESH = 0.05  # presumably an upper bound on a motif/match E-value - TODO confirm
P_VALUE_THRESH = 0.0005  # presumably an upper bound on a match p-value - TODO confirm
Q_VALUE_THRESH = 0.1  # presumably an upper bound on a match q-value - TODO confirm

# Parse MEME results to get site counts per motif
def get_motif_sites(meme_txt_file):
    """Map motif identifiers found in a MEME text report to their site counts.

    The whole file is read into memory and scanned for the token "MOTIF"
    followed by an identifier, paired with the first "sites = <int>" that
    appears after it (the search may cross line boundaries).

    Args:
        meme_txt_file: Path of the MEME text output to parse.

    Returns:
        dict of {motif identifier (str): site count (int)}. The identifier is
        the first whitespace-delimited token after "MOTIF". If an identifier
        occurs more than once, the last count found wins. Empty when nothing
        matches.
    """
    with open(meme_txt_file, 'r') as handle:
        report_text = handle.read()

    # DOTALL lets ".*?" run past newlines between the identifier and "sites =".
    site_regex = re.compile(r'MOTIF\s+(\S+).*?sites\s*=\s*(\d+)', re.DOTALL)

    return {
        hit.group(1): int(hit.group(2))
        for hit in site_regex.finditer(report_text)
    }
    
# Last statement of the setup cell; it prints only if everything above ran without raising.
print(" ALL SET!")

 Configuration loaded


## MEME

### Shorter

In [2]:
# Short-motif pass: one MEME run per (class, bucket) FASTA file, limited to
# motif widths 4-10, with results written under meme_output_dir_short.
short_width_args = ["-minw", "4", "-maxw", "10"]

for class_name in label_names:
    for bucket in ('positive', 'negative'):
        fasta_file = f"{input_dir}/{class_name}_{bucket}_motifs.fasta"

        if os.path.exists(fasta_file):
            print(f"\nProcessing: {class_name} {bucket}")

            meme_out = f"{meme_output_dir_short}/{class_name}_{bucket}"
            # "-oc" lets MEME reuse (and overwrite) an existing output directory;
            # "-mod zoops" with "-nmotifs 10" asks for up to ten ZOOPS-model motifs.
            meme_cmd = [
                meme_path, fasta_file,
                "-dna",
                "-oc", meme_out,
                "-mod", "zoops",
                "-nmotifs", "10",
                *short_width_args,
            ]

            print("Running MEME...")
            # check=True: a non-zero MEME exit raises CalledProcessError and
            # stops the remaining runs.
            subprocess.run(meme_cmd, check=True)
        else:
            # Missing inputs are reported and skipped rather than treated as errors.
            print(f"Skipping {fasta_file} (not found)")

print("\n Motifs searching complete!")



Processing: chromatin positive
Running MEME...


Writing results to output directory 'meme_results_short/chromatin_positive'.
Initializing the motif probability tables for 2 to 849 sites...
nsites = 849
Done initializing.
SEEDS: highwater mark: seq 848 pos 50

seqs=   849, min=  50, max=   50, total=    42450

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 849, iter=  40   



Processing: chromatin negative
Running MEME...


Writing results to output directory 'meme_results_short/chromatin_negative'.
Initializing the motif probability tables for 2 to 828 sites...
nsites = 828
Done initializing.
SEEDS: highwater mark: seq 827 pos 50

seqs=   828, min=  50, max=   50, total=    41400

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 828, iter=  40   



Processing: cytoplasm positive
Running MEME...


Writing results to output directory 'meme_results_short/cytoplasm_positive'.
Initializing the motif probability tables for 2 to 697 sites...
nsites = 697
Done initializing.
SEEDS: highwater mark: seq 696 pos 50

seqs=   697, min=  50, max=   50, total=    34850

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   



Processing: cytoplasm negative
Running MEME...


Writing results to output directory 'meme_results_short/cytoplasm_negative'.
Initializing the motif probability tables for 2 to 533 sites...
nsites = 533
Done initializing.
SEEDS: highwater mark: seq 532 pos 50

seqs=   533, min=  50, max=   50, total=    26650

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 533, iter=  40   



Processing: cytosol positive
Running MEME...


Writing results to output directory 'meme_results_short/cytosol_positive'.
Initializing the motif probability tables for 2 to 515 sites...
nsites = 515
Done initializing.
SEEDS: highwater mark: seq 514 pos 50

seqs=   515, min=  50, max=   50, total=    25750

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 515, iter=  40   



Processing: cytosol negative
Running MEME...


Writing results to output directory 'meme_results_short/cytosol_negative'.
Initializing the motif probability tables for 2 to 353 sites...
nsites = 353
Done initializing.
SEEDS: highwater mark: seq 352 pos 50

seqs=   353, min=  50, max=   50, total=    17650

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 353, iter=  40   



Processing: ER positive
Running MEME...


Writing results to output directory 'meme_results_short/ER_positive'.
Initializing the motif probability tables for 2 to 139 sites...
nsites = 139
Done initializing.
SEEDS: highwater mark: seq 138 pos 50

seqs=   139, min=  50, max=   50, total=     6950

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 139, iter=  40   
Writing results to output directory 'meme_results_short/ER_nega


Processing: ER negative
Running MEME...


nsites = 266
Done initializing.
SEEDS: highwater mark: seq 265 pos 50

seqs=   266, min=  50, max=   50, total=    13300

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 266, iter=  40   



Processing: extracellular positive
Running MEME...


Writing results to output directory 'meme_results_short/extracellular_positive'.
Initializing the motif probability tables for 2 to 190 sites...
nsites = 190
Done initializing.
SEEDS: highwater mark: seq 189 pos 50

seqs=   190, min=  50, max=   50, total=     9500

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 190, iter=  40   
Writing results to output directory 'meme_results_sh


Processing: extracellular negative
Running MEME...


Initializing the motif probability tables for 2 to 45 sites...
nsites = 45
Done initializing.
SEEDS: highwater mark: seq 44 pos 50

seqs=    45, min=  50, max=   50, total=     2250

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites=  45, iter=  40  
Writing results to output directory 'meme_results_short/membrane_positive'.
Initializing the motif probability tables for 2 to 764 sites...
nsites


Processing: membrane positive
Running MEME...


nsites = 764
Done initializing.
SEEDS: highwater mark: seq 763 pos 50

seqs=   764, min=  50, max=   50, total=    38200

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 764, iter=  40   



Processing: membrane negative
Running MEME...


Writing results to output directory 'meme_results_short/membrane_negative'.
Initializing the motif probability tables for 2 to 710 sites...
nsites = 710
Done initializing.
SEEDS: highwater mark: seq 709 pos 50

seqs=   710, min=  50, max=   50, total=    35500

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 710, iter=  40   



Processing: mitochondrion positive
Running MEME...


Writing results to output directory 'meme_results_short/mitochondrion_positive'.
Initializing the motif probability tables for 2 to 587 sites...
nsites = 587
Done initializing.
SEEDS: highwater mark: seq 586 pos 50

seqs=   587, min=  50, max=   50, total=    29350

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 587, iter=  40   



Processing: mitochondrion negative
Running MEME...


Writing results to output directory 'meme_results_short/mitochondrion_negative'.
Initializing the motif probability tables for 2 to 443 sites...
nsites = 443
Done initializing.
SEEDS: highwater mark: seq 442 pos 50

seqs=   443, min=  50, max=   50, total=    22150

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 443, iter=  40   



Processing: nucleolus positive
Running MEME...


Writing results to output directory 'meme_results_short/nucleolus_positive'.
Initializing the motif probability tables for 2 to 856 sites...
nsites = 856
Done initializing.
SEEDS: highwater mark: seq 855 pos 50

seqs=   856, min=  50, max=   50, total=    42800

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 856, iter=  40   



Processing: nucleolus negative
Running MEME...


Writing results to output directory 'meme_results_short/nucleolus_negative'.
Initializing the motif probability tables for 2 to 698 sites...
nsites = 698
Done initializing.
SEEDS: highwater mark: seq 697 pos 50

seqs=   698, min=  50, max=   50, total=    34900

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 698, iter=  40   



Processing: nucleoplasm positive
Running MEME...


Writing results to output directory 'meme_results_short/nucleoplasm_positive'.
Initializing the motif probability tables for 2 to 839 sites...
nsites = 839
Done initializing.
SEEDS: highwater mark: seq 838 pos 50

seqs=   839, min=  50, max=   50, total=    41950

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 839, iter=  40   



Processing: nucleoplasm negative
Running MEME...


Writing results to output directory 'meme_results_short/nucleoplasm_negative'.
Initializing the motif probability tables for 2 to 697 sites...
nsites = 697
Done initializing.
SEEDS: highwater mark: seq 696 pos 50

seqs=   697, min=  50, max=   50, total=    34850

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 697, iter=  40   



Processing: nucleus positive
Running MEME...


Writing results to output directory 'meme_results_short/nucleus_positive'.
Initializing the motif probability tables for 2 to 437 sites...
nsites = 437
Done initializing.
SEEDS: highwater mark: seq 436 pos 50

seqs=   437, min=  50, max=   50, total=    21850

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 437, iter=  40   



Processing: nucleus negative
Running MEME...


Writing results to output directory 'meme_results_short/nucleus_negative'.
Initializing the motif probability tables for 2 to 446 sites...
nsites = 446
Done initializing.
SEEDS: highwater mark: seq 445 pos 50

seqs=   446, min=  50, max=   50, total=    22300

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 446, iter=  40   



Processing: ribosome positive
Running MEME...


Writing results to output directory 'meme_results_short/ribosome_positive'.
Initializing the motif probability tables for 2 to 642 sites...
nsites = 642
Done initializing.
SEEDS: highwater mark: seq 641 pos 50

seqs=   642, min=  50, max=   50, total=    32100

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 642, iter=  40   



Processing: ribosome negative
Running MEME...


Writing results to output directory 'meme_results_short/ribosome_negative'.
Initializing the motif probability tables for 2 to 377 sites...
nsites = 377
Done initializing.
SEEDS: highwater mark: seq 376 pos 50

seqs=   377, min=  50, max=   50, total=    18850

motif=1
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=2
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=3
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=4
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=5
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=6
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=7
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=8
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=9
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   
motif=10
SEED WIDTHS: 4 6 8 10
em: w=  10, psites= 377, iter=  40   



 Motifs searching complete!


### Longer

In [32]:
# Long-motif pass: same inputs as the short pass, but motif widths 10-30 and
# results written under meme_output_dir_long.
# NOTE(review): the output saved beneath this cell reports
# 'meme_results_short/...' as the output directory, which does not match
# meme_output_dir_long used here - it appears to predate this version of the
# cell; re-run to refresh it.
long_width_args = ["-minw", "10", "-maxw", "30"]

for class_name in label_names:
    for bucket in ('positive', 'negative'):
        fasta_file = f"{input_dir}/{class_name}_{bucket}_motifs.fasta"

        if os.path.exists(fasta_file):
            print(f"\nProcessing: {class_name} {bucket}")

            meme_out = f"{meme_output_dir_long}/{class_name}_{bucket}"
            # "-oc" lets MEME reuse (and overwrite) an existing output directory;
            # "-mod zoops" with "-nmotifs 10" asks for up to ten ZOOPS-model motifs.
            meme_cmd = [
                meme_path, fasta_file,
                "-dna",
                "-oc", meme_out,
                "-mod", "zoops",
                "-nmotifs", "10",
                *long_width_args,
            ]

            print("Running MEME...")
            # check=True: a non-zero MEME exit raises CalledProcessError and
            # stops the remaining runs.
            subprocess.run(meme_cmd, check=True)
        else:
            # Missing inputs are reported and skipped rather than treated as errors.
            print(f"Skipping {fasta_file} (not found)")

print("\n Motifs searching complete!")



Processing: chromatin positive
Running MEME...


The output directory 'meme_results_short/chromatin_positive' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 849 sites...
nsites = 849
Done initializing.
SEEDS: highwater mark: seq 848 pos 50

seqs=   849, min=  50, max=   50, total=    42450

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 849, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30



Processing: chromatin negative
Running MEME...


The output directory 'meme_results_short/chromatin_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 828 sites...
nsites = 828
Done initializing.
SEEDS: highwater mark: seq 827 pos 50

seqs=   828, min=  50, max=   50, total=    41400

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 828, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30



Processing: cytoplasm positive
Running MEME...


The output directory 'meme_results_short/cytoplasm_positive' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 697 sites...
nsites = 697
Done initializing.
SEEDS: highwater mark: seq 696 pos 50

seqs=   697, min=  50, max=   50, total=    34850

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 697, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30



Processing: cytoplasm negative
Running MEME...


The output directory 'meme_results_short/cytoplasm_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 533 sites...
nsites = 533
Done initializing.
SEEDS: highwater mark: seq 532 pos 50

seqs=   533, min=  50, max=   50, total=    26650

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 533, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30



Processing: cytosol positive
Running MEME...


The output directory 'meme_results_short/cytosol_positive' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 515 sites...
nsites = 515
Done initializing.
SEEDS: highwater mark: seq 514 pos 50

seqs=   515, min=  50, max=   50, total=    25750

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 515, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30
em


Processing: cytosol negative
Running MEME...


The output directory 'meme_results_short/cytosol_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 353 sites...
nsites = 353
Done initializing.
SEEDS: highwater mark: seq 352 pos 50

seqs=   353, min=  50, max=   50, total=    17650

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 353, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30
em


Processing: ER positive
Running MEME...


The output directory 'meme_results_short/ER_positive' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 139 sites...
nsites = 139
Done initializing.
SEEDS: highwater mark: seq 138 pos 50

seqs=   139, min=  50, max=   50, total=     6950

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 139, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30
em: w= 


Processing: ER negative
Running MEME...


The output directory 'meme_results_short/ER_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 266 sites...
nsites = 266
Done initializing.
SEEDS: highwater mark: seq 265 pos 50

seqs=   266, min=  50, max=   50, total=    13300

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 266, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30
em: w= 


Processing: extracellular positive
Running MEME...


The output directory 'meme_results_short/extracellular_positive' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 190 sites...
nsites = 190
Done initializing.
SEEDS: highwater mark: seq 189 pos 50

seqs=   190, min=  50, max=   50, total=     9500

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 190, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26


Processing: extracellular negative
Running MEME...


The output directory 'meme_results_short/extracellular_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 45 sites...
nsites = 45
Done initializing.
SEEDS: highwater mark: seq 44 pos 50

seqs=    45, min=  50, max=   50, total=     2250

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  40  
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  40  
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  20  
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  40  
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  20  
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  20  
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  20  
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  20  
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites=  45, iter=  10  
motif=10
SEED WIDTHS: 10 14 19 26 30
em: w=  


Processing: membrane positive
Running MEME...


nsites = 764
Done initializing.
SEEDS: highwater mark: seq 763 pos 50

seqs=   764, min=  50, max=   50, total=    38200

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 764, iter=  40   



Processing: membrane negative
Running MEME...


The output directory 'meme_results_short/membrane_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 710 sites...
nsites = 710
Done initializing.
SEEDS: highwater mark: seq 709 pos 50

seqs=   710, min=  50, max=   50, total=    35500

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 710, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
IOPub message rate exceeded.         
The Jupyter server will temporarily st


Processing: ribosome negative
Running MEME...


The output directory 'meme_results_short/ribosome_negative' already exists.
Its contents will be overwritten.
Initializing the motif probability tables for 2 to 377 sites...
nsites = 377
Done initializing.
SEEDS: highwater mark: seq 376 pos 50

seqs=   377, min=  50, max=   50, total=    18850

motif=1
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=2
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=3
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=4
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=5
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=6
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=7
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=8
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=9
SEED WIDTHS: 10 14 19 26 30
em: w=  30, psites= 377, iter=  40   
motif=10
SEED WIDTHS: 10 14 19 26 30
e


 Motifs searching complete!


## TOMTOM

### shorter

In [3]:
# Compare every MEME result of the short run against each configured motif
# database with TOMTOM. Results land in
# <tomtom_output_dir_short>/<database>/<class>_<bucket>/.
for class_name, bucket in [
    (name, side) for name in label_names for side in ('positive', 'negative')
]:
    meme_results = f"{meme_output_dir_short}/{class_name}_{bucket}/meme.txt"

    # Only class/bucket pairs that actually have a MEME result are processed.
    if os.path.exists(meme_results):
        print(f" TOMTOM: {class_name} {bucket}")

        for db_name, db_path in databases.items():
            # Database paths are configured with a leading "~".
            db_path_expanded = os.path.expanduser(db_path)

            if os.path.exists(db_path_expanded):
                tomtom_out = f"{tomtom_output_dir_short}/{db_name}/{class_name}_{bucket}"

                # Loose cutoff on purpose: the filtering step applies the
                # real significance thresholds afterwards.
                tomtom_cmd = [
                    tomtom_path,
                    "-oc", tomtom_out,
                    "-thresh", "1.0",
                    "-evalue",
                    "-no-ssc",
                    meme_results,
                    db_path_expanded,
                ]

                try:
                    subprocess.run(tomtom_cmd, check=True, capture_output=True)
                except subprocess.CalledProcessError as err:
                    # Report the start of stderr and move on to the next database.
                    print(f"  Error in {db_name}: {err.stderr.decode()[:100]}")
            else:
                print(f"  Database not found: {db_name}")

print("\n TOMTOM complete")


Running TOMTOM on all MEME results...

 TOMTOM: chromatin positive
 TOMTOM: chromatin negative
 TOMTOM: cytoplasm positive
 TOMTOM: cytoplasm negative
 TOMTOM: cytosol positive
 TOMTOM: cytosol negative
 TOMTOM: ER positive
 TOMTOM: ER negative
 TOMTOM: extracellular positive
 TOMTOM: extracellular negative
 TOMTOM: membrane positive
 TOMTOM: membrane negative
 TOMTOM: mitochondrion positive
 TOMTOM: mitochondrion negative
 TOMTOM: nucleolus positive
 TOMTOM: nucleolus negative
 TOMTOM: nucleoplasm positive
 TOMTOM: nucleoplasm negative
 TOMTOM: nucleus positive
 TOMTOM: nucleus negative
 TOMTOM: ribosome positive
 TOMTOM: ribosome negative

 TOMTOM complete


### longer

In [2]:
# Compare every MEME result of the long run against each configured motif
# database with TOMTOM. Results land in
# <tomtom_output_dir_long>/<database>/<class>_<bucket>/.
for class_name, bucket in [
    (name, side) for name in label_names for side in ('positive', 'negative')
]:
    meme_results = f"{meme_output_dir_long}/{class_name}_{bucket}/meme.txt"

    # Only class/bucket pairs that actually have a MEME result are processed.
    if os.path.exists(meme_results):
        print(f" TOMTOM: {class_name} {bucket}")

        for db_name, db_path in databases.items():
            # Database paths are configured with a leading "~".
            db_path_expanded = os.path.expanduser(db_path)

            if os.path.exists(db_path_expanded):
                tomtom_out = f"{tomtom_output_dir_long}/{db_name}/{class_name}_{bucket}"

                # Loose cutoff on purpose: the filtering step applies the
                # real significance thresholds afterwards.
                tomtom_cmd = [
                    tomtom_path,
                    "-oc", tomtom_out,
                    "-thresh", "1.0",
                    "-evalue",
                    "-no-ssc",
                    meme_results,
                    db_path_expanded,
                ]

                try:
                    subprocess.run(tomtom_cmd, check=True, capture_output=True)
                except subprocess.CalledProcessError as err:
                    # Report the start of stderr and move on to the next database.
                    print(f"  Error in {db_name}: {err.stderr.decode()[:100]}")
            else:
                print(f"  Database not found: {db_name}")

print("\n TOMTOM complete")


Running TOMTOM on all MEME results...



NameError: name 'label_names' is not defined

## Filtering

### shorter

In [68]:
# Filter the TOMTOM matches of the short run and export the survivors to CSV.
# A match is kept when it is on the '+' orientation, its query motif is not
# motif '1', the motif's site count covers at least MIN_SITE_PERCENT of the
# input sequences, and at least two of the E-/p-/q-value thresholds are met.
filtered_results = []

# Column order of the exported table. Passing it explicitly to the DataFrame
# keeps the CSV header present even when no match survives the filters.
# NOTE: 'Abandance' (sic) is left unchanged because it is the column name
# already written to the exported CSV.
result_columns = [
    'Class', 'Bucket', 'Database', 'Query_motif', 'Target',
    'Abandance', 'E-value', 'p-value', 'q-value', 'Overlap',
]

for class_name in label_names:
    for bucket in ['positive', 'negative']:

        # Setup Input Paths
        fasta_file = f"{input_dir}/{class_name}_{bucket}_motifs.fasta"
        meme_file = f"{meme_output_dir_short}/{class_name}_{bucket}/meme.txt"

        if not os.path.exists(fasta_file) or not os.path.exists(meme_file):
            continue

        # Get Sequence Counts
        total_seqs = count_fasta_sequences(fasta_file)
        if total_seqs == 0:
            # An empty FASTA would make the site ratio below divide by zero.
            print(f"Skipping {class_name} {bucket}: no sequences in {fasta_file}")
            continue

        # Get MEME Sites: pair each "MOTIF <id>" token with the first
        # "sites = <n>" that follows it in the MEME text report.
        with open(meme_file, 'r') as f:
            content = f.read()
        matches = re.findall(r'MOTIF\s+(\S+).*?sites\s*=\s*(\d+)', content, re.DOTALL)
        # Keep purely numeric motif IDs only; anything else is a stray match.
        motif_sites = {
            m_id: int(m_sites) for m_id, m_sites in matches if m_id.isdigit()
        }

        for db_name in databases.keys():
            txt_file = f"{tomtom_output_dir_short}/{db_name}/{class_name}_{bucket}/tomtom.txt"

            if not os.path.exists(txt_file):
                continue

            try:
                # Read without comment='#' so to keep the header row
                df = pd.read_csv(txt_file, sep='\t')

                # Sanitize column names: drop '#', trim, spaces -> underscores
                df.columns = df.columns.str.replace('#', '', regex=False).str.strip().str.replace(' ', '_')

            except Exception as e:
                print(f"Error reading {txt_file}: {e}")
                continue

            if df.empty or 'Query_ID' not in df.columns:
                continue

            # Filter by rows
            for _, row in df.iterrows():

                # Filter 0: Direction (drop matches on the '-' orientation)
                direction = row.get('Orientation', '')
                if direction == '-':
                    continue

                # Filter 1: Site Count (motifs unknown to the MEME parse count as 0 sites)
                query_motif = str(row['Query_ID'])
                sites = motif_sites.get(query_motif, 0)
                ratio = sites / total_seqs
                if ratio < MIN_SITE_PERCENT:
                    continue

                # Filter 1.5: The first hit is junk
                if query_motif == '1':
                    continue

                # Filter 2: Statistical Thresholds
                # (Use .get with defaults to prevent errors on missing cols)
                try:
                    e_val = float(row.get('E-value', 1.0))
                    p_val = float(row.get('p-value', 1.0))
                    q_val = float(row.get('q-value', 1.0))
                except (TypeError, ValueError):
                    # Non-numeric statistics: the row cannot be evaluated.
                    continue

                # Count how many of the three thresholds the match satisfies.
                passes = sum([
                    e_val < E_VALUE_THRESH,
                    p_val < P_VALUE_THRESH,
                    q_val < Q_VALUE_THRESH
                ])

                if passes >= 2:
                    filtered_results.append({
                        'Class': class_name,
                        'Bucket': bucket,
                        'Database': db_name,
                        'Query_motif': query_motif,
                        'Target': row.get('Target_ID', ''),
                        'Abandance': f'{ratio*100:.2f}%',
                        'E-value': e_val,
                        'p-value': p_val,
                        'q-value': q_val,
                        'Overlap': row.get('Overlap', '')
                    })

# Save results
df_filtered = pd.DataFrame(filtered_results, columns=result_columns)
output_file = "tomtom_shorter_filtered.csv"
df_filtered.to_csv(output_file, index=False)

print("\nFILTERING SUMMARY")
print(f"Total matches passing filters: {len(df_filtered)}")

if not df_filtered.empty:
    print("\nTop 10 matches:")
    print(df_filtered.head(10).to_string())


FILTERING RESULTS

FILTERING SUMMARY
Total matches passing filters: 164

Top 10 matches:
       Class    Bucket   Database Query_motif           Target Abandance   E-value   p-value   q-value  Overlap
0  chromatin  positive  CISBP_RNA           9         M157_0.6     2.47%  0.008060  0.000087  0.011393        7
1  chromatin  positive  CISBP_RNA           9         M049_0.6     2.47%  0.023743  0.000255  0.015481        7
2  chromatin  positive  CISBP_RNA           9         M004_0.6     2.47%  0.039235  0.000422  0.019186        7
3  chromatin  positive        RNA           9      RNCMPT00051     2.47%  0.000963  0.000241  0.001927        7
4  chromatin  positive    miRBase           9  mmu-miR-669c-3p     2.47%  0.003310  0.000002  0.003004       10
5  chromatin  positive    miRBase           9  mmu-miR-466i-3p     2.47%  0.049066  0.000025  0.009733       10
6  chromatin  positive    miRBase           9   mmu-miR-672-3p     2.47%  0.050416  0.000025  0.009733        9
7  chromatin  

### longer

In [67]:
# Filter the TOMTOM matches of the long run and export the survivors to CSV.
# A match is kept when it is on the '+' orientation, its query motif is not
# motif '1', the motif's site count covers at least MIN_SITE_PERCENT of the
# input sequences, and at least two of the E-/p-/q-value thresholds are met.
filtered_results = []

# Column order of the exported table. Passing it explicitly to the DataFrame
# keeps the CSV header present even when no match survives the filters.
# NOTE: 'Abandance' (sic) is left unchanged because it is the column name
# already written to the exported CSV.
result_columns = [
    'Class', 'Bucket', 'Database', 'Query_motif', 'Target',
    'Abandance', 'E-value', 'p-value', 'q-value', 'Overlap',
]

for class_name in label_names:
    for bucket in ['positive', 'negative']:

        # Setup Input Paths
        fasta_file = f"{input_dir}/{class_name}_{bucket}_motifs.fasta"
        meme_file = f"{meme_output_dir_long}/{class_name}_{bucket}/meme.txt"

        if not os.path.exists(fasta_file) or not os.path.exists(meme_file):
            continue

        # Get Sequence Counts
        total_seqs = count_fasta_sequences(fasta_file)
        if total_seqs == 0:
            # An empty FASTA would make the site ratio below divide by zero.
            print(f"Skipping {class_name} {bucket}: no sequences in {fasta_file}")
            continue

        # Get MEME Sites (Regex Parser): pair each "MOTIF <id>" token with the
        # first "sites = <n>" that follows it in the MEME text report.
        with open(meme_file, 'r') as f:
            content = f.read()
        matches = re.findall(r'MOTIF\s+(\S+).*?sites\s*=\s*(\d+)', content, re.DOTALL)
        # Keep purely numeric motif IDs only; anything else is a stray match.
        motif_sites = {
            m_id: int(m_sites) for m_id, m_sites in matches if m_id.isdigit()
        }

        for db_name in databases.keys():
            txt_file = f"{tomtom_output_dir_long}/{db_name}/{class_name}_{bucket}/tomtom.txt"

            if not os.path.exists(txt_file):
                continue

            try:
                # Read WITHOUT comment='#' so the header row is kept
                df = pd.read_csv(txt_file, sep='\t')

                # Sanitize column names: drop '#', trim, spaces -> underscores
                df.columns = df.columns.str.replace('#', '', regex=False).str.strip().str.replace(' ', '_')

            except Exception as e:
                print(f"Error reading {txt_file}: {e}")
                continue

            if df.empty or 'Query_ID' not in df.columns:
                continue

            # Filter by rows
            for _, row in df.iterrows():

                # Filter 0: Direction (drop matches on the '-' orientation)
                direction = row.get('Orientation', '')
                if direction == '-':
                    continue

                # Filter 1: Site Count (motifs unknown to the MEME parse count as 0 sites)
                query_motif = str(row['Query_ID'])
                sites = motif_sites.get(query_motif, 0)
                ratio = sites / total_seqs
                if ratio < MIN_SITE_PERCENT:
                    continue

                # Filter 1.5: The first hit is junk
                if query_motif == '1':
                    continue

                # Filter 2: Statistical Thresholds
                # (Use .get with defaults to prevent errors on missing cols)
                try:
                    e_val = float(row.get('E-value', 1.0))
                    p_val = float(row.get('p-value', 1.0))
                    q_val = float(row.get('q-value', 1.0))
                except (TypeError, ValueError):
                    # Non-numeric statistics: the row cannot be evaluated.
                    continue

                # Count how many of the three thresholds the match satisfies.
                passes = sum([
                    e_val < E_VALUE_THRESH,
                    p_val < P_VALUE_THRESH,
                    q_val < Q_VALUE_THRESH
                ])

                if passes >= 2:
                    filtered_results.append({
                        'Class': class_name,
                        'Bucket': bucket,
                        'Database': db_name,
                        'Query_motif': query_motif,
                        'Target': row.get('Target_ID', ''),
                        'Abandance': f'{ratio*100:.2f}%',
                        'E-value': e_val,
                        'p-value': p_val,
                        'q-value': q_val,
                        'Overlap': row.get('Overlap', '')
                    })

# Save results
df_filtered = pd.DataFrame(filtered_results, columns=result_columns)
output_file = "tomtom_longer_filtered.csv"
df_filtered.to_csv(output_file, index=False)

print("\nFILTERING SUMMARY")
print(f"Total matches passing filters: {len(df_filtered)}")

if not df_filtered.empty:
    print("\nTop 10 matches:")
    print(df_filtered.head(10).to_string())


FILTERING RESULTS

FILTERING SUMMARY
Total matches passing filters: 70

Top 10 matches:
       Class    Bucket   Database Query_motif           Target Abandance   E-value   p-value   q-value  Overlap
0  chromatin  positive  CISBP_RNA           2         M044_0.6    12.84%  0.023755  0.000255  0.031096        7
1  chromatin  positive  CISBP_RNA           3         M043_0.6     2.36%  0.028723  0.000309  0.028063        7
2  chromatin  positive        RNA           3      RNCMPT00239     2.36%  0.044184  0.011046  0.088369        8
3  chromatin  negative  CISBP_RNA           3         M070_0.6     5.80%  0.043306  0.000466  0.084886        8
4  chromatin  negative    miRBase           9  mmu-miR-7222-3p     2.17%  0.046155  0.000023  0.056712       21
5  chromatin  negative    miRBase           9  mmu-miR-1933-3p     2.17%  0.056932  0.000029  0.056712       20
6  chromatin  negative    miRBase           9  mmu-miR-6900-5p     2.17%  0.129604  0.000066  0.086069       22
7  cytoplasm  p