In [1]:
import pandas as pd
from weblogo import *

### Configuration

In [2]:
sea = "sea"  # change to specific directory of MEME/bin/sea
df_name = "Mix_clustered.csv"  # input, be careful for multiindex
motif_col = "('INFO', 'motif_F10')"
cluster_col = "Cluster"
meme_fn = "mixed.meme"
fasta_fn = "mixed.fa"
sea_out = "sea_out"

### Run

In [3]:
df = pd.read_csv(df_name, index_col=[0,1,2], header=[0])
# df = pd.read_csv(df_name, index_col=[0,1,2], header=[0, 1]) # for dual header

In [4]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,"('INFO', 'motif_F10')",Human,Mouse,X. laevis,X. tropicalis,Zebrafish,Fly,X,Y,Cluster
10@100177160@-,ENSG00000107521,HPS1,CTAGACCACCCTCCATGTCAG,1.0,,,,,,9.848524,8.074862,1
10@100177994@-,ENSG00000107521,HPS1,TCCAGATGATCGAGGTGCCCG,3.0,,,,,,3.548068,1.524192,3
10@100183394@-,ENSG00000107521,HPS1,GGACTTCTTGCTGGTGAAGAG,3.0,,,,,,8.139991,2.309283,3
10@100183566@-,ENSG00000107521,HPS1,CAGCCCCCAGCAGGGGAGGCC,3.0,,,,,,7.001106,1.070967,3
10@100184075@-,ENSG00000107521,HPS1,CCGGATCCTCCTGGGAGTGAG,3.0,,,,,,6.508685,0.452521,3
...,...,...,...,...,...,...,...,...,...,...,...,...
XHet@68785@-,FBgn0039945,CG17159,AACGAGAGTCCGGCTGAAAAG,,,,,,2.0,7.691403,3.281687,3
XHet@68786@-,FBgn0039945,CG17159,TAACGAGAGTCCGGCTGAAAA,,,,,,2.0,6.476910,2.660218,3
XHet@77281@+,FBgn0003559,su(f),TCACACCTATCGTGGGCATCG,,,,,,2.0,4.203569,0.420474,3
YHet@328017@-,FBgn0085792,CR41509,GTTGAGGAAGCACTTGAAGAA,,,,,,2.0,7.310891,3.658054,3


In [5]:
def extact_all_fasta(df_in, column, fn_out, rna=True):
    N = 0
    with open(fn_out, "w") as output:
        for idx, row in df_in.iterrows():
            if rna == True:
                output.write(">{}\n{}\n".format(N, row[column].replace("T", "U")))
            else:
                output.write(">{}\n{}\n".format(N, row[column]))
            N += 1        

In [6]:
extact_all_fasta(df, motif_col, fasta_fn)

In [7]:
def generate_meme_file(df_in, id_column, motif_column, fn_out, rna=True):
    all_ids = set(df_in[id_column].tolist())
    temp_mat_name = fn_out+".temp.mat"
    print(all_ids)
    with open(temp_mat_name, "w") as output:
        for ID in all_ids:
            print(ID)
            subdf = df_in[df_in[id_column]==ID]
            count_data = {}
            for _, row in subdf.iterrows():
                if "N" in row[motif_column]:
                    continue
                if rna == True:
                    iterseq = row[motif_column].replace("T", "U")
                else:
                    iterseq = row[motif_column]
                for idx, base in enumerate(list(iterseq)):
                    if idx not in count_data:
                        count_data[idx] = {"A":0, "C": 0, "G": 0, "U":0}
                    count_data[idx][base] += 1
            count_df = pd.DataFrame.from_dict(count_data).T
            seqs = count_df.values
            seqs = np.array(seqs)
            logodata = LogoData.from_counts(counts=seqs, alphabet='ACGU')

            temp = []
            for i in range(logodata.counts.shape[0]):
                # temp.extend(list(logodata.entropy[i]*logodata.counts[i]/logodata.counts[i].sum()))
                output.write("{}\t{}\t{}\t{}\n".format(logodata.counts[i][0], logodata.counts[i][1], logodata.counts[i][2], logodata.counts[i][3]))
            output.write("\n")
        
    !matrix2meme -rna < $temp_mat_name > $fn_out

In [8]:
generate_meme_file(df, cluster_col, motif_col, meme_fn)

{1, 2, 3}
1
2
3


In [9]:
!$sea -oc $sea_out -p $fasta_fn -m $meme_fn

# Checking alphabets in 1 motif files.
# Loading motifs from file 'mixed.meme'
# Alphabet: RNA
# NOTE: Will convert any DNA sequences to RNA.
# Positive sequences "mixed.fa" - training: 130630 hold-out: 14514
# Negative sequences are shuffled primary sequences (2-order) - training: 130630 hold-out: 14514
# Estimating background model from control sequences.
# Background: A 0.239 C 0.274 G 0.294 U 0.192
# Background order: 2 Background size: 84
# Using Fisher Exact test for p-values.
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 3. Using pi_zero = 1.0.
# Freeing storage...


### It is better to read the html.

In [10]:
df_sea = pd.read_csv("./{}/sea.tsv".format(sea_out), header=0, sep="\t")

In [11]:
df_sea

Unnamed: 0,RANK,DB,ID,ALT_ID,CONSENSUS,TP,TP%,FP,FP%,ENR_RATIO,SCORE_THR,PVALUE,LOG_PVALUE,EVALUE,LOG_EVALUE,QVALUE,LOG_QVALUE
0,1,mixed.meme,3.0,NNNNNNNNNNCVRRDNNNNNN,NNNNNNNNNNCRGGRNNNNNN,99431.0,76.12,26674.0,20.42,3.73,0.00079,0.0,-43008.41,0.0,-43007.31,0.0,-43007.31
1,2,mixed.meme,1.0,NNNNNNNNNHCUCCANNNNNN,NNNNNNNNNHCUCCANNNNNN,15117.0,11.57,2300.0,1.76,6.57,4.7,0.0,-5616.67,0.0,-5615.57,0.0,-5615.57
2,3,mixed.meme,2.0,NNNNNKUDGCCAMWUGNNNNN,NNNNDKUWGCCAAAUGNNNNN,7871.0,6.03,2653.0,2.03,2.97,0.022,0.0,-1411.22,0.0,-1410.12,0.0,-1410.12
3,# SEA (Simple Enrichment Analysis): Version 5....,,,,,,,,,,,,,,,,
4,# The format of this file is described at http...,,,,,,,,,,,,,,,,
5,# sea -oc sea_out -p mixed.fa -m mixed.meme,,,,,,,,,,,,,,,,
