In [1]:
import pandas as pd
from weblogo import *

### Configuration

In [2]:
sea = "sea"  # change to specific directory of MEME/bin/sea
df_name = "variant_C.csv"  # input, be careful for multiindex
motif_col = "motif_F10"
cluster_col = "Cluster"
meme_fn = "variant.meme"
fasta_fn = "variant.fa"
sea_out = "sea_out"

### Run

In [3]:
df = pd.read_csv(df_name, index_col=None, header=[0])
# df = pd.read_csv(df_name, index_col=[0,1,2], header=[0, 1]) # for dual header

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Tan et al,motif_F10,base,num_of_sample,known_mod,X,Y,Cluster
0,chr2@232325441@-,True,ACGACGACGACGAAGATGATG,C,724,,4.859695,9.978953,12
1,chr16@85689994@+,True,AGCGCGAGCGCGAGCGCGAGC,C,368,,4.490503,10.093743,12
2,chr19@4035911@+,True,CACACCGTCTCACACACACAC,C,327,,6.648704,7.513982,2
3,chr8@131064998@-,True,TGGTTTTTTTCAAAAAAAAAA,C,170,,4.291223,5.259356,4
4,chr10@102285413@-,True,CTAACACACACACACACACAC,C,157,,6.779656,7.399220,2
...,...,...,...,...,...,...,...,...,...
1559,chr17@17039565@+,True,CTGTTACCAGCAGCAGCAGCA,C,6,,2.246306,11.219783,8
1560,chr17@74035785@-,True,AGGAGCTAACCAGGCTCTTCC,C,6,,4.446080,9.728505,12
1561,tRNA-Thr-UGU@1@+,True,GAGCACTGGTCTTGTAAACCA,C,-1,m3C,4.301119,7.549700,7
1562,tRNA-Arg-CCU@1@+,True,AGGCACTGGCCTCCTAAGCCA,C,-1,m3C,5.031169,8.001174,7


In [5]:
def extact_all_fasta(df_in, column, fn_out, rna=True):
    N = 0
    with open(fn_out, "w") as output:
        for idx, row in df_in.iterrows():
            if rna == True:
                output.write(">{}\n{}\n".format(N, row[column].replace("T", "U")))
            else:
                output.write(">{}\n{}\n".format(N, row[column]))
            N += 1        

In [6]:
extact_all_fasta(df, motif_col, fasta_fn)

In [7]:
def generate_meme_file(df_in, id_column, motif_column, fn_out, rna=True):
    all_ids = set(df_in[id_column].tolist())
    temp_mat_name = fn_out+".temp.mat"
    print(all_ids)
    with open(temp_mat_name, "w") as output:
        for ID in all_ids:
            print(ID)
            subdf = df_in[df_in[id_column]==ID]
            count_data = {}
            for _, row in subdf.iterrows():
                if "N" in row[motif_column]:
                    continue
                if rna == True:
                    iterseq = row[motif_column].replace("T", "U")
                else:
                    iterseq = row[motif_column]
                for idx, base in enumerate(list(iterseq)):
                    if idx not in count_data:
                        count_data[idx] = {"A":0, "C": 0, "G": 0, "U":0}
                    count_data[idx][base] += 1
            count_df = pd.DataFrame.from_dict(count_data).T
            seqs = count_df.values
            seqs = np.array(seqs)
            logodata = LogoData.from_counts(counts=seqs, alphabet='ACGU')

            temp = []
            for i in range(logodata.counts.shape[0]):
                # temp.extend(list(logodata.entropy[i]*logodata.counts[i]/logodata.counts[i].sum()))
                output.write("{}\t{}\t{}\t{}\n".format(logodata.counts[i][0], logodata.counts[i][1], logodata.counts[i][2], logodata.counts[i][3]))
            output.write("\n")
        
    !matrix2meme -rna < $temp_mat_name > $fn_out

In [8]:
generate_meme_file(df, cluster_col, motif_col, meme_fn)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
1
2
3
4
5
6
7
8
9
10
11
12


In [9]:
!$sea -oc $sea_out -p $fasta_fn -m $meme_fn

# Checking alphabets in 1 motif files.
# Loading motifs from file 'variant.meme'
# Alphabet: RNA
# NOTE: Will convert any DNA sequences to RNA.
# Positive sequences "variant.fa" - training: 1408 hold-out: 156
# Negative sequences are shuffled primary sequences (2-order) - training: 1408 hold-out: 156
# Estimating background model from control sequences.
# Background: A 0.238 C 0.294 G 0.244 U 0.224
# Background order: 2 Background size: 84
# Using Fisher Exact test for p-values.
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 12. Using pi_zero = 1.0.
# Freeing storage...


### It is better to read the html.

In [10]:
df_sea = pd.read_csv("./{}/sea.tsv".format(sea_out), header=0, sep="\t")

In [11]:
df_sea

Unnamed: 0,RANK,DB,ID,ALT_ID,CONSENSUS,TP,TP%,FP,FP%,ENR_RATIO,SCORE_THR,PVALUE,LOG_PVALUE,EVALUE,LOG_EVALUE,QVALUE,LOG_QVALUE
0,1,variant.meme,12.0,NNNNNNNNNSCVNBNNVNNNN,NBNSBSVBSSCSVSSSSSBSS,609.0,43.25,259.0,18.39,2.35,0.17,2.78e-47,-107.2,3.34e-46,-104.71,3.34e-46,-104.71
1,2,variant.meme,7.0,NNNNNHNNNHCMNNNNNNNNN,HNNVDWNVHUCACNNNDBHNN,448.0,31.82,157.0,11.15,2.84,0.19,5.3499999999999997e-42,-95.03,6.42e-41,-92.55,3.21e-41,-93.24
2,3,variant.meme,6.0,VDDKGKGNDBCNGNKKKNGBG,GUGUGKGDGKCWGKGUGKGUG,194.0,13.78,51.0,3.62,3.75,0.026,7.550000000000001e-23,-50.94,9.06e-22,-48.45,3.0200000000000002e-22,-49.55
3,4,variant.meme,5.0,DMAAAAAAARCAAAWAADNDN,AAAAAAAAAACAAAAAAAAAW,193.0,13.71,52.0,3.69,3.66,0.37,3.1700000000000003e-22,-49.5,3.8e-21,-47.02,9.51e-22,-48.4
4,5,variant.meme,1.0,NDWWWUUUYUCUUUUUWHWWW,UUUUUUUUUUCUUUUUUUUUU,156.0,11.08,34.0,2.41,4.49,0.98,2.12e-21,-47.6,2.5399999999999998e-20,-45.12,5.08e-21,-46.73
5,6,variant.meme,3.0,DNHSDNDNNKCRYBDBBDNRN,KNWSWGUKWKCRUBUGBDKGK,188.0,13.35,60.0,4.26,3.1,1.1,3.08e-18,-40.32,3.7000000000000003e-17,-37.84,6.17e-18,-39.63
6,7,variant.meme,4.0,DHNNYNNNHWCAAAAAAAAAA,DWYUYHUKYUCAAAAAAAAAA,197.0,13.99,75.0,5.33,2.61,0.073,2.23e-15,-33.74,2.68e-14,-31.25,3.82e-15,-33.2
7,8,variant.meme,8.0,GSAGCWGSWGCWGSDGSNGSH,GCAGCAGCWGCAGCAGCWGCA,163.0,11.58,69.0,4.9,2.34,0.43,5.4e-11,-23.64,6.48e-10,-21.16,8.1e-11,-23.24
8,9,variant.meme,11.0,SVVSNSGBGGCRGSSGSGGVV,GSGGCGGSGGCGGCGGCGGSG,207.0,14.7,102.0,7.24,2.02,0.21,1.27e-10,-22.78,1.53e-09,-20.3,1.7e-10,-22.5
9,10,variant.meme,9.0,NSHNBHHBYHCHHBYHBYHBY,YCCHCMUCYYCCHCCHCCYYC,202.0,14.35,103.0,7.32,1.95,0.81,1.06e-09,-20.67,1.27e-08,-18.18,1.27e-09,-20.48
