In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read Motif_C1.csv from the working directory. Later cells join this table on
# its "Motif" column (presumably it also supplies the "C1" count column --
# TODO confirm against the file).
C1 = pd.read_csv(filepath_or_buffer="Motif_C1.csv")

In [3]:
# Read Motif_C2.csv from the working directory. Later cells join this table on
# its "Motif" column (presumably it also supplies the "C2" count column --
# TODO confirm against the file).
C2 = pd.read_csv(filepath_or_buffer="Motif_C2.csv")

In [4]:
# Inner join on "Motif": only motifs present in both input tables are kept.
Data = C1.merge(C2, how="inner", on="Motif")

In [5]:
# Keep rows where at least one of the two count columns is non-zero.
has_counts = Data["C1"].ne(0) | Data["C2"].ne(0)
Data = Data.loc[has_counts]

# Keep only rows whose "contains an underscore" test is exactly False.
# The explicit comparison (rather than `~`) is deliberate: any row for which
# the test yields something other than False is dropped as well.
no_underscore = Data["Motif"].str.contains('_').eq(False)
Data = Data.loc[no_underscore].reset_index(drop=True)

In [6]:
Data

Unnamed: 0,Motif,C1,C2
0,AAAAA,6195,26557
1,AAAAT,4334,21567
2,AAAAC,4530,16703
3,AAAAG,4675,23174
4,AAATA,2815,13335
...,...,...,...
1019,GGGCG,668,6470
1020,GGGGA,2762,14885
1021,GGGGT,866,6221
1022,GGGGC,1639,14195


In [7]:
from scipy.stats import fisher_exact

In [8]:
# Column totals are the same for every motif, so compute them once up front.
# They stay at top level because the proportion cell further down reads S1/S2,
# and defining them here (not inside the loop) keeps them bound even when
# Data has no rows.
S1 = Data["C1"].sum()
S2 = Data["C2"].sum()

# Smallest positive float64. A p-value that underflows to exactly 0.0 is
# stored as this value instead, so the column never contains a literal zero.
MIN_P = 5e-324

# One two-sided Fisher exact test per motif on the 2x2 table
#   [[count in C1, rest of C1],
#    [count in C2, rest of C2]].
# Rows are visited in positional order, so the result list lines up with Data.
p_values = []
for A1, B1 in zip(Data["C1"], Data["C2"]):
    _, p = fisher_exact([[A1, S1 - A1], [B1, S2 - B1]], alternative='two-sided')
    p_values.append(MIN_P if p == 0.0 else p)

# Assign the whole column at once instead of growing it one cell at a time.
Data["p"] = np.asarray(p_values, dtype=float)

In [9]:
Data

Unnamed: 0,Motif,C1,C2,p
0,AAAAA,6195,26557,1.108244e-113
1,AAAAT,4334,21567,9.896720e-27
2,AAAAC,4530,16703,1.421285e-165
3,AAAAG,4675,23174,8.201298e-30
4,AAATA,2815,13335,1.838399e-27
...,...,...,...,...
1019,GGGCG,668,6470,5.629701e-37
1020,GGGGA,2762,14885,1.134025e-06
1021,GGGGT,866,6221,1.776931e-07
1022,GGGGC,1639,14195,7.623362e-51


In [10]:
# Swap every exact 0 anywhere in the frame for 0.001 (presumably a pseudocount
# so the ratio computed later never divides by zero -- TODO confirm intent).
# Note: the totals S1/S2 were taken before this replacement.
Data = Data.replace({0: 0.001})

In [11]:
# Express each motif's count as a fraction of its column total. S1 and S2 are
# the C1 / C2 totals left in the kernel by the Fisher-test cell.
Data = Data.assign(
    **{
        "Per-C1": Data["C1"].div(S1),
        "Per-C2": Data["C2"].div(S2),
    }
)

In [12]:
Data

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2
0,AAAAA,6195,26557,1.108244e-113,0.002997,0.002153
1,AAAAT,4334,21567,9.896720e-27,0.002097,0.001749
2,AAAAC,4530,16703,1.421285e-165,0.002192,0.001354
3,AAAAG,4675,23174,8.201298e-30,0.002262,0.001879
4,AAATA,2815,13335,1.838399e-27,0.001362,0.001081
...,...,...,...,...,...,...
1019,GGGCG,668,6470,5.629701e-37,0.000323,0.000525
1020,GGGGA,2762,14885,1.134025e-06,0.001336,0.001207
1021,GGGGT,866,6221,1.776931e-07,0.000419,0.000504
1022,GGGGC,1639,14195,7.623362e-51,0.000793,0.001151


In [13]:
# Order motifs by C1 proportion, breaking ties by C2 proportion (both
# descending), then renumber the rows 0..n-1.
Data = (
    Data
    .sort_values(by=["Per-C1", "Per-C2"], ascending=[False, False])
    .reset_index(drop=True)
)

In [14]:
Data

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2
0,TGGAC,7986,18792,4.940656e-324,0.003864,0.001524
1,AAGAA,7979,44394,1.072102e-08,0.003860,0.003599
2,GGACA,7669,17663,4.940656e-324,0.003710,0.001432
3,CTGGA,7594,40525,9.692741e-19,0.003674,0.003286
4,GAAGA,7209,46799,1.664521e-11,0.003488,0.003794
...,...,...,...,...,...,...
1019,CGCGC,182,3083,1.118988e-57,0.000088,0.000250
1020,CGCGT,180,1917,1.164238e-15,0.000087,0.000155
1021,CGCGA,172,1996,5.779889e-20,0.000083,0.000162
1022,TAGCG,154,977,4.976728e-01,0.000075,0.000079


In [15]:
# "OR" holds the ratio of the two proportions (Per-C1 / Per-C2). It is not the
# odds ratio returned by fisher_exact, which this notebook does not store.
Data["OR"] = Data["Per-C1"].div(Data["Per-C2"])

In [16]:
Data

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2,OR
0,TGGAC,7986,18792,4.940656e-324,0.003864,0.001524,2.535836
1,AAGAA,7979,44394,1.072102e-08,0.003860,0.003599,1.072479
2,GGACA,7669,17663,4.940656e-324,0.003710,0.001432,2.590831
3,CTGGA,7594,40525,9.692741e-19,0.003674,0.003286,1.118182
4,GAAGA,7209,46799,1.664521e-11,0.003488,0.003794,0.919186
...,...,...,...,...,...,...,...
1019,CGCGC,182,3083,1.118988e-57,0.000088,0.000250,0.352259
1020,CGCGT,180,1917,1.164238e-15,0.000087,0.000155,0.560293
1021,CGCGA,172,1996,5.779889e-20,0.000083,0.000162,0.514201
1022,TAGCG,154,977,4.976728e-01,0.000075,0.000079,0.940570


In [17]:
# Rank motifs by ratio (largest first); among equal ratios the smaller
# p-value comes first. Rows are then renumbered 0..n-1.
Data = (
    Data
    .sort_values(by=["OR", "p"], ascending=[False, True])
    .reset_index(drop=True)
)

In [18]:
Data

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2,OR
0,GGACT,6468,9801,4.940656e-324,0.003129,0.000795,3.937897
1,GAACT,6316,11984,4.940656e-324,0.003056,0.000972,3.144887
2,AGACT,5351,10166,4.940656e-324,0.002589,0.000824,3.140868
3,GACTG,5739,11423,4.940656e-324,0.002777,0.000926,2.997925
4,GACTT,5249,10624,4.940656e-324,0.002540,0.000861,2.948175
...,...,...,...,...,...,...,...
1019,CGCGG,265,3893,4.699577e-59,0.000128,0.000316,0.406187
1020,GCGCG,196,2988,2.789277e-48,0.000095,0.000242,0.391418
1021,CGCCG,364,5659,1.029839e-92,0.000176,0.000459,0.383819
1022,CCGCG,271,4264,1.441219e-71,0.000131,0.000346,0.379242


In [19]:
# Pentamers of the form N-[A/G]-A-C-N: any base, then A or G, then "AC",
# then any base. That is 4 x 2 x 4 = 32 motifs, generated here rather than
# spelled out one comparison at a time.
RAC_MOTIFS = [
    first + second + "AC" + last
    for first in "ACGT"
    for second in "AG"
    for last in "ACGT"
]

# Select the rows of Data whose motif belongs to that set (row order and
# index labels are those of Data).
RAC = Data[Data["Motif"].isin(RAC_MOTIFS)]

In [20]:
# Renumber the selected rows 0..n-1; drop=True discards the index labels the
# rows carried over from Data instead of keeping them as a column.
RAC = RAC.reset_index(drop=True)

In [21]:
RAC

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2,OR
0,GGACT,6468,9801,5e-324,0.003129,0.000795,3.937897
1,GAACT,6316,11984,5e-324,0.003056,0.000972,3.144887
2,AGACT,5351,10166,5e-324,0.002589,0.000824,3.140868
3,GGACA,7669,17663,5e-324,0.00371,0.001432,2.590831
4,AAACT,4959,13643,5e-324,0.002399,0.001106,2.168947
5,TGACT,4106,11357,5e-324,0.001987,0.000921,2.157347
6,TAACT,1421,4452,1.770237e-88,0.000687,0.000361,1.9046
7,AGACA,5061,18016,1.0396560000000001e-209,0.002449,0.001461,1.676265
8,GAACA,4984,18406,6.558690000000001e-181,0.002411,0.001492,1.615784
9,TAACA,1768,6698,2.532265e-59,0.000855,0.000543,1.575077


In [22]:
# Tag every selected motif with the constant label "RAC".
RAC = RAC.assign(Type="RAC")

In [23]:
# Reduce RAC to a two-column lookup table: motif -> label.
RAC = RAC.loc[:, ["Motif", "Type"]]

In [24]:
# Left join: every row of Data is kept; "Type" is filled only for motifs that
# appear in the RAC lookup table and is missing for the rest.
Data = Data.merge(right=RAC, how="left", on="Motif")

In [25]:
Data

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2,OR,Type
0,GGACT,6468,9801,4.940656e-324,0.003129,0.000795,3.937897,RAC
1,GAACT,6316,11984,4.940656e-324,0.003056,0.000972,3.144887,RAC
2,AGACT,5351,10166,4.940656e-324,0.002589,0.000824,3.140868,RAC
3,GACTG,5739,11423,4.940656e-324,0.002777,0.000926,2.997925,
4,GACTT,5249,10624,4.940656e-324,0.002540,0.000861,2.948175,
...,...,...,...,...,...,...,...,...
1019,CGCGG,265,3893,4.699577e-59,0.000128,0.000316,0.406187,
1020,GCGCG,196,2988,2.789277e-48,0.000095,0.000242,0.391418,
1021,CGCCG,364,5659,1.029839e-92,0.000176,0.000459,0.383819,
1022,CCGCG,271,4264,1.441219e-71,0.000131,0.000346,0.379242,


In [26]:
# Label motifs that found no match in the RAC lookup as "Non-RAC".
# The fill is limited to the "Type" column: a frame-wide fillna would also
# write the string into any numeric column that happens to contain a missing
# value, silently turning that column into object dtype.
Data = Data.fillna(value={"Type": "Non-RAC"})

In [27]:
Data

Unnamed: 0,Motif,C1,C2,p,Per-C1,Per-C2,OR,Type
0,GGACT,6468,9801,4.940656e-324,0.003129,0.000795,3.937897,RAC
1,GAACT,6316,11984,4.940656e-324,0.003056,0.000972,3.144887,RAC
2,AGACT,5351,10166,4.940656e-324,0.002589,0.000824,3.140868,RAC
3,GACTG,5739,11423,4.940656e-324,0.002777,0.000926,2.997925,Non-RAC
4,GACTT,5249,10624,4.940656e-324,0.002540,0.000861,2.948175,Non-RAC
...,...,...,...,...,...,...,...,...
1019,CGCGG,265,3893,4.699577e-59,0.000128,0.000316,0.406187,Non-RAC
1020,GCGCG,196,2988,2.789277e-48,0.000095,0.000242,0.391418,Non-RAC
1021,CGCCG,364,5659,1.029839e-92,0.000176,0.000459,0.383819,Non-RAC
1022,CCGCG,271,4264,1.441219e-71,0.000131,0.000346,0.379242,Non-RAC


In [28]:
# Write the final table to the working directory. index=True is the pandas
# default and is spelled out here: the row index is written as the first,
# unnamed column of the file.
Data.to_csv("Human_Pentamer_InternalExon_End.csv", index=True)