## Downstream analysis for GLORI

Jianheng Liu (Fox) @ Jaffrey Lab, May 31st, 2023

Contact: jil4026@med.cornell.edu

In [1]:
import pandas as pd

To create symbolic links to the pileup files in the working directory:

```bash
ln -s ../test1_run/test1.pileups.txt
ln -s ../test2_run/test2.pileups.txt
```

In [2]:
# Peek at the first 20 lines of a pileup file; as the output shows, the file
# starts with '#'-prefixed summary lines (overall and per-gene statistics).
!head -20 test1.pileups.txt

#ALL	3076283	62418	0.020290070841986904
#Median	0.01607654
#Mean	0.01747423
#90%	0.02539503
#75%	0.01979920
#50%	0.01607654
#25%	0.01274198
#10%	0.01060022
#ELSE	255182	4996	0.019578183414190656
#ENSG00000000003	369	4	0.01084010840108401
#ENSG00000000419	77	1	0.012987012987012988
#ENSG00000000457	81	4	0.04938271604938271
#ENSG00000000460	104	0	0.0
#ENSG00000001036	314	5	0.01592356687898089
#ENSG00000001084	100	2	0.02
#ENSG00000001167	297	13	0.04377104377104377
#ENSG00000001460	78	0	0.0
#ENSG00000001461	243	1	0.00411522633744856
#ENSG00000001497	522	10	0.019157088122605363
#ENSG00000001561	42	4	0.09523809523809523


### Step 1, call all m6A sites from pileups

In [3]:
# SampleSheet: the input passed to m6A_caller.py via -i.
# One sample per line, tab-separated: sample name, pileup file, then two more
# fields ("gene" and 3 here).
# NOTE(review): the meaning of the last two fields is not shown in this
# notebook — check `python m6A_caller.py --help`.
!cat SampleSheet

HEK_1	test1.pileups.txt	gene	3
HEK_2	test2.pileups.txt	gene	3


In [3]:
# Call m6A candidate sites for the samples listed in SampleSheet and write the
# merged table to m6A.csv. Run the script with --help to see more options.
#   -i : the SampleSheet
#   -P : number of processors; should be <= the number of files to analyze
#   --NA discard : optional; use it if you don't want the non-annotated sites
#   -o : output CSV file (loaded in the following cell)

!python m6A_caller.py -i SampleSheet -P 2 --NA discard -o m6A.csv

[2023-05-31 14:41:29] CMD: m6A_caller.py -i SampleSheet -P 2 --NA discard -o m6A.csv
[2023-05-31 14:41:29] Running with 2 processors, pid [1657827]
[2023-05-31 14:41:29] Searching for m6A candidates...
[2023-05-31 14:41:36] HEK_1 called: 49
[2023-05-31 14:41:36] HEK_2 called: 46
[2023-05-31 14:41:36] Getting common sites...
[2023-05-31 14:41:40] Merge tables...
[2023-05-31 14:41:40] Finished successfully!


In [4]:
# Load the Step 1 table. The first two CSV rows form a two-level column header
# (sample name, then metric) and the first three columns form the row index
# (in the displayed output: site key, gene ID, gene name).
df1 = pd.read_csv(
    "m6A.csv",
    header=[0, 1],
    index_col=[0, 1, 2],
)
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HEK_1,HEK_1,HEK_1,HEK_1,HEK_1,HEK_1,HEK_2,HEK_2,HEK_2,HEK_2,HEK_2,HEK_2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,coverage,A count,m6A level,P-value,signal,nonCR,coverage,A count,m6A level,P-value,signal,nonCR
17@51171635@+,ENSG00000011052,NME1-NME2,23,3,0.130435,0.006952,1.000000,0.01719,22,0,0.000000,1.000000,1.0,0.01822
20@62387325@+,ENSG00000171858,RPS21,26,3,0.115385,0.009005,1.000000,0.01664,20,1,0.050000,0.203740,1.0,0.01133
9@19376584@-,ENSG00000137154,RPS6,25,3,0.120000,0.012979,1.000000,0.01985,23,1,0.043478,0.251889,1.0,0.01254
9@19378444@-,ENSG00000137154,RPS6,24,3,0.125000,0.011590,1.000000,0.01985,40,0,0.000000,1.000000,1.0,0.01254
9@19378495@-,ENSG00000137154,RPS6,44,5,0.113636,0.001757,1.000000,0.01985,55,4,0.072727,0.005069,1.0,0.01254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT@2237@+,ENSG00000210082,MT-RNR2,22,4,0.181818,0.001003,1.000000,0.02074,20,1,0.050000,0.356578,1.0,0.02181
MT@6411@+,ENSG00000198804,MT-CO1,35,4,0.114286,0.000982,0.972222,0.01266,36,0,0.000000,1.000000,1.0,0.01442
MT@6894@+,ENSG00000198804,MT-CO1,23,0,0.000000,1.000000,1.000000,0.01266,23,3,0.130435,0.004278,1.0,0.01442
MT@12062@+,ENSG00000198886,MT-ND4,29,1,0.034483,0.310462,1.000000,0.01274,28,4,0.142857,0.001200,1.0,0.01687


### Step 2, add TRUE/FALSE labels to the sites

In [5]:
# Label each site per sample; the output table gains a True/False "passed"
# column for every sample. Run the script with --help to see more options.
# Please make sure that the coverage and other filter settings used for this
# step are the same as the ones you used in Step 1.

!python evaluate_sites.py -i m6A.csv -o m6A.evaluated.csv

[2023-05-31 14:43:06] CMD: evaluate_sites.py -i m6A.csv -o m6A.evaluated.csv
[2023-05-31 14:43:06] Finished.


In [6]:
# Load the evaluated table written by Step 2. Layout matches the Step 1 table
# (two header rows, three index columns); each sample now also carries a
# "passed" column, as shown in the output below.
df2 = pd.read_csv(
    "m6A.evaluated.csv",
    header=[0, 1],
    index_col=[0, 1, 2],
)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HEK_1,HEK_1,HEK_1,HEK_1,HEK_1,HEK_1,HEK_1,HEK_2,HEK_2,HEK_2,HEK_2,HEK_2,HEK_2,HEK_2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,coverage,A count,m6A level,passed,P-value,signal,nonCR,coverage,A count,m6A level,passed,P-value,signal,nonCR
14@103519749@-,ENSG00000166165,CKB,34.0,13.0,0.382353,True,3.396559e-14,1.0,0.01936,37.0,13.0,0.351351,True,1.892448e-14,1.0,0.01668
7@5528622@-,ENSG00000075624,ACTB,22.0,11.0,0.5,True,8.728996e-13,0.956522,0.02408,19.0,6.0,0.315789,False,9.600071e-07,1.0,0.01877
X@73821120@-,ENSG00000229807,XIST,21.0,13.0,0.619048,True,4.535955e-18,1.0,0.01829,17.0,12.0,0.705882,False,7.300681e-18,1.0,0.01816
X@73826862@-,ENSG00000229807,XIST,14.0,10.0,0.714286,False,3.913417e-15,1.0,0.01829,22.0,15.0,0.681818,True,1.162221e-21,1.0,0.01816
X@73849843@-,ENSG00000229807,XIST,24.0,4.0,0.166667,True,0.0008864061,1.0,0.01829,15.0,0.0,0.0,False,1.0,1.0,0.01816
X@73850917@-,ENSG00000229807,XIST,21.0,20.0,0.952381,True,3.6063139999999998e-34,1.0,0.01829,21.0,15.0,0.714286,True,3.761988e-22,1.0,0.01816
X@73851062@-,ENSG00000229807,XIST,19.0,5.0,0.263158,False,1.918811e-05,1.0,0.01829,29.0,6.0,0.206897,True,1.188151e-05,1.0,0.01816
X@73852174@-,ENSG00000229807,XIST,21.0,4.0,0.190476,True,0.0005215627,1.0,0.01829,11.0,1.0,0.090909,False,0.1825468,0.916667,0.01816
X@73852197@-,ENSG00000229807,XIST,38.0,7.0,0.184211,True,5.243811e-06,0.974359,0.01829,21.0,3.0,0.142857,False,0.006232347,0.954545,0.01816
X@73852199@-,ENSG00000229807,XIST,39.0,7.0,0.179487,True,6.289491e-06,0.975,0.01829,25.0,3.0,0.12,False,0.01021295,0.961538,0.01816


### Step 3, merge replicates

In [7]:
# Show the sample sheet passed to the merge script via -l.
# Judging from the output, the first column is the merged group name (HEK) and
# the second is the replicate name (HEK_1 / HEK_2) — presumably the remaining
# fields mirror SampleSheet; verify against the script's --help.
!cat SampleSheet.merge_replicates

HEK	HEK_1	test1.pileups.txt gene	3
HEK	HEK_2	test2.pileups.txt gene	3


In [8]:
# Merge the replicates listed in SampleSheet.merge_replicates into one column
# group per sample and write the result to m6A.evaluated.merged.csv.
# Type --help to check more options
# NOTE(review): the script name ends in ".py.py". The captured log shows the
# command ran under this name, so it is left as-is — confirm the file name in
# your checkout (it may be merge_replicates.py).

!python merge_replicates.py.py -i m6A.evaluated.csv -l SampleSheet.merge_replicates -o  m6A.evaluated.merged.csv

[2023-05-31 14:45:56] CMD: merge_replicates.py.py -i m6A.evaluated.csv -l SampleSheet.merge_replicates -o m6A.evaluated.merged.csv
[2023-05-31 14:45:56] Finished.


In [9]:
# Load the replicate-merged table from Step 3. Same parsing as the earlier
# tables: two header rows (merged sample name, then metric) and three index
# columns. Per the output below, the metrics per merged sample are coverage,
# A count, m6A level, P-combined, passed and replicates.
df3 = pd.read_csv(
    "m6A.evaluated.merged.csv",
    header=[0, 1],
    index_col=[0, 1, 2],
)
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HEK,HEK,HEK,HEK,HEK,HEK
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,coverage,A count,m6A level,P-combined,passed,replicates
"('INFO', 'chr')","('INFO', 'pos')","('INFO', 'strand')",Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
14@103519749@-,ENSG00000166165,CKB,71.0,26.0,0.366197,8.810546e-27,True,2
7@5528622@-,ENSG00000075624,ACTB,41.0,17.0,0.414634,3.284872e-17,True,2
X@73821120@-,ENSG00000229807,XIST,38.0,25.0,0.657895,5.126862e-34,True,2
X@73826862@-,ENSG00000229807,XIST,36.0,25.0,0.694444,1.471209e-34,True,2
X@73850917@-,ENSG00000229807,XIST,42.0,35.0,0.833333,1.285991e-53,True,2
X@73851062@-,ENSG00000229807,XIST,48.0,11.0,0.229167,1.822009e-09,True,2
X@73852197@-,ENSG00000229807,XIST,59.0,10.0,0.169492,5.224738e-07,True,2
X@73852199@-,ENSG00000229807,XIST,64.0,10.0,0.15625,1.136347e-06,True,2
X@73852206@-,ENSG00000229807,XIST,67.0,56.0,0.835821,8.641746e-84,True,2
X@73852662@-,ENSG00000229807,XIST,57.0,8.0,0.140351,2.039157e-05,True,2
