This notebook keeps only the variants whose most severe consequence is listed in the consequence filter file and discards the rest. The p-values of the retained variants are then adjusted for multiple testing using the Benjamini-Hochberg method. Finally, the consequences and p-values are merged with the original base counts and editing proportions to produce the master output file.

In [13]:
library(dplyr)
library(tidyr)

In [14]:
# Paths to the two input tables read below: the per-site table (data_file)
# and the list of consequences used for filtering (filter_file)
data_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/JACUSA2_all_dpf_VEP_p_val.tsv"
filter_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/VEP/VEP_consequence_filter_to_keep.txt"

In [15]:
# Read both tab-separated inputs as plain data frames (no factors).
# data_file carries a header row; filter_file is read without one.
data <- read.table(
  file = data_file,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
filter_data <- read.table(
  file = filter_file,
  sep = "\t",
  header = FALSE,
  stringsAsFactors = FALSE
)

In [16]:
# Label the filter table's column so it can be referenced by name
names(filter_data) <- "consequence"

In [17]:
# Print the consequence filter table for inspection
filter_data

consequence
<chr>
transcript_ablation
stop_gained
frameshift_variant
stop_lost
start_lost
transcript_amplification
feature_elongation
feature_truncation
inframe_insertion
inframe_deletion


In [18]:
# Retain only the rows whose most severe consequence appears in the filter list
keep_row <- is.element(data$most_severe_consequence, filter_data$consequence)
data_new <- data[keep_row, ]

In [19]:
# Print the filtered table for inspection
data_new

Unnamed: 0_level_0,ID,p_value,most_severe_consequence
Unnamed: 0_level_1,<chr>,<dbl>,<chr>
1,1_10023876,0.0442094628,missense_variant
2,1_10054490,0.0662903807,missense_variant
3,1_10057975,0.0092256133,missense_variant
4,1_10067239,0.0028576348,missense_variant
8,1_10313231,0.0005234153,missense_variant
12,1_10379135,0.6257129526,missense_variant
13,1_10432785,0.5186447508,missense_variant
28,1_10897372,0.2453501989,missense_variant
29,1_10901303,0.2171556195,missense_variant
30,1_10918137,0.4873411206,missense_variant


In [20]:
# Benjamini-Hochberg adjustment of the retained p-values; the new column is
# inserted directly after p_value
data_new <- mutate(
  data_new,
  p_adj_BH = p.adjust(p_value, method = "BH"),
  .after = "p_value"
)

In [21]:
# Print the table again, now including the p_adj_BH column
data_new

Unnamed: 0_level_0,ID,p_value,p_adj_BH,most_severe_consequence
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>
1,1_10023876,0.0442094628,0.35273257,missense_variant
2,1_10054490,0.0662903807,0.40293478,missense_variant
3,1_10057975,0.0092256133,0.20586423,missense_variant
4,1_10067239,0.0028576348,0.13556956,missense_variant
8,1_10313231,0.0005234153,0.07134766,missense_variant
12,1_10379135,0.6257129526,0.86705499,missense_variant
13,1_10432785,0.5186447508,0.81722897,missense_variant
28,1_10897372,0.2453501989,0.64196246,missense_variant
29,1_10901303,0.2171556195,0.61626369,missense_variant
30,1_10918137,0.4873411206,0.80458672,missense_variant


In [22]:
# Load the per-site base counts for the AG and TC site tables.
# Both files are read without header = TRUE, so row 1 of each table holds the
# column labels; that row is captured, removed from the data, and then applied
# as the column names.

# --- AG sites ---
AG_counts <- read.table(
  "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_AG_sites.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)
col_names_1 <- AG_counts[1, ]
AG_counts <- AG_counts[-1, ]
colnames(AG_counts) <- col_names_1

# --- TC sites ---
TC_counts <- read.table(
  "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_TC_sites.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)
col_names_2 <- TC_counts[1, ]
TC_counts <- TC_counts[-1, ]
colnames(TC_counts) <- col_names_2

In [23]:
# Per-sample non-edited counts, edited counts and edited proportions.
# Read without header = TRUE, so the column labels arrive as the first data row.
proportions <- read.table(
  file = "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_merged_filtered_data.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)

In [24]:
# Give both count tables the same 64 column names so they can be merged:
# three site descriptors, five count columns for each of the twelve samples
# (Ctrl.01-06, NO.01-06), and a trailing tag column.
site_count_columns <- c(
  "ID", "strand", "ref",
  "Ctrl.01_Base_count", "Ctrl.01_A_count", "Ctrl.01_C_count", "Ctrl.01_G_count", "Ctrl.01_T_count",
  "Ctrl.02_Base_count", "Ctrl.02_A_count", "Ctrl.02_C_count", "Ctrl.02_G_count", "Ctrl.02_T_count",
  "Ctrl.03_Base_count", "Ctrl.03_A_count", "Ctrl.03_C_count", "Ctrl.03_G_count", "Ctrl.03_T_count",
  "Ctrl.04_Base_count", "Ctrl.04_A_count", "Ctrl.04_C_count", "Ctrl.04_G_count", "Ctrl.04_T_count",
  "Ctrl.05_Base_count", "Ctrl.05_A_count", "Ctrl.05_C_count", "Ctrl.05_G_count", "Ctrl.05_T_count",
  "Ctrl.06_Base_count", "Ctrl.06_A_count", "Ctrl.06_C_count", "Ctrl.06_G_count", "Ctrl.06_T_count",
  "NO.01_Base_count", "NO.01_A_count", "NO.01_C_count", "NO.01_G_count", "NO.01_T_count",
  "NO.02_Base_count", "NO.02_A_count", "NO.02_C_count", "NO.02_G_count", "NO.02_T_count",
  "NO.03_Base_count", "NO.03_A_count", "NO.03_C_count", "NO.03_G_count", "NO.03_T_count",
  "NO.04_Base_count", "NO.04_A_count", "NO.04_C_count", "NO.04_G_count", "NO.04_T_count",
  "NO.05_Base_count", "NO.05_A_count", "NO.05_C_count", "NO.05_G_count", "NO.05_T_count",
  "NO.06_Base_count", "NO.06_A_count", "NO.06_C_count", "NO.06_G_count", "NO.06_T_count",
  "tag"
)
colnames(AG_counts) <- site_count_columns
colnames(TC_counts) <- site_count_columns

In [25]:
# Combine the AG and TC tables into one.
# Both tables were given identical column names in the renaming step, so the
# join key is the full set of columns; all = TRUE keeps every row from both.
all_sites <- merge(
  x = AG_counts,
  y = TC_counts,
  by = colnames(AG_counts),
  all = TRUE
)

In [26]:
# Print the combined AG/TC site table for inspection
all_sites

ID,strand,ref,Ctrl.01_Base_count,Ctrl.01_A_count,Ctrl.01_C_count,Ctrl.01_G_count,Ctrl.01_T_count,Ctrl.02_Base_count,Ctrl.02_A_count,⋯,NO.05_A_count,NO.05_C_count,NO.05_G_count,NO.05_T_count,NO.06_Base_count,NO.06_A_count,NO.06_C_count,NO.06_G_count,NO.06_T_count,tag
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1_10023876,+,A,40090,40,0,9,0,54090,54,⋯,87,0,11,0,920320,92,0,32,0,AG
1_10054490,+,A,47101020,471,0,102,0,3840190,384,⋯,1268,0,212,0,70702710,707,0,271,0,AG
1_10057975,+,A,1550420,155,0,42,0,1150190,115,⋯,535,0,142,0,33701780,337,0,178,0,AG
1_10067239,-,T,0530151,0,53,0,151,0260107,0,⋯,0,115,0,241,0810161,0,81,0,161,TC
1_1006825,-,T,021015,0,21,0,15,011042,0,⋯,0,25,0,29,02038,0,2,0,38,TC
1_10068596,-,T,0159095,0,159,0,95,093095,0,⋯,0,583,0,0,0407034,0,407,0,34,TC
1_1009780,-,T,015023,0,15,0,23,010022,0,⋯,0,11,0,44,015018,0,15,0,18,TC
1_10313231,+,A,10120,1,0,12,0,70360,7,⋯,9,0,24,0,150250,15,0,25,0,AG
1_10314652,+,A,520290,52,0,29,0,480250,48,⋯,13,0,67,0,280340,28,0,34,0,AG
1_10325782,+,A,130190,13,0,19,0,5060,5,⋯,4,0,37,0,100390,10,0,39,0,AG


In [27]:
# Print the proportions table as loaded (column labels are still in row 1)
proportions

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ID,Ctrl.01_bases_Non_Edited_Count,Ctrl.01_bases_Edited_Count,Ctrl.01_bases_Edited_Count_Proportion,Ctrl.02_bases_Non_Edited_Count,Ctrl.02_bases_Edited_Count,Ctrl.02_bases_Edited_Count_Proportion,Ctrl.03_bases_Non_Edited_Count,Ctrl.03_bases_Edited_Count,Ctrl.03_bases_Edited_Count_Proportion,⋯,NO.03_bases_Edited_Count_Proportion,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion
1_5479,11,31,0.738095238095238,2,20,0.909090909090909,3,26,0.896551724137931,⋯,0.91304347826087,9,28,0.756756756756757,2,39,0.951219512195122,4,18,0.818181818181818
1_5536,6,27,0.818181818181818,5,22,0.814814814814815,4,20,0.833333333333333,⋯,0.885714285714286,12,36,0.75,1,31,0.96875,4,21,0.84
1_5539,33,0,0,29,0,0,20,4,0.166666666666667,⋯,0,47,0,0,32,0,0,27,0,0
1_6119,31,0,0,31,0,0,27,0,0,⋯,0.111111111111111,31,0,0,24,0,0,32,0,0
1_6643,32,0,0,21,0,0,35,0,0,⋯,0,18,0,0,32,0,0,15,0,0
1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,21,2,0.0869565217391304,22,0,0
1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,21,2,0.0869565217391304,1646,11,0.00663850331925166
1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,1592,4,0.0025062656641604,22,0,0
1_6727,33,0,0,41,0,0,40,2,0.0476190476190476,⋯,0,19,0,0,1592,4,0.0025062656641604,1646,11,0.00663850331925166


In [28]:
# Replace the default V1, V2, ... column names of proportions with the labels
# stored in its first row, then remove that label row from the data
columns <- proportions[1, ]
colnames(proportions) <- columns
proportions <- proportions[-1, ]

In [29]:
# Attach the editing proportions to the base-count table.
# merge(..., all = FALSE) is an inner join on ID, so the result already
# contains only IDs that are present in all_sites; no additional ID filter
# is needed afterwards.
unique_ids <- unique(all_sites$ID)  # retained for any later cell that reads it
merged_data <- merge(all_sites, proportions, by = "ID", all = FALSE)

# An ID that occurs more than once in either input multiplies rows in the
# join. Report it here so conflicting rows are not silently carried forward.
repeated_ids <- unique(merged_data$ID[duplicated(merged_data$ID)])
if (length(repeated_ids) > 0) {
  warning(
    length(repeated_ids), " ID(s) occur more than once after merging ",
    "all_sites with proportions, e.g. ",
    paste(head(repeated_ids, 5), collapse = ", "),
    call. = FALSE
  )
}

In [30]:
# Print the merged counts + proportions table for inspection
merged_data

Unnamed: 0_level_0,ID,strand,ref,Ctrl.01_Base_count,Ctrl.01_A_count,Ctrl.01_C_count,Ctrl.01_G_count,Ctrl.01_T_count,Ctrl.02_Base_count,Ctrl.02_A_count,⋯,NO.03_bases_Edited_Count_Proportion,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1_10023876,+,A,40090,40,0,9,0,54090,54,⋯,0.127659574468085,63,18,0.222222222222222,87,11,0.112244897959184,92,32,0.258064516129032
2,1_10054490,+,A,47101020,471,0,102,0,3840190,384,⋯,0.267886855241265,780,355,0.312775330396476,1268,212,0.143243243243243,707,271,0.277096114519427
3,1_10057975,+,A,1550420,155,0,42,0,1150190,115,⋯,0.198757763975155,461,228,0.330914368650218,535,142,0.209748892171344,337,178,0.345631067961165
4,1_10067239,-,T,0530151,0,53,0,151,0260107,0,⋯,0.46551724137931,184,113,0.38047138047138,241,115,0.323033707865169,161,81,0.334710743801653
5,1_1006825,-,T,021015,0,21,0,15,011042,0,⋯,0.19047619047619,38,17,0.309090909090909,29,25,0.462962962962963,38,2,0.05
6,1_10068596,-,T,0159095,0,159,0,95,093095,0,⋯,0.814977973568282,167,350,0.676982591876209,0,583,1,34,407,0.922902494331066
7,1_1009780,-,T,015023,0,15,0,23,010022,0,⋯,0.318181818181818,25,13,0.342105263157895,44,11,0.2,18,15,0.454545454545455
8,1_10313231,+,A,10120,1,0,12,0,70360,7,⋯,0.652173913043478,10,18,0.642857142857143,9,24,0.727272727272727,15,25,0.625
9,1_10314652,+,A,520290,52,0,29,0,480250,48,⋯,0.372093023255814,63,30,0.32258064516129,13,67,0.8375,28,34,0.548387096774194
10,1_10325782,+,A,130190,13,0,19,0,5060,5,⋯,0.411764705882353,11,21,0.65625,4,37,0.902439024390244,10,39,0.795918367346939


In [31]:
# Inner join on ID: keep only the sites that have both a consequence/p-value
# record and count/proportion data
final_data <- merge(x = data_new, y = merged_data, by = "ID")

In [32]:
# Print the final merged table for inspection
final_data

ID,p_value,p_adj_BH,most_severe_consequence,strand,ref,Ctrl.01_Base_count,Ctrl.01_A_count,Ctrl.01_C_count,Ctrl.01_G_count,⋯,NO.03_bases_Edited_Count_Proportion,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1_10023876,0.0442094628,0.35273257,missense_variant,+,A,40090,40,0,9,⋯,0.127659574468085,63,18,0.222222222222222,87,11,0.112244897959184,92,32,0.258064516129032
1_10054490,0.0662903807,0.40293478,missense_variant,+,A,47101020,471,0,102,⋯,0.267886855241265,780,355,0.312775330396476,1268,212,0.143243243243243,707,271,0.277096114519427
1_10057975,0.0092256133,0.20586423,missense_variant,+,A,1550420,155,0,42,⋯,0.198757763975155,461,228,0.330914368650218,535,142,0.209748892171344,337,178,0.345631067961165
1_10067239,0.0028576348,0.13556956,missense_variant,-,T,0530151,0,53,0,⋯,0.46551724137931,184,113,0.38047138047138,241,115,0.323033707865169,161,81,0.334710743801653
1_10313231,0.0005234153,0.07134766,missense_variant,+,A,10120,1,0,12,⋯,0.652173913043478,10,18,0.642857142857143,9,24,0.727272727272727,15,25,0.625
1_10379135,0.6257129526,0.86705499,missense_variant,+,A,50160,5,0,16,⋯,0.91304347826087,18,34,0.653846153846154,16,78,0.829787234042553,14,24,0.631578947368421
1_10432785,0.5186447508,0.81722897,missense_variant,+,A,170160,17,0,16,⋯,0.117647058823529,59,27,0.313953488372093,16,40,0.714285714285714,22,13,0.371428571428571
1_10897372,0.2453501989,0.64196246,missense_variant,-,T,064019,0,64,0,⋯,0.711864406779661,51,100,0.662251655629139,44,126,0.741176470588235,76,63,0.453237410071942
1_10901303,0.2171556195,0.61626369,missense_variant,-,T,085025,0,85,0,⋯,0.673333333333333,45,124,0.733727810650888,45,137,0.752747252747253,111,105,0.486111111111111
1_10918137,0.4873411206,0.80458672,missense_variant,-,T,014023,0,14,0,⋯,0.364705882352941,83,29,0.258928571428571,15,76,0.835164835164835,75,51,0.404761904761905


In [33]:
# Collect the IDs of every row that repeats an ID seen in an earlier row
is_repeat_id <- duplicated(final_data$ID)
duplicated_ids <- final_data$ID[is_repeat_id]

In [34]:
# Print the repeated IDs, if any
duplicated_ids

In [35]:
# Keep only the first row for each ID; later rows with the same ID are dropped
is_first_occurrence <- !duplicated(final_data$ID)
final_data_unique <- final_data[is_first_occurrence, ]

In [36]:
# Print the de-duplicated final table for inspection
final_data_unique

Unnamed: 0_level_0,ID,p_value,p_adj_BH,most_severe_consequence,strand,ref,Ctrl.01_Base_count,Ctrl.01_A_count,Ctrl.01_C_count,Ctrl.01_G_count,⋯,NO.03_bases_Edited_Count_Proportion,NO.04_bases_Non_Edited_Count,NO.04_bases_Edited_Count,NO.04_bases_Edited_Count_Proportion,NO.05_bases_Non_Edited_Count,NO.05_bases_Edited_Count,NO.05_bases_Edited_Count_Proportion,NO.06_bases_Non_Edited_Count,NO.06_bases_Edited_Count,NO.06_bases_Edited_Count_Proportion
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1_10023876,0.0442094628,0.35273257,missense_variant,+,A,40090,40,0,9,⋯,0.127659574468085,63,18,0.222222222222222,87,11,0.112244897959184,92,32,0.258064516129032
2,1_10054490,0.0662903807,0.40293478,missense_variant,+,A,47101020,471,0,102,⋯,0.267886855241265,780,355,0.312775330396476,1268,212,0.143243243243243,707,271,0.277096114519427
3,1_10057975,0.0092256133,0.20586423,missense_variant,+,A,1550420,155,0,42,⋯,0.198757763975155,461,228,0.330914368650218,535,142,0.209748892171344,337,178,0.345631067961165
4,1_10067239,0.0028576348,0.13556956,missense_variant,-,T,0530151,0,53,0,⋯,0.46551724137931,184,113,0.38047138047138,241,115,0.323033707865169,161,81,0.334710743801653
5,1_10313231,0.0005234153,0.07134766,missense_variant,+,A,10120,1,0,12,⋯,0.652173913043478,10,18,0.642857142857143,9,24,0.727272727272727,15,25,0.625
6,1_10379135,0.6257129526,0.86705499,missense_variant,+,A,50160,5,0,16,⋯,0.91304347826087,18,34,0.653846153846154,16,78,0.829787234042553,14,24,0.631578947368421
7,1_10432785,0.5186447508,0.81722897,missense_variant,+,A,170160,17,0,16,⋯,0.117647058823529,59,27,0.313953488372093,16,40,0.714285714285714,22,13,0.371428571428571
8,1_10897372,0.2453501989,0.64196246,missense_variant,-,T,064019,0,64,0,⋯,0.711864406779661,51,100,0.662251655629139,44,126,0.741176470588235,76,63,0.453237410071942
9,1_10901303,0.2171556195,0.61626369,missense_variant,-,T,085025,0,85,0,⋯,0.673333333333333,45,124,0.733727810650888,45,137,0.752747252747253,111,105,0.486111111111111
10,1_10918137,0.4873411206,0.80458672,missense_variant,-,T,014023,0,14,0,⋯,0.364705882352941,83,29,0.258928571428571,15,76,0.835164835164835,75,51,0.404761904761905


In [37]:
# Write the de-duplicated table as a tab-separated file, without quoting and
# without row names
output_path <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/JACUSA2_all_dpf_final.tsv"
write.table(
  x = final_data_unique,
  file = output_path,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)