This notebook merges the VEP "most severe" consequence annotation for each site with the corresponding p-value from REDITs.

In [12]:
library(dplyr)
library(tidyr)

In [13]:
# Input file locations ----
# VEP output file (named "most_severe_output") for the all_dpf JACUSA2 set
vep_file <-
  "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/VEP/all_dpf/JACUSA2_all_dpf_VEP_most_severe_output.txt"
# REDITs p-value table for the all_dpf JACUSA2 set
p_file <-
  "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/REDITs/all_dpf/JACUSA2_all_dpf_p_values.tsv"

In [14]:
# Read both tab-separated inputs into data frames ----
# The VEP table is read without a header row, so its columns get the default
# names V1, V2, ...; read.table's default comment.char ("#") means text after
# a "#" is ignored, so lines that start with "#" are skipped.
vep_data <- read.table(
  file = vep_file,
  sep = "\t",
  header = FALSE,
  stringsAsFactors = FALSE
)
# The REDITs table carries its own header row.
p_data <- read.table(
  file = p_file,
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

In [15]:
# Keep only the 1st (ID) and 4th (most severe consequence) VEP columns and
# give them descriptive names.
vep_new <- setNames(
  vep_data[c(1, 4)],
  c("ID", "most_severe_consequence")
)

In [16]:
# Remove the trailing allele suffix from the VEP ID column so the IDs can be
# matched against the REDITs IDs. Both "_A/G" and "_T/C" are handled, mirroring
# the "___AG" / "___TC" suffixes stripped from the REDITs IDs; previously only
# "_A/G" was removed, so any "_T/C" ID would keep its suffix and be silently
# dropped by the merge on ID. The pattern is anchored with "$" so only a suffix
# at the end of the ID is removed.
vep_new$ID <- sub("_(A/G|T/C)$", "", vep_new$ID)

In [17]:
# Strip the edit-type tag from the REDITs IDs: first "___AG", then "___TC"
# (each call removes the first occurrence only).
p_data$ID <- sub("___TC", "", sub("___AG", "", p_data$ID))

In [18]:
# Join p-values to consequences on ID. merge() defaults to an inner join, so
# only IDs present in both tables are kept.
annotated_data <- merge(x = p_data, y = vep_new, by = "ID")

In [19]:
# Drop the BH-adjusted p-value column (the adjustment is redone after
# filtering). drop = FALSE guarantees the result stays a data frame even if
# only a single column were to remain after the removal.
annotated_data <- annotated_data[
  ,
  !colnames(annotated_data) %in% "p_adj_BH",
  drop = FALSE
]

In [20]:
# Display the merged table for a visual sanity check.
annotated_data

ID,p_value,most_severe_consequence
<chr>,<dbl>,<chr>
1_10023876,0.0442094628,missense_variant
1_10054490,0.0662903807,missense_variant
1_10057975,0.0092256133,missense_variant
1_10067239,0.0028576348,missense_variant
1_1006825,0.4785242670,non_coding_transcript_exon_variant
1_10068596,0.0054651169,synonymous_variant
1_1009780,0.8750256688,non_coding_transcript_exon_variant
1_10313231,0.0005234153,missense_variant
1_10314652,0.8984568719,3_prime_UTR_variant
1_10325782,0.3259710215,intron_variant


In [21]:
# Destination for the merged p-value + consequence table (tab-separated)
output_file <-
  "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/JACUSA2_all_dpf_VEP_p_val.tsv"

In [11]:
# Write the annotated table as a TSV: no quoting of strings, no row names.
write.table(
  x = annotated_data,
  file = output_file,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)