Trying to create this format for VEP:

VCF

VEP also supports using VCF (Variant Call Format) version 4.0. This is a common format used by the 1000 genomes project, and can be produced as an output format by many variant calling tools:

#CHROM  POS        ID      REF  ALT            QUAL  FILTER  INFO  FORMAT

1       65568      .       A    C              .     .       .     .

In [1]:
library(dplyr)
library(tidyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
# Paths to the A/G and T/C site tables (tab-separated files, one per
# substitution type) that are loaded with read.table() further down
AG_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_AG_sites.tsv"
TC_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/all_dpf/all_dpf_TC_sites.tsv"

In [4]:
# Load the A/G and T/C site tables.
# read.table() guesses column types, and a column that contains only "T" is
# parsed as logical TRUE. Pinning `ref` to character keeps the base letters
# as text; columns not named in colClasses are still auto-detected.
AG_data <- read.table(
  AG_file,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  colClasses = c(ref = "character")
)
TC_data <- read.table(
  TC_file,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  colClasses = c(ref = "character")
)

The T/C (minus-strand) sites were checked in the genome browser and correspond to A/G on the forward strand. Because the VCF input for VEP assumes the forward strand, all T/C sites are recoded as A/G below.

In [5]:
# Recode the T/C sites as A/G: every row of the T/C table gets the
# reference base "A". Whatever was previously stored in `ref` is overwritten.
# NOTE(review): a VCF REF allele has to match the forward-strand reference
# base -- confirm these positions really read "A" on the forward strand.
TC_data[["ref"]] <- "A"

In [6]:
# Both tables get a constant alternate allele of "G"
# (the T/C table is treated as A/G from here on)
AG_data[["ALT"]] <- "G"
TC_data[["ALT"]] <- "G"

In [7]:
# Remove counts data: drop every column whose name contains "count".
# drop = FALSE keeps the result a data frame even if only a single column
# is left after filtering (otherwise it would collapse to a bare vector).
AG_data <- AG_data[, !grepl("count", colnames(AG_data)), drop = FALSE]
TC_data <- TC_data[, !grepl("count", colnames(TC_data)), drop = FALSE]

In [8]:
# Remove strand and tag: drop every column whose name contains "strand" or
# "tag". Both patterns are handled in one pass per table, and drop = FALSE
# keeps the result a data frame even if only one column is left.
AG_data <- AG_data[, !grepl("strand|tag", colnames(AG_data)), drop = FALSE]
TC_data <- TC_data[, !grepl("strand|tag", colnames(TC_data)), drop = FALSE]

In [9]:
# Display the trimmed A/G table; only ID, ref and ALT should remain
AG_data

ID,ref,ALT
<chr>,<chr>,<chr>
1_10023876,A,G
1_10054490,A,G
1_10057975,A,G
1_10313231,A,G
1_10314652,A,G
1_10325782,A,G
1_10325785,A,G
1_10379135,A,G
1_10432785,A,G
1_10684036,A,G


In [10]:
# Combine the A/G and T/C tables into one table for further manipulation.
# All three columns form the join key, and rows present in either table
# are kept (full outer join).
all_data <- merge(
  x = AG_data,
  y = TC_data,
  by = c("ID", "ref", "ALT"),
  all.x = TRUE,
  all.y = TRUE
)

In [11]:
# Display the merged table (columns: ID, ref, ALT)
all_data

ID,ref,ALT
<chr>,<chr>,<chr>
1_10023876,A,G
1_10054490,A,G
1_10057975,A,G
1_10067239,A,G
1_1006825,A,G
1_10068596,A,G
1_1009780,A,G
1_10313231,A,G
1_10314652,A,G
1_10325782,A,G


In [12]:
# Split ID ("<chrom>_<pos>") into the VCF "#CHROM" and "POS" columns.
# The split happens at the LAST underscore only (regex lookahead), so a
# chromosome/scaffold name that itself contains "_" stays in one piece
# instead of being cut short and having the real position discarded.
# remove = FALSE keeps the original ID column alongside the new ones.
all_data <- all_data %>%
  separate(
    ID,
    into = c("#CHROM", "POS"),
    sep = "_(?=[^_]*$)",
    remove = FALSE
  )

In [13]:
# Preview the first rows to check the new #CHROM and POS columns
head(all_data)

Unnamed: 0_level_0,ID,#CHROM,POS,ref,ALT
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,1_10023876,1,10023876,A,G
2,1_10054490,1,10054490,A,G
3,1_10057975,1,10057975,A,G
4,1_10067239,1,10067239,A,G
5,1_1006825,1,1006825,A,G
6,1_10068596,1,10068596,A,G


In [14]:
# Drop the combined "<chrom>_<pos>" ID column
all_data <- select(all_data, -ID)

In [15]:
# Add the VCF ID column directly after POS, holding "." in every row.

# Position of the "POS" column; it must exist exactly once
pos_index <- which(names(all_data) == "POS")
stopifnot(length(pos_index) == 1L)

# Rebuild the table as: columns up to POS, the new ID column, remaining
# columns. List-style indexing (df[cols]) always returns a data frame, so
# this also works when POS is the last column or a slice has one column.
# rep() sizes the filler to the table so an empty table does not fail.
all_data <- cbind(
  all_data[seq_len(pos_index)],
  ID = rep(".", nrow(all_data)),
  all_data[-seq_len(pos_index)]
)

In [16]:
# Switch the lower-case "ref" header to "REF"; other column names are untouched
is_ref_col <- names(all_data) == "ref"
names(all_data)[is_ref_col] <- "REF"

In [17]:
# Add the VCF QUAL column, filled with "." in every row.
# A constant is all that is needed: no per-row condition is involved.
# mutate() appends new columns at the end of the table, which here is
# directly after ALT.

# Index of the "ALT" column (not needed for placement; kept so any cell
# that reads alt_index still works)
alt_index <- which(names(all_data) == "ALT")

all_data <- all_data %>%
  mutate(QUAL = ".")

In [18]:
# Preview the first rows to check the new QUAL column
head(all_data)

Unnamed: 0_level_0,#CHROM,POS,ID,REF,ALT,QUAL
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,10023876,.,A,G,.
2,1,10054490,.,A,G,.
3,1,10057975,.,A,G,.
4,1,10067239,.,A,G,.
5,1,1006825,.,A,G,.
6,1,10068596,.,A,G,.


In [19]:
# Add the VCF FILTER column, filled with "." in every row.
# A constant is all that is needed: no per-row condition is involved.
# mutate() appends new columns at the end of the table, which here is
# directly after QUAL.

# Index of the "QUAL" column (not needed for placement; kept so any cell
# that reads qual_index still works)
qual_index <- which(names(all_data) == "QUAL")

all_data <- all_data %>%
  mutate(FILTER = ".")

In [20]:
# Preview the first rows to check the new FILTER column
head(all_data)

Unnamed: 0_level_0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,10023876,.,A,G,.,.
2,1,10054490,.,A,G,.,.
3,1,10057975,.,A,G,.,.
4,1,10067239,.,A,G,.,.
5,1,1006825,.,A,G,.,.
6,1,10068596,.,A,G,.,.


In [21]:
# Add the VCF INFO column, filled with "." in every row.
# A constant is all that is needed: no per-row condition is involved.
# mutate() appends new columns at the end of the table, which here is
# directly after FILTER.

# Index of the "FILTER" column (not needed for placement; kept so any cell
# that reads filter_index still works)
filter_index <- which(names(all_data) == "FILTER")

all_data <- all_data %>%
  mutate(INFO = ".")

In [22]:
# Preview the first rows to check the new INFO column
head(all_data)

Unnamed: 0_level_0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,10023876,.,A,G,.,.,.
2,1,10054490,.,A,G,.,.,.
3,1,10057975,.,A,G,.,.,.
4,1,10067239,.,A,G,.,.,.
5,1,1006825,.,A,G,.,.,.
6,1,10068596,.,A,G,.,.,.


In [23]:
# Add the VCF FORMAT column, filled with "." in every row.
# A constant is all that is needed: no per-row condition is involved.
# mutate() appends new columns at the end of the table, which here is
# directly after INFO.

# Index of the "INFO" column (not needed for placement; kept so any cell
# that reads info_index still works)
info_index <- which(names(all_data) == "INFO")

all_data <- all_data %>%
  mutate(FORMAT = ".")

In [24]:
# Preview the first rows to check the final column layout
# (#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT)
head(all_data)

Unnamed: 0_level_0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,10023876,.,A,G,.,.,.,.
2,1,10054490,.,A,G,.,.,.,.
3,1,10057975,.,A,G,.,.,.,.
4,1,10067239,.,A,G,.,.,.,.
5,1,1006825,.,A,G,.,.,.,.
6,1,10068596,.,A,G,.,.,.,.


In [26]:
# Define output file path: destination of the tab-separated, VCF-style
# table that is written out with write.table() below
output_file <- "/mnt/vast/hpc/csg/hcs2152/ZFR_RNA_Editing/JACUSA2/VEP/all_dpf/JACUSA2_all_dpf_VEP_input.tsv"

In [75]:
# Save the table as tab-separated text: header row included, no quoting
# around values, and no row-name column
write.table(
  x = all_data,
  file = output_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)