# Step 11: Consequence File Related (PreFiltration_Type)

## Inputs:

In [None]:

# Tab-separated "visual" file produced by Step 10 (dbSNP sites removed).
# It is read with read.table(sep = '\t', header = TRUE) further down and is
# merged on its "chromosome" and "position" columns.
file_path <- "/path/to/Step_10___RemovalOfAllDBsnpSitesFromBisualFile/PreFiltration_Type/Step_10___Removed_TabSeparatedFileRows_AKA_DBsnpSitesFileRows.tsv"

# Consequence file from Step 09, generated with the most severe flag.
# Its first line is used as the column header (a leading '#' is stripped) and
# it must provide the Uploaded_variation, Location and Consequence columns.
consequence_file_path <- "/path/to/Step_09___RemovingLinesBeforeColumnNames/PreFiltration_Type/A___Most_Severe_Flag_Added/RemovedLinesBeforeColumnNames_S82a___MostSevereFlagAdded___.txt"

# Directory that receives one "Part_<n>___..." subfolder per output stage
output_directory <- "/path/to/Step_11___Consequence_Column_Related/PreFiltration_Type"

# Free-text descriptor embedded in every output file name
# ("Part_<n>___<File_BaseName>___<stage>.tsv/.csv"); must not be empty.
File_BaseName <- "DescriptorOfYourChoice___PreFiltration_Type"


## Check Provided Paths/Etc. Exist And Create Prefixes Per Part:

In [None]:
# Report whether the Step 10 TSV is present at file_path
if (!file.exists(file_path)) {
  cat("File does not exist. Please provide a valid file path.\n")
} else {
  cat("\n*TSV File Path:*", file_path, "\n")
}

# Report whether the consequence file is present
if (!file.exists(consequence_file_path)) {
  cat("Consequence file does not exist. Please provide a valid file path.\n")
} else {
  cat("\n*Consequence File Path:*", consequence_file_path, "\n")
}

# Report whether the output directory is present
if (!dir.exists(output_directory)) {
  cat("Output directory does not exist. Please provide a valid directory path.\n")
} else {
  cat("\n*Output Directory:*", output_directory, "\n")
}

# File_BaseName is embedded in every output file name, so flag an empty value.
# This only prints a message; it does not stop the run.
if (nchar(File_BaseName) != 0) {
  cat("\n*File_BaseName:*", File_BaseName, "\n\n")
} else {
  cat("\n\n\n\n****Error: File_BaseName is an empty string. Please provide a non-empty base name.****\n\n")
}

### Print Prefix And Stem Name For Each Part

In [None]:
# Build the double-quoted label "Part_<n>___<File_BaseName>___" for one part.
# The quotes are only there for display; each writing step strips them again.
make_part_prefix <- function(part_number) {
  paste0("\"Part_", part_number, "___", File_BaseName, "___\"")
}

Part_0___Prefix <- make_part_prefix(0)
Part_1___Prefix <- make_part_prefix(1)
Part_2___Prefix <- make_part_prefix(2)
Part_3___Prefix <- make_part_prefix(3)
Part_4___Prefix <- make_part_prefix(4)

# Echo every prefix so the file naming can be checked before anything is written
part_prefixes <- c(
  Part_0___Prefix,
  Part_1___Prefix,
  Part_2___Prefix,
  Part_3___Prefix,
  Part_4___Prefix
)
for (part_index in seq_along(part_prefixes)) {
  cat(
    paste0("*Prefix for Part ", part_index - 1, ":* "),
    part_prefixes[part_index],
    "\n"
  )
}


## Read In File:

In [None]:

# Read the Step 10 TSV with every column as text first.
# read.table's automatic type detection turns a column whose non-missing
# values are all "T" or "F" into logical TRUE/FALSE. For allele columns (for
# example a Reference_* column where a sample only has "T" calls) that would
# silently replace the base with TRUE, both in Reference_AllSamples and in
# every file written later.
data <- read.table(
  file_path,
  sep = "\t",
  header = TRUE,
  colClasses = "character"
)

# Re-apply the usual type detection column by column, but keep the original
# text whenever the detected type would be logical.
data[] <- lapply(data, function(column) {
  converted <- type.convert(column, as.is = TRUE)
  if (is.logical(converted)) column else converted
})

# View the first six rows of the data
head(data)


## Read In Consequence File:

In [None]:
# The first line holds the column names, prefixed with '#'; read it on its own
# and strip that marker so the names can be handed to read.table.
header <- readLines(consequence_file_path, n = 1)
header <- gsub("^#", "", header)
column_names <- strsplit(header, "\t", fixed = TRUE)[[1]]

# Read the data rows. skip = 1 jumps over the header line read above.
# quote = "" turns quote handling off: with read.table's default, a single
# apostrophe or double quote inside an annotation field opens a "quoted
# string" that swallows the following tabs and line breaks, silently merging
# rows. Comment handling (comment.char = '#') is left as it was.
consequence_data <- read.table(
  consequence_file_path,
  header = FALSE,
  sep = "\t",
  quote = "",
  comment.char = "#",
  skip = 1,
  col.names = column_names
)

# View the first six rows of the data
head(consequence_data)

### Remove Allele, Impact, Symbol, Gene, Feature_type, Feature, Biotype, cDNA_position, CDS_position, Protein_position, Amino_acids, Codons, Existing_variation & Extra Columns

In [None]:
# Only the variant identifier, its location and its consequence are needed
# from here on; every other annotation column is dropped.
columns_to_keep <- c("Uploaded_variation", "Location", "Consequence")
consequence_data_subset <- consequence_data[columns_to_keep]

# Preview the reduced table
head(consequence_data_subset)


## Libraries:

In [None]:

library(dplyr)
library(stringr)
library(tidyr)


### Split Uploaded_variation Column By Underscore

In [None]:
# Make sure the identifier is plain text before pattern matching
consequence_data_subset$Uploaded_variation <- as.character(consequence_data_subset$Uploaded_variation)

# Parse "<chromosome>_<position>_<ref/alt>" from the right-hand side.
# Splitting on every underscore breaks contig names that themselves contain
# underscores (the chromosome is cut short and the position column receives
# part of the contig name). Anchoring on the trailing "_<digits>_<alleles>"
# keeps the whole contig name together.
variation_pattern <- "^(.+)_([0-9]+)_([^_]*)$"
variation_match <- str_match(consequence_data_subset$Uploaded_variation, variation_pattern)

# Columns 2-4 of the match matrix are the three captured pieces; the matrix
# always has these columns, even when there are no rows to parse.
split_variation <- variation_match[, 2:4, drop = FALSE]

# Identifiers that do not follow the expected layout get NA in all three new
# columns (and therefore never match a row in the merge); say so explicitly.
unparsed_rows <- is.na(variation_match[, 1])
if (any(unparsed_rows)) {
  warning(
    sum(unparsed_rows),
    " Uploaded_variation value(s) did not match <chromosome>_<position>_<ref/alt>",
    call. = FALSE
  )
}

# Add new columns for chromosome, position, and RefAlt to the existing data frame
consequence_data_subset$Chromosome <- split_variation[, 1]
consequence_data_subset$Position <- split_variation[, 2]
consequence_data_subset$RefAlt <- split_variation[, 3]

# View the updated data frame
head(consequence_data_subset)

In [None]:
# Chromosome and Position now carry the join key, so the raw identifier,
# the location string and the ref/alt piece are no longer needed.
columns_to_drop <- c("Uploaded_variation", "Location", "RefAlt")
drop_positions <- match(columns_to_drop, names(consequence_data_subset))
consequence_data_subset <- consequence_data_subset[-drop_positions]

# Preview what is left
head(consequence_data_subset)


### See The Unique Strings In The Consequence Columns:

In [None]:
# Collect each distinct consequence term once, in order of first appearance
consequence_terms <- consequence_data_subset$Consequence
unique_consequences <- consequence_terms[!duplicated(consequence_terms)]

# List the terms one per line under a heading
cat(c("*Unique Consequences:*\n", unique_consequences), sep = "\n")

### Make An Impact Column:

In [None]:
# Lookup table from consequence term to impact class.
# Only HIGH and MODERATE terms are listed; any other term maps to NA and is
# removed by the impact filter that follows.
impact_mapping <- c(
  "transcript_ablation" = "HIGH",
  "splice_acceptor_variant" = "HIGH",
  "splice_donor_variant" = "HIGH",
  "stop_gained" = "HIGH",
  "frameshift_variant" = "HIGH",
  "stop_lost" = "HIGH",
  "start_lost" = "HIGH",
  "transcript_amplification" = "HIGH",
  "feature_elongation" = "HIGH",
  "feature_truncation" = "HIGH",
  "inframe_insertion" = "MODERATE",
  "inframe_deletion" = "MODERATE",
  "missense_variant" = "MODERATE",
  "protein_altering_variant" = "MODERATE"
)

# Look the terms up by name. as.character() matters when Consequence was read
# as a factor: indexing with a factor uses its integer codes, i.e. positions
# in impact_mapping, and would assign unrelated impact classes.
# unname() keeps the consequence terms from riding along as element names.
# NOTE(review): the lookup needs a single term per row; a comma-separated
# list of terms would map to NA. Confirm the input always comes from the
# most-severe-flag run.
consequence_terms <- as.character(consequence_data_subset$Consequence)
consequence_data_subset$IMPACT <- unname(impact_mapping[consequence_terms])

# View the updated data frame
head(consequence_data_subset)


#### Remove All Rows Where Impact Is Not Moderate Or High In Consequence Data Set

In [None]:
# Keep only consequence rows classed as HIGH or MODERATE; rows whose IMPACT
# is NA (terms missing from the mapping) are dropped as well.
impact_levels_to_keep <- c("HIGH", "MODERATE")
has_wanted_impact <- is.element(consequence_data_subset$IMPACT, impact_levels_to_keep)
filtered_data <- consequence_data_subset[has_wanted_impact, ]

# Preview the filtered consequence table
head(filtered_data)


## Merge DataSets:

In [None]:
# Join keys: the TSV uses lower-case names, the consequence table capitalised ones
data_key_columns <- c("chromosome", "position")
consequence_key_columns <- c("Chromosome", "Position")

# Left join: every TSV row is kept; rows without a HIGH/MODERATE consequence
# at the same chromosome and position get NA in Consequence and IMPACT.
merged_data <- merge(
  x = data,
  y = filtered_data,
  by.x = data_key_columns,
  by.y = consequence_key_columns,
  all.x = TRUE
)

# Preview the merged table
head(merged_data)


### Write Merged Data To Part 0 Folder

In [None]:

# Create a subfolder named 'Part_0___Merged_Data_With_NA_In_Impact_Column'.
# recursive = TRUE matches the Part 4 folder creation and also builds
# output_directory itself when it is missing; without it the call fails and,
# because warnings are switched off, the failure goes unnoticed.
subfolder <- file.path(output_directory, 'Part_0___Merged_Data_With_NA_In_Impact_Column')
dir.create(subfolder, showWarnings = FALSE, recursive = TRUE)

# Warnings are suppressed above, so confirm the folder really is there
# instead of letting the write calls fail with a less helpful message.
if (!dir.exists(subfolder)) {
  stop("Could not create output subfolder: ", subfolder, call. = FALSE)
}

# Remove the display quotes from Part_0___Prefix
Part_0___Prefix <- gsub('"', '', Part_0___Prefix)

# Output file paths: <subfolder>/<prefix><stage name>.<extension>
tsv_file <- file.path(subfolder, paste0(Part_0___Prefix, "Merged_Data_With_NA_In_Impact_Column.tsv"))
csv_file <- file.path(subfolder, paste0(Part_0___Prefix, "Merged_Data_With_NA_In_Impact_Column.csv"))

# Write merged_data as TSV (unquoted) and as CSV, both without row names
write.table(merged_data, file = tsv_file, sep = '\t', quote = FALSE, row.names = FALSE)
write.csv(merged_data, file = csv_file, row.names = FALSE)

# Print success message
cat("Files written to:", tsv_file, "and", csv_file, "\n")


## Create A Merged Reference Column With Only The Unique Values:

In [None]:
# Identify Reference_* columns
ref_columns <- grep("^Reference_", names(merged_data), value = TRUE)

# Create Reference_AllSamples: for every row, the distinct characters found
# across all Reference_* columns, in order of first appearance.
merged_data$Reference_AllSamples <- apply(merged_data[, ref_columns, drop = FALSE], 1, function(row) {
  # Skip missing values. Pasting an NA would insert the text "NA" and add
  # spurious "N" and "A" characters to the result.
  present_values <- as.character(row[!is.na(row)])

  # Break every remaining value into single characters
  all_characters <- unlist(strsplit(present_values, ""), use.names = FALSE)

  # Keep each character once and glue them back together; a row with no
  # usable reference value yields the empty string
  paste(unique(all_characters), collapse = "")
})

# View the updated data frame
head(merged_data)


## View The First 50 Lines:

In [None]:
# Show the first 50 rows of the merged table, including Reference_AllSamples
head(merged_data, n = 50)

In [None]:
# Set the subfolder name
Part_1___Subfolder <- "Part_1___AddedInAllSampleReferenceColumn"

# Create the full path for the subfolder
subfolder_path <- file.path(output_directory, Part_1___Subfolder)

# Create the subfolder if it doesn't exist.
# dir.exists() tests specifically for a directory, and recursive = TRUE
# (as already used for Part 4) also creates output_directory when missing.
if (!dir.exists(subfolder_path)) {
  dir.create(subfolder_path, recursive = TRUE)
}

# Remove the display quotes from Part_1___Prefix
Part_1___Prefix <- gsub('"', '', Part_1___Prefix)

# Output file paths: <subfolder>/<prefix><stage name>.<extension>
tsv_file <- file.path(subfolder_path, paste0(Part_1___Prefix, "AddedInAllSampleReferenceColumn.tsv"))
csv_file <- file.path(subfolder_path, paste0(Part_1___Prefix, "AddedInAllSampleReferenceColumn.csv"))

# Write merged_data to TSV (unquoted, no row names)
write.table(merged_data, tsv_file, sep = '\t', quote = FALSE, row.names = FALSE)

# Write merged_data to CSV (no row names)
write.csv(merged_data, csv_file, row.names = FALSE)

# Print confirmation
cat("*Data has been successfully written to the subfolder:* ", subfolder_path, "\n")


If the output says "No rows with empty string in Reference_AllSamples.", then all the Reference_(SingleSample) columns intersected correctly.

## Remove All Impact Rows Where Impact Is Not Equal To Moderate Or High

In [None]:
# Keep only merged rows with a MODERATE or HIGH impact; rows left with NA by
# the merge (no matching consequence) are dropped here.
wanted_impacts <- c("MODERATE", "HIGH")
impact_is_wanted <- is.element(merged_data$IMPACT, wanted_impacts)
merged_data_filtered <- merged_data[impact_is_wanted, ]

# Preview the filtered table
head(merged_data_filtered)


In [None]:
# Set the subfolder name for Part 2
Part_2___Subfolder <- "Part_2___RemovedNonModerateNonHighFromImpactColumn"

# Create the full path for the subfolder
Part_2___Subfolder_path <- file.path(output_directory, Part_2___Subfolder)

# Create the subfolder if it doesn't exist.
# dir.exists() tests specifically for a directory, and recursive = TRUE
# (as already used for Part 4) also creates output_directory when missing.
if (!dir.exists(Part_2___Subfolder_path)) {
  dir.create(Part_2___Subfolder_path, recursive = TRUE)
}

# Remove the display quotes from Part_2___Prefix
Part_2___Prefix <- gsub('"', '', Part_2___Prefix)

# Output file paths: <subfolder>/<prefix><stage name>.<extension>
tsv_file_part_2 <- file.path(Part_2___Subfolder_path, paste0(Part_2___Prefix, "RemovedNonModerateNonHighFromImpactColumn.tsv"))
csv_file_part_2 <- file.path(Part_2___Subfolder_path, paste0(Part_2___Prefix, "RemovedNonModerateNonHighFromImpactColumn.csv"))

# Write merged_data_filtered to TSV for Part 2 (unquoted, no row names)
write.table(merged_data_filtered, tsv_file_part_2, sep = '\t', quote = FALSE, row.names = FALSE)

# Write merged_data_filtered to CSV for Part 2 (no row names)
write.csv(merged_data_filtered, csv_file_part_2, row.names = FALSE)

# Print confirmation for Part 2
cat("*Data (Part 2) has been successfully written to the subfolder:* ", Part_2___Subfolder_path, "\n")


## Remove All Single Sample Reference_* Columns (Do Not Remove Reference_AllSamples)

In [None]:
# Every column whose name starts with "Reference_" (this includes
# Reference_AllSamples itself)
all_columns <- names(merged_data_filtered)
ref_columns <- all_columns[grepl("^Reference_", all_columns)]

# Keep all non-reference columns in their current order and put
# Reference_AllSamples back as the last column
non_reference_columns <- unique(all_columns[!(all_columns %in% ref_columns)])
merged_data_filtered <- merged_data_filtered[, c(non_reference_columns, "Reference_AllSamples"), drop = FALSE]

# Preview the result
head(merged_data_filtered)


In [None]:
# Set the subfolder name for Part 3
Part_3___Subfolder <- "Part_3___RemovedAllSingleSampleReference_Columns"

# Create the full path for the subfolder
Part_3___Subfolder_path <- file.path(output_directory, Part_3___Subfolder)

# Create the subfolder if it doesn't exist.
# dir.exists() tests specifically for a directory, and recursive = TRUE
# (as already used for Part 4) also creates output_directory when missing.
if (!dir.exists(Part_3___Subfolder_path)) {
  dir.create(Part_3___Subfolder_path, recursive = TRUE)
}

# Identify Reference_* columns in merged_data_filtered
ref_columns_part_3 <- grep("^Reference_", names(merged_data_filtered), value = TRUE)

# Build the Part 3 table: all non-reference columns followed by
# Reference_AllSamples. If the single-sample columns were already removed in
# the previous step this leaves the column set unchanged.
merged_data_filtered_part_3 <- merged_data_filtered[, c(setdiff(names(merged_data_filtered), ref_columns_part_3), "Reference_AllSamples"), drop = FALSE]


In [None]:
# Strip the display quotes so the prefix can be used in a file name
Part_3___Prefix <- gsub('"', '', Part_3___Prefix)

# Output file paths: <Part 3 subfolder>/<prefix><stage name>.<extension>
tsv_file_part_3 <- file.path(
  Part_3___Subfolder_path,
  paste0(Part_3___Prefix, "RemovedAllSingleSampleReference_Columns.tsv")
)
csv_file_part_3 <- file.path(
  Part_3___Subfolder_path,
  paste0(Part_3___Prefix, "RemovedAllSingleSampleReference_Columns.csv")
)

# Tab-separated copy: unquoted, no row names
write.table(
  x = merged_data_filtered_part_3,
  file = tsv_file_part_3,
  sep = '\t',
  quote = FALSE,
  row.names = FALSE
)

# Comma-separated copy: no row names
write.csv(x = merged_data_filtered_part_3, file = csv_file_part_3, row.names = FALSE)

# Tell the user where the Part 3 files went
cat("*Data (Part 3) has been successfully written to the subfolder:* ", Part_3___Subfolder_path, "\n")


## Remove Consequences That Are `splice_donor_variant`s or `splice_acceptor_variant`s

In [None]:
# Consequence terms to exclude from the Part 4 table
splice_site_terms <- c("splice_donor_variant", "splice_acceptor_variant")

# Flag the rows carrying one of those terms and keep everything else
is_splice_site <- merged_data_filtered_part_3$Consequence %in% splice_site_terms
merged_data_filtered_part_4 <- merged_data_filtered_part_3[!is_splice_site, ]

# Preview the table without splice donor/acceptor variants
head(merged_data_filtered_part_4)

In [None]:
# Report how many rows are left once the splice donor/acceptor variants are gone
remaining_row_count <- nrow(merged_data_filtered_part_4)
cat("Removing Splice Donor And Acceptor Variants results in the dataframe having", remaining_row_count, "rows.\n")

In [None]:
# Name and full path of the Part 4 output folder
Part_4___Subfolder_name <- "Part_4___Removing_SpliceDonorAndAcceptorSites"
Part_4___Subfolder_path <- file.path(output_directory, Part_4___Subfolder_name)

# Create the folder, and any missing parent folders, unless the path
# already exists
part_4_path_missing <- !file.exists(Part_4___Subfolder_path)
if (part_4_path_missing) {
  dir.create(Part_4___Subfolder_path, recursive = TRUE)
}


In [None]:
# Strip the display quotes so the prefix can be used in a file name
Part_4___Prefix <- gsub('"', '', Part_4___Prefix)

# Output file paths: <Part 4 subfolder>/<prefix><stage name>.<extension>
tsv_file_part_4 <- file.path(
  Part_4___Subfolder_path,
  paste0(Part_4___Prefix, "RemovingSpliceDonorAndAcceptorSites.tsv")
)
csv_file_part_4 <- file.path(
  Part_4___Subfolder_path,
  paste0(Part_4___Prefix, "RemovingSpliceDonorAndAcceptorSites.csv")
)

# Tab-separated copy: unquoted, no row names
write.table(
  x = merged_data_filtered_part_4,
  file = tsv_file_part_4,
  sep = '\t',
  quote = FALSE,
  row.names = FALSE
)

# Comma-separated copy: no row names
write.csv(x = merged_data_filtered_part_4, file = csv_file_part_4, row.names = FALSE)

# Tell the user where the Part 4 files went
cat("*Data (Part 4) has been successfully written to the subfolder:* ", Part_4___Subfolder_path, "\n")


## Session Information

In [None]:
# Record the R version, platform and loaded package versions used for this
# run, so the results can be reproduced later.
sessionInfo()