diff --git a/README.md b/README.md index abc5879..269d01a 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The following additional libraries need to be installed in order to use the prog + scipy (v1.10.1) + scikit-learn (v1.3.0) -If a newer scikit-learn version is used the models provided should still work (they were created using the version v0.19.1 and the import was tested with v0.21.3 and v0.22.2). +If a newer scikit-learn version is used, it is advised to create a new model with that scikit-learn version. ## Annotation resources and tools diff --git a/aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py b/aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py index 66c314d..b159325 100644 --- a/aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py +++ b/aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py @@ -5,27 +5,31 @@ import random +RANDOM_SEED = 14038 + logger = logging.getLogger(__name__) def write_data_information_to_file(input_data, outfile, ref_sequence, header): - data_grouped = [group for key, group in input_data.groupby("CHROM")] + data_grouped = [group for key, group in input_data.groupby("#CHROM")] in_fasta = pysam.FastaFile(ref_sequence) - random.seed(14038) + random.seed(RANDOM_SEED) for line in header: if line.startswith("#CHROM"): outfile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n") + else: outfile.write(line) for group in data_grouped: - if "chr" in str(group["CHROM"].iloc[0]): - chrom_id = str(group["CHROM"].iloc[0]) + if "chr" in str(group["#CHROM"].iloc[0]): + chrom_id = str(group["#CHROM"].iloc[0]) + else: - chrom_id = "chr" + str(group["CHROM"].iloc[0]) + chrom_id = "chr" + str(group["#CHROM"].iloc[0]) - for row in group.itertuples(): + for row in group.itertuples(index=False): # make sure that the window starts at the beginning of the reference sequence window_start = max(int(row.POS) - 3, 1) @@ -37,47 +41,51 @@ def write_data_information_to_file(input_data, outfile, ref_sequence, header): alt_variant = "" if (extended_ref_seq[i] == "A") or (extended_ref_seq[i] == "T"): alt_variant = random.choice(["G", "C"]) + elif (extended_ref_seq[i] == "G") or (extended_ref_seq[i] == "C"): alt_variant = random.choice(["A", "T"]) + elif (extended_ref_seq[i] == "N"): logger.debug("Reference base was skipped because it was 'N'!") continue + else: logger.error("The given reference sequence seems to be corrupted!") - outfile.write(str(row.CHROM).strip() + "\t" + str(window_start + i + 1).strip() + "\t" + "." + "\t" + str(extended_ref_seq[i]).strip() + "\t" + str(alt_variant).strip() + "\t" + "." + "\t" + "." + "\t" + str(row.INFO).strip() + "\n") + outfile.write(str(chrom_id).strip() + "\t" + str(window_start + i + 1).strip() + "\t" + "." + "\t" + str(extended_ref_seq[i]).strip() + "\t" + str(alt_variant).strip() + "\t" + "." + "\t" + "."
+ "\t" + str(row.INFO).strip() + "\n") def import_vcf_data(in_data): header_line = "" comment_lines = [] + # WARNING: reset file pointer to begin after reading header to start processing from the start again (is done by closing the file before reading again) with open(in_data, "r") as input_vcf: # extract header from vcf file for line in input_vcf: if line.strip().startswith("##"): comment_lines.append(line) - if line.strip().startswith("#CHROM"): + + elif line.strip().startswith("#CHROM"): header_line = line.strip() comment_lines.append(line) break # now the variant entries are coming + else: continue if header_line == "": logger.error("The VCF seems to be corrupted, missing header line!") - # reset file pointer to begin reading at the beginning (is done by closing the file before reading again) - data = pd.read_csv(in_data, names=header_line.split("\t"), sep="\t", comment="#", low_memory=False) data.fillna(".", inplace=True) - data = data.rename(columns={"#CHROM": "CHROM"}) return data, comment_lines def convert_indel_vcf_to_expanded_indel_vcf(in_data, out_data, ref_folder): input_data, header = import_vcf_data(in_data) + with open(out_data, "w", newline="") as outfile: write_data_information_to_file(input_data, outfile, ref_folder, header) diff --git a/aidiva/helper_modules/convert_vcf_to_csv.py b/aidiva/helper_modules/convert_vcf_to_csv.py index fc8848b..e3e6077 100644 --- a/aidiva/helper_modules/convert_vcf_to_csv.py +++ b/aidiva/helper_modules/convert_vcf_to_csv.py @@ -2,6 +2,7 @@ import logging import multiprocessing as mp import numpy as np +import os import pandas as pd import tempfile @@ -77,11 +78,11 @@ "RF_SCORE", "SpliceAI", "oe_lof", + "oe_mis", + "oe_syn", "SegDup", "SimpleRepeats", "CLINVAR_DETAILS", - #"DETAILS", - #"CLASS", "HGMD_CLASS", "HGMD_RANKSCORE", "OMIM", @@ -96,17 +97,20 @@ def reformat_vcf_file_and_read_into_pandas_and_extract_header(filepath): with open(filepath, "r") as vcf_file_to_reformat, tempfile.NamedTemporaryFile(mode="w+") as tmp: # make sure that there are no unwanted linebreaks in the variant entries - tmp.write(vcf_file_to_reformat.read().replace(r"(\n(?!((((((chr)?[0-9]{1,2}|(chr)?[xXyY]{1}|(chr)?(MT|mt){1})\t)(.+\t){6,}(.+(\n|\Z))))|(#{1,2}.*(\n|\Z))|(\Z))))", "")) - tmp.seek(0) + ## TODO: remove regular expression, should not be needed anymore + #tmp.write(vcf_file_to_reformat.read().replace(r"(\n(?!((((((chr)?[0-9]{1,2}|(chr)?[xXyY]{1}|(chr)?(MT|mt){1})\t)(.+\t){6,}(.+(\n|\Z))))|(#{1,2}.*(\n|\Z))|(\Z))))", "")) + #tmp.seek(0) # extract header from vcf file - for line in tmp: + for line in vcf_file_to_reformat: if line.strip().startswith("##"): comment_lines.append(line.strip()) - if line.strip().startswith("#CHROM"): + + elif line.strip().startswith("#CHROM"): header_line = line.strip() comment_lines.append(header_line) break # now the variant entries are coming + else: continue @@ -116,8 +120,6 @@ def reformat_vcf_file_and_read_into_pandas_and_extract_header(filepath): vcf_header = header_line.strip().split("\t") vcf_as_dataframe = pd.read_csv(tmp.name, names=vcf_header, sep="\t", comment="#", low_memory=False) - #vcf_as_dataframe = vcf_as_dataframe.rename(columns={"#CHROM": "CHROM"}) - return comment_lines, vcf_as_dataframe @@ -171,6 +173,8 @@ def extract_columns(cell, process_indel): spliceAI = np.nan spliceAI_raw = [] oe_lof = np.nan + oe_mis = np.nan + oe_syn = np.nan simpleRepeat = "" clinvar_details = "" hgmd_class = "" @@ -179,9 +183,6 @@ def extract_columns(cell, process_indel): annotation = "" repeat_masker = "" - #details = "" - 
#class_orig = "" - for field in info_fields: field_splitted = field.split("=") @@ -202,132 +203,154 @@ def extract_columns(cell, process_indel): elif field_splitted[0] == "FATHMM_XF": if "&" in field_splitted[1]: fathmm_xf = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: fathmm_xf = float(field_splitted[1]) elif field_splitted[0] == "CONDEL": if "&" in field_splitted[1]: condel = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: condel = float(field_splitted[1]) elif field_splitted[0] == "EIGEN_PHRED": if "&" in field_splitted[1]: eigen_phred = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: eigen_phred = float(field_splitted[1]) elif field_splitted[0] == "MutationAssessor": if "&" in field_splitted[1]: mutation_assessor = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: mutation_assessor = float(field_splitted[1]) elif field_splitted[0] == "REVEL": if "&" in field_splitted[1]: revel = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: revel = float(field_splitted[1]) elif field_splitted[0] == "phyloP_primate": if "&" in field_splitted[1]: phyloP_primate = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: phyloP_primate = float(field_splitted[1]) elif field_splitted[0] == "phyloP_mammal": if "&" in field_splitted[1]: phyloP_mammal = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: phyloP_mammal = float(field_splitted[1]) elif field_splitted[0] == "phyloP_vertebrate": if "&" in field_splitted[1]: phyloP_vertebrate = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: phyloP_vertebrate = float(field_splitted[1]) elif field_splitted[0] == "phastCons_primate": if "&" in field_splitted[1]: phastCons_primate = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: phastCons_primate = float(field_splitted[1]) elif field_splitted[0] == "phastCons_mammal": if "&" in field_splitted[1]: phastCons_mammal = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: phastCons_mammal = float(field_splitted[1]) elif field_splitted[0] == "phastCons_vertebrate": if "&" in field_splitted[1]: phastCons_vertebrate = max([float(value) for value in field_splitted[1].split("&") if (value != "." 
and value != "nan" and value !="")], default=np.nan) + else: phastCons_vertebrate = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_Hom": if "&" in field_splitted[1]: gnomAD_hom = float(field_splitted[1].split("&")[0]) + else: gnomAD_hom = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_AN": if "&" in field_splitted[1]: gnomAD_an = float(field_splitted[1].split("&")[0]) + else: gnomAD_an = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_AFR_AF": if "&" in field_splitted[1]: gnomAD_afr_af = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: gnomAD_afr_af = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_AMR_AF": if "&" in field_splitted[1]: gnomAD_amr_af = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: gnomAD_amr_af = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_EAS_AF": if "&" in field_splitted[1]: gnomAD_eas_af = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: gnomAD_eas_af = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_NFE_AF": if "&" in field_splitted[1]: gnomAD_nfe_af = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: gnomAD_nfe_af = float(field_splitted[1]) elif field_splitted[0] == "gnomAD_SAS_AF": if "&" in field_splitted[1]: gnomAD_sas_af = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: gnomAD_sas_af = float(field_splitted[1]) elif field_splitted[0] == "CAPICE": if "&" in field_splitted[1]: capice = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: capice = float(field_splitted[1]) elif field_splitted[0] == "CADD": if "&" in field_splitted[1]: cadd = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: cadd = float(field_splitted[1]) elif field_splitted[0] == "ADA_SCORE": if "&" in field_splitted[1]: ada = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: ada = float(field_splitted[1]) elif field_splitted[0] == "RF_SCORE": if "&" in field_splitted[1]: rf = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: rf = float(field_splitted[1]) @@ -338,6 +361,14 @@ def extract_columns(cell, process_indel): elif field_splitted[0] == "oe_lof": oe_lof = min([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + ## currently not used + elif field_splitted[0] == "oe_mis": + oe_mis = min([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + + ## currently not used + elif field_splitted[0] == "oe_syn": + oe_syn = min([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + elif field_splitted[0] == "SegDup": segDup = max([float(value) for value in field_splitted[1].split("&") if (value != "." 
and value != "nan" and value !="")], default=np.nan) @@ -347,18 +378,13 @@ def extract_columns(cell, process_indel): elif field_splitted[0] == "CLINVAR_DETAILS": clinvar_details = field_splitted[1] - #elif field_splitted[0] == "DETAILS": - # details = field_splitted[1] - - #elif field_splitted[0] == "CLASS": - # class_orig = field_splitted[1] - elif field_splitted[0] == "HGMD_CLASS": hgmd_class = field_splitted[1] elif field_splitted[0] == "HGMD_RANKSCORE": if "&" in field_splitted[1]: hgmd_rankscore = max([float(value) for value in field_splitted[1].split("&") if (value != "." and value != "nan" and value !="")], default=np.nan) + else: hgmd_rankscore = float(field_splitted[1]) @@ -383,28 +409,40 @@ def extract_columns(cell, process_indel): max_af = max([gnomAD_afr_af, gnomAD_amr_af, gnomAD_eas_af, gnomAD_nfe_af, gnomAD_sas_af], default=np.nan) if process_indel: - extracted_columns = [indel_ID, annotation, fathmm_xf, condel, eigen_phred, mutation_assessor, revel, phyloP_primate, phyloP_mammal, phyloP_vertebrate, phastCons_primate, phastCons_mammal, phastCons_vertebrate, gnomAD_homAF, gnomAD_afr_af, gnomAD_amr_af, gnomAD_eas_af, gnomAD_nfe_af, gnomAD_sas_af, max_af, capice, cadd, oe_lof, segDup, simpleRepeat, ada, rf, spliceAI, repeat_masker, clinvar_details, hgmd_class, hgmd_rankscore, omim_details] #, details, class_orig] #, abb_score] + extracted_columns = [indel_ID, annotation, fathmm_xf, condel, eigen_phred, mutation_assessor, revel, phyloP_primate, phyloP_mammal, phyloP_vertebrate, phastCons_primate, phastCons_mammal, phastCons_vertebrate, gnomAD_homAF, gnomAD_afr_af, gnomAD_amr_af, gnomAD_eas_af, gnomAD_nfe_af, gnomAD_sas_af, max_af, capice, cadd, oe_lof, oe_mis, oe_syn, segDup, simpleRepeat, ada, rf, spliceAI, repeat_masker, clinvar_details, hgmd_class, hgmd_rankscore, omim_details] #, details, class_orig] #, abb_score] else: - extracted_columns = [annotation, fathmm_xf, condel, eigen_phred, mutation_assessor, revel, phyloP_primate, phyloP_mammal, phyloP_vertebrate, phastCons_primate, phastCons_mammal, phastCons_vertebrate, gnomAD_homAF, gnomAD_afr_af, gnomAD_amr_af, gnomAD_eas_af, gnomAD_nfe_af, gnomAD_sas_af, max_af, capice, cadd, oe_lof, segDup, simpleRepeat, ada, rf, spliceAI, repeat_masker, clinvar_details, hgmd_class, hgmd_rankscore, omim_details] #, details, class_orig, abb_score] + extracted_columns = [annotation, fathmm_xf, condel, eigen_phred, mutation_assessor, revel, phyloP_primate, phyloP_mammal, phyloP_vertebrate, phastCons_primate, phastCons_mammal, phastCons_vertebrate, gnomAD_homAF, gnomAD_afr_af, gnomAD_amr_af, gnomAD_eas_af, gnomAD_nfe_af, gnomAD_sas_af, max_af, capice, cadd, oe_lof, oe_mis, oe_syn, segDup, simpleRepeat, ada, rf, spliceAI, repeat_masker, clinvar_details, hgmd_class, hgmd_rankscore, omim_details] #, details, class_orig, abb_score] return extracted_columns -def extract_vep_annotation(cell, annotation_header): +def extract_vep_annotation(cell, annotation_header, canonical_transcripts=[]): annotation_fields = str(cell["CSQ"]).split(",") new_cols = [] - # choose the most severe annotation variant - # if new consequence terms were added to the database that are not yet handled from aiDIVA use default consequence "unknown" with lowest severity value + + if (len(annotation_fields) >= 1) and (annotation_fields[0] != ""): - consequences = [min([VARIANT_CONSEQUENCES.get(consequence) if consequence in VARIANT_CONSEQUENCES.keys() else VARIANT_CONSEQUENCES.get("unknown") for consequence in field.split("|")[annotation_header.index("Consequence")].split("&")]) 
for field in annotation_fields] - target_index = min(enumerate(consequences), key=itemgetter(1))[0] - new_cols = annotation_fields[target_index].strip().split("|") + + if canonical_transcripts: + for annotation in annotation_fields: + transcript_index = annotation_header.index("Feature") + + if annotation.split("|")[transcript_index] in canonical_transcripts: + new_cols = annotation.split("|") + break + + if new_cols == []: + # choose the most severe annotation variant + # if new consequence terms were added to the database that are not yet handled from aiDIVA use default consequence "unknown" with lowest severity value + consequences = [min([VARIANT_CONSEQUENCES.get(consequence) if consequence in VARIANT_CONSEQUENCES.keys() else VARIANT_CONSEQUENCES.get("unknown") for consequence in field.split("|")[annotation_header.index("Consequence")].split("&")]) for field in annotation_fields] + target_index = min(enumerate(consequences), key=itemgetter(1))[0] + new_cols = annotation_fields[target_index].strip().split("|") else: # can happen with the expanded InDels - logger.warn("Empty VEP annotation!") + logger.warning("Empty VEP annotation!") return new_cols @@ -422,18 +460,23 @@ def extract_sample_information(row, sample, sample_header=None): splitted_sample_information = sample_dict[sample_id].split("|") if splitted_sample_information[0] == "wt": sample_information.append("0/0") + elif splitted_sample_information[0] == "hom": sample_information.append("1/1") + elif splitted_sample_information[0] == "het": sample_information.append("0/1") + else: logger.warning("Genotype not recognized! (%s)" % (splitted_sample_information[0])) sample_information.append(splitted_sample_information[1]) sample_information.append(splitted_sample_information[2]) sample_information.append(sample_id + "=" + sample_dict[sample_id]) + else: logger.error("Format is MULTI but no sample_header is given!") + else: format_entries = str(row["FORMAT"]).strip().split(":") sample_fields = str(row[sample + ".full"]).strip().split(":") @@ -445,23 +488,27 @@ def extract_sample_information(row, sample, sample_header=None): if "GT" in format_entries: sample_gt_information = sample_fields[format_entries.index("GT")] + else: sample_gt_information = "./." if "DP" in format_entries: sample_dp_information = sample_fields[format_entries.index("DP")] + else: sample_dp_information = "." if "AD" in format_entries: sample_ref_information = sample_fields[format_entries.index("AD")].split(",")[0] sample_alt_information = sample_fields[format_entries.index("AD")].split(",")[1] + else: sample_ref_information = "." sample_alt_information = "." if "GQ" in format_entries: sample_gq_information = sample_fields[format_entries.index("GQ")] + else: sample_gq_information = "." 
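Note on the canonical-transcript handling added in extract_vep_annotation() above: the selection order is (1) prefer the first VEP CSQ block whose Feature (transcript ID) appears in the supplied canonical transcript list, and (2) otherwise fall back to the block with the most severe consequence. The following standalone Python sketch is illustrative only; pick_annotation and the severity values are placeholders, while the real code uses the module's VARIANT_CONSEQUENCES mapping (lower value = more severe).

# illustrative sketch, not part of the patch
VARIANT_CONSEQUENCES = {"stop_gained": 4, "missense_variant": 11, "synonymous_variant": 26, "unknown": 40}  # placeholder severities

def pick_annotation(annotation_fields, annotation_header, canonical_transcripts=None):
    feature_index = annotation_header.index("Feature")
    consequence_index = annotation_header.index("Consequence")

    # 1) prefer the first CSQ block whose transcript ID is canonical (e.g. MANE Select)
    for field in (annotation_fields if canonical_transcripts else []):
        if field.split("|")[feature_index] in canonical_transcripts:
            return field.split("|")

    # 2) otherwise take the block with the most severe (lowest-valued) consequence
    severities = [min(VARIANT_CONSEQUENCES.get(cons, VARIANT_CONSEQUENCES["unknown"])
                      for cons in field.split("|")[consequence_index].split("&"))
                  for field in annotation_fields]
    return annotation_fields[severities.index(min(severities))].split("|")

# example with two CSQ blocks of the form "Consequence|Feature"
blocks = ["missense_variant|ENST00000000001", "synonymous_variant|ENST00000000002"]
print(pick_annotation(blocks, ["Consequence", "Feature"], canonical_transcripts=["ENST00000000002"]))
# -> ['synonymous_variant', 'ENST00000000002'] (the canonical transcript wins over severity)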
@@ -469,6 +516,7 @@ def extract_sample_information(row, sample, sample_header=None): divisor = (int(sample_ref_information) + int(sample_alt_information)) if divisor == 0: sample_af_information = 0 + else: sample_af_information = (int(sample_alt_information) / divisor) @@ -568,28 +616,22 @@ def convert_variant_representation(row): def add_INFO_fields_to_dataframe(process_indel, expanded_indel, vcf_as_dataframe): - indel_annotation_columns = ["INDEL_ID", "CSQ", "FATHMM_XF", "CONDEL", "EIGEN_PHRED", "MutationAssessor", "REVEL", "phyloP_primate", "phyloP_mammal", "phyloP_vertebrate", "phastCons_primate", "phastCons_mammal", "phastCons_vertebrate", "homAF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_EAS_AF", "gnomAD_NFE_AF", "gnomAD_SAS_AF", "MAX_AF", "CAPICE", "CADD_PHRED", "oe_lof", "segmentDuplication", "simpleRepeat", "ada_score", "rf_score", "SpliceAI", "REPEATMASKER", "CLINVAR_DETAILS", "HGMD_CLASS", "HGMD_RANKSCORE", "OMIM"] - snp_annotation_columns = ["CSQ", "FATHMM_XF", "CONDEL", "EIGEN_PHRED", "MutationAssessor", "REVEL", "phyloP_primate", "phyloP_mammal", "phyloP_vertebrate", "phastCons_primate", "phastCons_mammal", "phastCons_vertebrate", "homAF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_EAS_AF", "gnomAD_NFE_AF", "gnomAD_SAS_AF", "MAX_AF", "CAPICE", "CADD_PHRED", "oe_lof", "segmentDuplication", "simpleRepeat", "ada_score", "rf_score", "SpliceAI", "REPEATMASKER", "CLINVAR_DETAILS", "HGMD_CLASS", "HGMD_RANKSCORE", "OMIM"] + indel_annotation_columns = ["INDEL_ID", "CSQ", "FATHMM_XF", "CONDEL", "EIGEN_PHRED", "MutationAssessor", "REVEL", "phyloP_primate", "phyloP_mammal", "phyloP_vertebrate", "phastCons_primate", "phastCons_mammal", "phastCons_vertebrate", "homAF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_EAS_AF", "gnomAD_NFE_AF", "gnomAD_SAS_AF", "MAX_AF", "CAPICE", "CADD_PHRED", "oe_lof", "oe_mis", "oe_syn", "segmentDuplication", "simpleRepeat", "ada_score", "rf_score", "SpliceAI", "REPEATMASKER", "CLINVAR_DETAILS", "HGMD_CLASS", "HGMD_RANKSCORE", "OMIM"] + snp_annotation_columns = ["CSQ", "FATHMM_XF", "CONDEL", "EIGEN_PHRED", "MutationAssessor", "REVEL", "phyloP_primate", "phyloP_mammal", "phyloP_vertebrate", "phastCons_primate", "phastCons_mammal", "phastCons_vertebrate", "homAF", "gnomAD_AFR_AF", "gnomAD_AMR_AF", "gnomAD_EAS_AF", "gnomAD_NFE_AF", "gnomAD_SAS_AF", "MAX_AF", "CAPICE", "CADD_PHRED", "oe_lof", "oe_mis", "oe_syn", "segmentDuplication", "simpleRepeat", "ada_score", "rf_score", "SpliceAI", "REPEATMASKER", "CLINVAR_DETAILS", "HGMD_CLASS", "HGMD_RANKSCORE", "OMIM"] if process_indel: if not expanded_indel: vcf_as_dataframe["GSVAR_VARIANT"] = vcf_as_dataframe.apply(lambda row: convert_variant_representation(row), axis=1) vcf_as_dataframe[indel_annotation_columns] = vcf_as_dataframe["INFO"].apply(lambda x: pd.Series(extract_columns(x, process_indel))) vcf_as_dataframe["IS_INDEL"] = 1 - #vcf_as_dataframe["HIGH_IMPACT"] = vcf_as_dataframe.apply(lambda row: specify_impact_class(row), axis=1) - #vcf_as_dataframe["MOST_SEVERE_CONSEQUENCE"] = vcf_as_dataframe.apply(lambda row: get_most_severe_consequence(row), axis=1) else: vcf_as_dataframe[indel_annotation_columns] = vcf_as_dataframe["INFO"].apply(lambda x: pd.Series(extract_columns(x, process_indel))) - #vcf_as_dataframe["HIGH_IMPACT"] = vcf_as_dataframe.apply(lambda row: specify_impact_class(row), axis=1) - #vcf_as_dataframe["MOST_SEVERE_CONSEQUENCE"] = vcf_as_dataframe.apply(lambda row: get_most_severe_consequence(row), axis=1) else: vcf_as_dataframe["GSVAR_VARIANT"] = vcf_as_dataframe.apply(lambda row: 
convert_variant_representation(row), axis=1) vcf_as_dataframe[snp_annotation_columns] = vcf_as_dataframe["INFO"].apply(lambda x: pd.Series(extract_columns(x, process_indel))) vcf_as_dataframe["IS_INDEL"] = 0 - #vcf_as_dataframe["HIGH_IMPACT"] = vcf_as_dataframe.apply(lambda row: specify_impact_class(row), axis=1) - #vcf_as_dataframe["MOST_SEVERE_CONSEQUENCE"] = vcf_as_dataframe.apply(lambda row: get_most_severe_consequence(row), axis=1) vcf_as_dataframe = vcf_as_dataframe.drop(columns=["INFO"]) @@ -620,12 +662,13 @@ def specify_impact_class(row): if variant_impact == "HIGH": return 1 + else: return 0 -def add_VEP_annotation_to_dataframe(annotation_header, vcf_as_dataframe): - vcf_as_dataframe[annotation_header] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_vep_annotation(x, annotation_header)), axis=1) +def add_VEP_annotation_to_dataframe(annotation_header, canonical_transcripts, vcf_as_dataframe): + vcf_as_dataframe[annotation_header] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_vep_annotation(x, annotation_header, canonical_transcripts)), axis=1) vcf_as_dataframe = vcf_as_dataframe.drop(columns=["CSQ"]) vcf_as_dataframe = vcf_as_dataframe.rename(columns={"am_class": "ALPHA_MISSENSE_CLASS"}) vcf_as_dataframe = vcf_as_dataframe.rename(columns={"am_pathogenicity": "ALPHA_MISSENSE_SCORE"}) @@ -645,6 +688,7 @@ def add_sample_information_to_dataframe(sample_ids, sample_header, vcf_as_datafr sample_header_multi.append("DP." + sample_id) sample_header_multi.append("ALT." + sample_id) sample_header_multi.append(sample_id + ".full") + vcf_as_dataframe[sample_header_multi] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample, sample_header)), axis=1) else: @@ -658,11 +702,22 @@ def add_sample_information_to_dataframe(sample_ids, sample_header, vcf_as_datafr return vcf_as_dataframe -def convert_vcf_to_pandas_dataframe(input_file, process_indel, expanded_indel, num_cores): +def convert_vcf_to_pandas_dataframe(input_file, process_indel, expanded_indel, transcript_file_path, num_cores): header, vcf_as_dataframe = reformat_vcf_file_and_read_into_pandas_and_extract_header(input_file) logger.debug("Convert VCF file") + canonical_transcripts = [] + + if os.path.isfile(transcript_file_path): + with open(transcript_file_path, "r") as transcript_file: + for line in transcript_file: + if line.startswith("#"): + continue + + else: + canonical_transcripts.append(line.strip()) + sample_ids = [] # FORMAT column has index 8 (counted from 0) and sample columns follow afterwards (sample names are unique) # Check if FORMAT column exists @@ -676,7 +731,7 @@ def convert_vcf_to_pandas_dataframe(input_file, process_indel, expanded_indel, n if not vcf_as_dataframe.empty: vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_INFO_fields_to_dataframe, process_indel, expanded_indel), num_cores) - vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_VEP_annotation_to_dataframe, annotation_header), num_cores) + vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_VEP_annotation_to_dataframe, annotation_header, canonical_transcripts), num_cores) if len(vcf_as_dataframe.columns) > 8: if "FORMAT" in vcf_as_dataframe.columns: @@ -728,6 +783,7 @@ def write_vcf_to_csv(vcf_as_dataframe, out_file): parser.add_argument("--out_data", type=str, dest="out_data", metavar="output.csv", required=True, help="CSV file containing the converted VCF file\n") parser.add_argument("--indel", action="store_true", 
required=False, help="Flag to indicate whether the file to convert consists of indel variants or not.\n") parser.add_argument("--expanded", action="store_true", required=False, help="Flag to indicate whether the file to convert consists of expanded indel variants.\n") + parser.add_argument("--transcript_ids", type=str, dest="transcript_ids", metavar="mane_v13_transcripts_ids.txt", required=True, help="File specifying the ensembl transcripts that should be used (e.g. MANE-SELECT).\n") parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use.") args = parser.parse_args() @@ -737,5 +793,8 @@ def write_vcf_to_csv(vcf_as_dataframe, out_file): else: num_cores = 1 - vcf_as_dataframe = convert_vcf_to_pandas_dataframe(args.in_data, args.indel, args.expanded, num_cores) + # TODO: add as parameter if it works + #transcript_file_path = "/mnt/storage2/users/ahboced1/phd_project/mane_grch38_v13_transcript_ids.txt" + + vcf_as_dataframe = convert_vcf_to_pandas_dataframe(args.in_data, args.indel, args.expanded, args.transcript_ids, num_cores) write_vcf_to_csv(vcf_as_dataframe, args.out_data) diff --git a/aidiva/helper_modules/create_result_vcf.py b/aidiva/helper_modules/create_result_vcf.py index efe5c5f..a6812c9 100644 --- a/aidiva/helper_modules/create_result_vcf.py +++ b/aidiva/helper_modules/create_result_vcf.py @@ -10,13 +10,14 @@ def write_header(out_file, assembly_build, single): out_file.write("##fileformat=VCFv4.2\n") if not single: - out_file.write("##INFO=\n") - out_file.write("##INFO=\n") - out_file.write("##INFO=\n") + out_file.write("##INFO=\n") + out_file.write("##INFO=\n") + out_file.write("##INFO=\n") + else: - out_file.write("##INFO=\n") - out_file.write("##INFO=\n") - out_file.write("##INFO=\n") + out_file.write("##INFO=\n") + out_file.write("##INFO=\n") + out_file.write("##INFO=\n") if assembly_build == "GRCh38": out_file.write("##reference=GRCh38.fa\n") @@ -44,6 +45,7 @@ def write_header(out_file, assembly_build, single): out_file.write("##contig=\n") out_file.write("##contig=\n") out_file.write("##contig=\n") + elif assembly_build == "GRCh37": out_file.write("##reference=GRCh37.fa\n") out_file.write("##contig=\n") @@ -70,6 +72,7 @@ def write_header(out_file, assembly_build, single): out_file.write("##contig=\n") out_file.write("##contig=\n") out_file.write("##contig=\n") + else: logger.error(f"Unsupported assembly build: {assembly_build}") @@ -82,112 +85,136 @@ def write_result_vcf(input_data, vcf_file, assembly_build, single): input_data = input_data.sort_values(["CHROM", "POS"], ascending=[True, True]) input_data = input_data.reset_index(drop=True) colnames = input_data.columns - + write_header(out_file, assembly_build, single) for row in input_data.itertuples(): if ("AIDIVA_SCORE" in colnames): if (str(row.AIDIVA_SCORE) == "nan"): aidiva_score = "." + else: aidiva_score = str(row.AIDIVA_SCORE) + else: aidiva_score = "." if ("FINAL_AIDIVA_SCORE" in colnames): if (str(row.FINAL_AIDIVA_SCORE) == "nan"): final_aidiva_score = "." + else: final_aidiva_score = str(row.FINAL_AIDIVA_SCORE) + else: final_aidiva_score = "." if ("HPO_RELATEDNESS" in colnames): if (str(row.HPO_RELATEDNESS) == "nan"): hpo_relatedness = "." + else: hpo_relatedness = str(row.HPO_RELATEDNESS) + else: hpo_relatedness = "." if ("HPO_RELATEDNESS_INTERACTING" in colnames): if (str(row.HPO_RELATEDNESS_INTERACTING) == "nan"): hpo_relatedness_interacting = "." 
+ else: hpo_relatedness_interacting = str(row.HPO_RELATEDNESS_INTERACTING) + else: hpo_relatedness_interacting = "." - + if ("FILTER_PASSED" in colnames): if (str(row.FILTER_PASSED) == "nan"): filter_passed = "." + else: filter_passed = str(row.FILTER_PASSED) + else: filter_passed = "." - + if ("DOMINANT" in colnames): if (str(row.DOMINANT) == "nan"): dominant = "." + else: dominant = str(row.DOMINANT) + else: dominant = "." - + if ("DOMINANT_DENOVO" in colnames): if (str(row.DOMINANT_DENOVO) == "nan"): dominant_denovo = "." + else: dominant_denovo = str(row.DOMINANT_DENOVO) + else: dominant_denovo = "." - + if ("RECESSIVE" in colnames): if (str(row.RECESSIVE) == "nan"): recessive = "." + else: recessive = str(row.RECESSIVE) + else: recessive = "." - + if ("XLINKED" in colnames): if (str(row.XLINKED) == "nan"): xlinked = "." + else: xlinked = str(row.XLINKED) + else: xlinked = "." - + if ("COMPOUND" in colnames): if (str(row.COMPOUND) == "nan"): compound = "." + else: compound = str(row.COMPOUND) + else: compound = "." - + if ("INHERITANCE" in colnames): if (str(row.INHERITANCE) == "nan") or (str(row.INHERITANCE) == ""): inheritance_comment = "." + else: inheritance_comment = str(row.INHERITANCE) + else: inheritance_comment = "." if not single: info_entry = "AIDIVA=" + aidiva_score + "," + final_aidiva_score + "," + hpo_relatedness + "," + hpo_relatedness_interacting + "," + filter_passed + ";AIDIVA_INHERITANCE=" + dominant + "," + dominant_denovo + "," + recessive + "," + xlinked + "," + compound + ";AIDIVA_INHERITANCE_COMMENT=" + inheritance_comment + else: info_entry = "AIDIVA=" + aidiva_score + "," + final_aidiva_score + "," + hpo_relatedness + "," + hpo_relatedness_interacting + "," + filter_passed + ";AIDIVA_INHERITANCE=" + recessive + "," + compound + ";AIDIVA_INHERITANCE_COMMENT=" + inheritance_comment out_file.write(str(row.CHROM).strip() + "\t" + str(row.POS) + "\t" + "." + "\t" + str(row.REF) + "\t" + str(row.ALT) + "\t" + "." + "\t" + "." 
+ "\t" + info_entry + "\n") + else: write_header(out_file, assembly_build, single) - logger.warn(f"Wrote an empty result file!") + logger.warning(f"Wrote an empty result file!") if __name__=="__main__": parser = argparse.ArgumentParser() - parser.add_argument("--in", type=str, dest="in_data", metavar="in.csv", required=True, help="CSV file with the results from the AIdiva run that should be converted to a VCF file\n") - parser.add_argument("--out", type=str, dest="out_data", metavar="out.vcf", required=True, help="VCF file witht the results of the AIdiva run\n") + parser.add_argument("--in", type=str, dest="in_data", metavar="in.csv", required=True, help="CSV file with the results from the aiDIVA run that should be converted to a VCF file\n") + parser.add_argument("--out", type=str, dest="out_data", metavar="out.vcf", required=True, help="VCF file with the results of the aiDIVA run\n") parser.add_argument("--not_single", action="store_false", required=False, help="Flag to indicate whether the file is single sample or not.\n") args = parser.parse_args() diff --git a/aidiva/helper_modules/filter_vcf.py b/aidiva/helper_modules/filter_vcf.py index dd4a703..49616cb 100644 --- a/aidiva/helper_modules/filter_vcf.py +++ b/aidiva/helper_modules/filter_vcf.py @@ -31,8 +31,10 @@ def filter_coding_variants(filepath, filepath_out, annotation_field_name): if filepath.endswith(".gz"): vcf_file_to_reformat = gzip.open(filepath, "rt") + else: vcf_file_to_reformat = open(filepath, "r") + outfile = open(filepath_out, "w") consequence_index = 0 @@ -43,32 +45,40 @@ # extract header from vcf file for line in tmp: - splitted_line = line.split("\t") + splitted_line = line.replace("\n", "").split("\t") if line.strip().startswith("##"): if line.strip().startswith("##fileformat="): outfile.write(line) continue + elif line.strip().startswith("##filedate="): outfile.write(line) continue + elif line.strip().startswith("##source="): outfile.write(line) continue + elif line.strip().startswith("##reference="): outfile.write(line) continue + elif line.strip().startswith("##contig="): outfile.write(line) continue + elif line.strip().startswith("##FORMAT="): outfile.write(line) continue + elif line.strip().startswith("##FILTER="): outfile.write(line) continue + elif line.strip().startswith("##ANALYSISTYPE="): outfile.write(line) continue + elif line.strip().startswith("##SAMPLE="): outfile.write(line) continue @@ -79,7 +89,9 @@ def filter_coding_variants(filepath, filepath_out, annotation_field_name): if annotation_header[i] == "Consequence": consequence_index = i break + continue + continue if line.strip().startswith("#CHROM"): diff --git a/aidiva/helper_modules/split_vcf_in_indel_and_snp_set.py b/aidiva/helper_modules/split_vcf_in_indel_and_snp_set.py index 5d90c28..2730fb8 100644 --- a/aidiva/helper_modules/split_vcf_in_indel_and_snp_set.py +++ b/aidiva/helper_modules/split_vcf_in_indel_and_snp_set.py @@ -12,8 +12,10 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel): if filepath.endswith(".gz"): vcf_file_to_reformat = gzip.open(filepath, "rt") + else: vcf_file_to_reformat = open(filepath, "r") + outfile_snps = open(filepath_snp, "w") outfile_indel = open(filepath_indel, "w") @@ -30,7 +32,8 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel) outfile_snps.write(line) outfile_indel.write(line) continue - if line.strip().startswith("#CHROM"): + + elif line.strip().startswith("#CHROM"):
outfile_snps.write(line) outfile_indel.write(line) continue @@ -43,19 +46,24 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel) if "," in splitted_line[4]: logger.debug(f"Processed variant was removed, too many alternative alleles reported (ALT={splitted_line[4]})!") continue + else: ref_length = len(splitted_line[3]) alt_length = max([len(alt) for alt in splitted_line[4].split(",")]) if (ref_length == 1) and (alt_length == 1): outfile_snps.write(line) + elif (ref_length > 1 and alt_length == 1) or (alt_length > 1 and ref_length == 1): indel_ID += 1 if splitted_line[7].endswith("\n"): splitted_line[7] = splitted_line[7].replace("\n", "") + ";INDEL_ID=" + str(indel_ID) + "\n" + else: splitted_line[7] = splitted_line[7].replace("\n", "") + ";INDEL_ID=" + str(indel_ID) + outfile_indel.write("\t".join(splitted_line)) + else: logger.critical("Something bad happened!") diff --git a/aidiva/run_aiDIVA.py b/aidiva/run_aiDIVA.py index 1762ad9..11aa19c 100644 --- a/aidiva/run_aiDIVA.py +++ b/aidiva/run_aiDIVA.py @@ -13,21 +13,22 @@ if __name__=="__main__": - parser = argparse.ArgumentParser(description = "aiDIVA -- augmented intelligence-based Disease Variant Analysis") + parser = argparse.ArgumentParser(description = "AIdiva -- Augmented Intelligence Disease Variant Analysis") parser.add_argument("--snp_vcf", type=str, dest="snp_vcf", metavar="snp.vcf", required=True, help="VCF file with the annotated SNP variants [required]") parser.add_argument("--indel_vcf", type=str, dest="indel_vcf", metavar="indel.vcf", required=True, help="VCF file with the annotated (only basic annotation) InDel variants [required]") parser.add_argument("--expanded_indel_vcf", type=str, dest="expanded_indel_vcf", metavar="expanded_indel.vcf", required=True, help="VCF file with the annotated expanded InDel variants [required]") parser.add_argument("--out_prefix", type=str, dest="out_prefix", metavar="/output_path/aidiva_result", required=True, help="Prefix that is used to save the results [required]") - parser.add_argument("--config", type=str, dest="config", metavar="config.yaml", required=True, help="Config file specifying the parameters for AIdiva [required]") + parser.add_argument("--config", type=str, dest="config", metavar="config.yaml", required=True, help="Config file specifying the parameters for aiDIVA [required]") parser.add_argument("--workdir", type=str, dest="workdir", metavar="/tmp/aidiva_workdir/", required=False, help="Path to the working directory, here all intermediate files are saved (if not specified a temporary folder will be created and used)") parser.add_argument("--hpo_list", type=str, dest="hpo_list", metavar="hpo.txt", required=False, help="TXT file containing the HPO terms reported for the current patient") parser.add_argument("--gene_exclusion", type=str, dest="gene_exclusion", metavar="gene_exclusion.txt", required=False, help="Tab separated file containing the genes to exclude in the analysis. 
Genes are assumed to be in the first column.") parser.add_argument("--family_file", type=str, dest="family_file", metavar="family.txt", required=False, help="TXT file showing the sample relations of the current data") parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="In case of multisample data the kind of sample relation [SINGLE, TRIO, MULTI]") parser.add_argument("--skip_db_check", dest="skip_db_check", action="store_true", required=False, help="Flag to skip database (ClinVar, HGMD) lookup") - parser.add_argument("--only_top_results", dest="only_top_results", action="store_true", required=False, help="Report only the top 25 variants as result") + parser.add_argument("--only_top_results", dest="only_top_results", action="store_true", required=False, help="Report only the top ranking variants as result. The desired rank can be given as parameter with '--top_rank' (default: 25)") + parser.add_argument("--top_rank", type=str, dest="top_rank", metavar="25", required=False, help="Rank parameter for '--only_top_results' (default: 25)") parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use (default: 1)") - parser.add_argument("--log_file", type=str, dest="log_file", metavar="/tmp/aidiva_workdir/sample_id_aidiva.log", required=False, help="Custom log file, if not specified the log file is saved in the working directory") + parser.add_argument("--log_file", type=str, dest="log_file", metavar="/output_path/logs/aidiva_log.txt", required=False, help="Path plus name of the log file to be saved, if not specified the log file is saved in the working directory") parser.add_argument("--log_level", type=str, dest="log_level", metavar="INFO", required=False, help="Define logging level, if unsure just leave the default [DEBUG, INFO] (default: INFO)") parser.add_argument("--save_as_vcf", dest="save_as_vcf", action="store_true", required=False, help="Flag to create additional result files in VCF format.") args = parser.parse_args() @@ -37,6 +38,7 @@ snp_vcf = args.snp_vcf indel_vcf = args.indel_vcf expanded_indel_vcf = args.expanded_indel_vcf + else: raise SystemExit("The input VCF files were not specified!") @@ -44,17 +46,21 @@ if args.config is not None: try: config_file = open(args.config, "r") + except OSError: raise SystemExit("The given config file could not be opened!") + else: configuration = yaml.load(config_file, Loader=yaml.SafeLoader) config_file.close() + else: raise SystemExit("The parameter 'config' was not specified!") # parse output files if args.out_prefix is not None: output_filename = args.out_prefix + else: raise SystemExit("The parameter 'out_prefix' was not specified!") @@ -63,6 +69,7 @@ if not working_directory.endswith("/"): working_directory = working_directory + "/" + else: workdir = tempfile.TemporaryDirectory(suffix="", prefix="aidiva_workdir_", dir=None) working_directory = workdir.name @@ -73,17 +80,20 @@ # parse disease and inheritance information if args.hpo_list is not None: hpo_file = args.hpo_list + else: hpo_file = None if args.gene_exclusion is not None: gene_exclusion_file = args.gene_exclusion + else: gene_exclusion_file = None if (args.family_file is not None) and (args.family_type is not None): family_file = args.family_file family_type = args.family_type + else: family_file = None family_type = "SINGLE" @@ -91,44 +101,57 @@ skip_db_check = args.skip_db_check only_top_results = args.only_top_results save_as_vcf = args.save_as_vcf - + if 
args.threads is not None: num_cores = int(args.threads) + else: num_cores = 1 - + + if args.top_rank is not None: + top_rank = int(args.top_rank) + + else: + top_rank = 25 + # use log level INFO as default if args.log_level is not None: if args.log_level == "DEBUG": log_level = logging.DEBUG log_format = "%(asctime)s -- %(name)s - %(levelname)s - %(message)s" + elif args.log_level == "INFO": log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" - #elif args.log_level == "WARN": - # log_level = logging.WARN + + #elif args.log_level == "WARNING": + # log_level = logging.WARNING # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + #elif args.log_level == "ERROR": # log_level = logging.ERROR # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + #elif args.log_level == "CRITICAL": # log_level = logging.CRITICAL # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + else: log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" + else: log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" if args.log_file is not None: log_file = args.log_file + else: timestamp = time.strftime("%Y%m%d-%H%M%S") - log_file = str(working_directory + "/" + "aidiva_" + timestamp + ".log") + log_file = str(working_directory + "/" + "aidiva_" + timestamp + ".txt") # set up logger - logging.basicConfig(filename=log_file, filemode="a", format=log_format, @@ -136,16 +159,10 @@ level=log_level) logger = logging.getLogger() - logger.info("Running aiDIVA on already annotated data") + logger.info("Running aiDIVA on annotated data") logger.info("Start program") logger.info(f"Working directory: {working_directory}") - # load SNP ML model - #scoring_model_snp = configuration["Analysis-Input"]["scoring-model-snp"] - - # load InDel ML model - #scoring_model_indel = configuration["Analysis-Input"]["scoring-model-indel"] - # load ML model scoring_model = configuration["Analysis-Input"]["scoring-model"] @@ -157,43 +174,50 @@ assembly_build = configuration["Assembly-Build"] ref_path = configuration["Analysis-Input"]["ref-path"] + transcript_path = configuration["Canonical-Transcripts"] + # convert splitted input data to vcf and annotate if snp_vcf is not None: - input_data_snp = convert_vcf.convert_vcf_to_pandas_dataframe(snp_vcf, False, False, num_cores) + input_data_snp = convert_vcf.convert_vcf_to_pandas_dataframe(snp_vcf, False, False, transcript_path, num_cores) + else: input_data_snp = pd.DataFrame() if indel_vcf is not None and expanded_indel_vcf is not None: - input_data_indel = convert_vcf.convert_vcf_to_pandas_dataframe(indel_vcf, True, False, num_cores) - input_data_expanded_indel = convert_vcf.convert_vcf_to_pandas_dataframe(expanded_indel_vcf, True, True, num_cores) + input_data_indel = convert_vcf.convert_vcf_to_pandas_dataframe(indel_vcf, True, False, transcript_path, num_cores) + input_data_expanded_indel = convert_vcf.convert_vcf_to_pandas_dataframe(expanded_indel_vcf, True, True, transcript_path, num_cores) + else: input_data_indel = pd.DataFrame() input_data_expanded_indel = pd.DataFrame() + #logger.debug(f"Condition-Check: {input_data_snp.dropna(how='all').empty}, {input_data_indel.dropna(how='all').empty}, {input_data_expanded_indel.dropna(how='all').empty}") + #logger.debug(f"Condition: {(not input_data_snp.dropna(how='all').empty) or ((not input_data_indel.dropna(how='all').empty) and (not input_data_expanded_indel.dropna(how='all').empty))}") + if (not input_data_snp.dropna(how='all').empty) or ((not 
input_data_indel.dropna(how='all').empty) and (not input_data_expanded_indel.dropna(how='all').empty)): if ((not input_data_indel.empty) and (not input_data_expanded_indel.empty)): logger.info("Combine InDel variants ...") input_data_combined_indel = combine_expanded_indels.parallelized_indel_combination(input_data_indel, input_data_expanded_indel, feature_list, num_cores) - + else: - logger.warn("No InDel variants given move on to SNP processing!") + logger.warning("No InDel variants given, moving on to SNV processing!") input_data_combined_indel = pd.DataFrame() # predict pathogenicity score logger.info("Score variants ...") - + if not input_data_snp.dropna(how='all').empty: predicted_data_snp = predict.perform_pathogenicity_score_prediction(scoring_model, input_data_snp, allele_frequency_list, feature_list, num_cores) - + else: - logger.warn("No SNP variants, skip SNP prediction!") + logger.warning("No SNV variants, skip SNV prediction!") predicted_data_snp = pd.DataFrame() if not input_data_combined_indel.dropna(how='all').empty: predicted_data_indel = predict.perform_pathogenicity_score_prediction(scoring_model, input_data_combined_indel, allele_frequency_list, feature_list, num_cores) - + else: - logger.warn("No InDel variants, skip InDel prediction!") + logger.warning("No InDel variants, skip InDel prediction!") predicted_data_indel = pd.DataFrame() if (not predicted_data_snp.dropna(how='all').empty) and (not predicted_data_indel.dropna(how='all').empty): @@ -214,10 +238,12 @@ # prioritize and filter variants logger.info("Prioritize variants and finalize score ...") prioritized_data = prio.prioritize_variants(predicted_data, internal_parameter_dict, ref_path, num_cores, assembly_build, feature_list, skip_db_check, family_file, family_type, hpo_file, gene_exclusion_file) + prioritized_data["AIDIVA_RANK"] = prioritized_data["FINAL_AIDIVA_SCORE"].rank(method='min', ascending=False) + ## TODO: create additional output files according to the inheritance information (only filtered data) if only_top_results: - prioritized_data[prioritized_data["FILTER_PASSED"] == 1].head(n=25).to_csv(str(output_filename + "_aidiva_result_filtered.tsv"), sep="\t", index=False) - logger.info("Only 25 best variants are reported as result!") + prioritized_data[(prioritized_data["FILTER_PASSED"] == 1) & (prioritized_data["AIDIVA_RANK"] <= top_rank)].to_csv(str(output_filename + "_aidiva_result_filtered.tsv"), sep="\t", index=False) + logger.info(f"Only the top ranked variants (up to rank {top_rank}) are reported as result!") else: if save_as_vcf: @@ -242,4 +268,4 @@ write_result.write_result_vcf(None, str(output_filename + "_aidiva_result.vcf"), assembly_build, bool(family_type == "SINGLE")) write_result.write_result_vcf(None, str(output_filename + "_aidiva_result_filtered.vcf"), assembly_build, bool(family_type == "SINGLE")) - logger.warn("The given input files were empty!") + logger.warning("The given input files were empty!") diff --git a/aidiva/run_annotation.py b/aidiva/run_annotation.py index 685572b..e3ba96f 100644 --- a/aidiva/run_annotation.py +++ b/aidiva/run_annotation.py @@ -1,9 +1,12 @@ import argparse +import helper_modules.combine_expanded_indels_and_create_csv as combine_expanded_indels import helper_modules.convert_indels_to_snps_and_create_vcf as expand_indels_and_create_vcf +import helper_modules.convert_vcf_to_csv as convert_vcf import helper_modules.filter_vcf as filt_vcf import helper_modules.split_vcf_in_indel_and_snp_set as split_vcf import logging import os +import pandas as pd import tempfile import
time import variant_annotation.annotate_with_vep as annotate @@ -11,58 +14,67 @@ if __name__=="__main__": - parser = argparse.ArgumentParser(description = "aiDIVA -- augmented intelligence-based Disease Variant Analysis") + parser = argparse.ArgumentParser(description = "AIdiva -- Augmented Intelligence Disease Variant Analysis") parser.add_argument("--vcf", type=str, dest="vcf", metavar="input.vcf(.gz)", required=True, help="VCF file with the variants to annotate [required]") - parser.add_argument("--config", type=str, dest="config", metavar="config.yaml", required=True, help="Config file specifying the parameters for AIdiva [required]") + parser.add_argument("--config", type=str, dest="config", metavar="config.yaml", required=True, help="Config file specifying the parameters for aiDIVA [required]") parser.add_argument("--out_folder", type=str, dest="out_folder", metavar="/output_path/aidiva_result", required=True, help="Prefix that is used to save the annotated files [required]") parser.add_argument("--filtered", dest="filtered", action="store_true", required=False, help="Flag indicating that the filtered files already exist in the result folder (skips the prefiltering step to save time)") parser.add_argument("--filtered_folder", type=str, dest="filtered_folder", metavar="/output_path/aidiva_filtered", required=False, help="Path to the prefiltered input VCF files") parser.add_argument("--inhouse_sample", dest="inhouse_sample", action="store_true", required=False, help="Flag to indicate that we are annotating an inhouse sample (skips leftNormalize since it is already performed)") + parser.add_argument("--output_table", dest="output_table", action="store_true", required=False, help="Flag to indicate that we want to save the annotation additionally as a TSV file") parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use. 
(default: 1)") - parser.add_argument("--log_file", type=str, dest="log_file", metavar="/tmp/aidiva_workdir/sample_id_aidiva.log", required=False, help="Custom log file, if not specified the log file is saved in the working directory") + parser.add_argument("--log_file", type=str, dest="log_file", metavar="/output_path/logs/aidiva_log.txt", required=False, help="Path plus name of the log file to be saved, if not specified the log file is saved in the working directory") parser.add_argument("--log_level", type=str, dest="log_level", metavar="INFO", required=False, help="Define logging level, if unsure just leave the default [DEBUG, INFO] (default: INFO)") args = parser.parse_args() - + workdir = tempfile.TemporaryDirectory(suffix="", prefix="aidiva_workdir_", dir=None) working_directory = workdir.name if not working_directory.endswith("/"): working_directory = working_directory + "/" - + inhouse_sample = args.inhouse_sample + is_filtered = args.filtered + output_table = args.output_table # use log level INFO as default if args.log_level is not None: if args.log_level == "DEBUG": log_level = logging.DEBUG log_format = "%(asctime)s -- %(name)s - %(levelname)s - %(message)s" + elif args.log_level == "INFO": log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" - #elif args.log_level == "WARN": - # log_level = logging.WARN + + #elif args.log_level == "WARNING": + # log_level = logging.WARNING # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + #elif args.log_level == "ERROR": # log_level = logging.ERROR # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + #elif args.log_level == "CRITICAL": # log_level = logging.CRITICAL # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + else: log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" + else: log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" - + if args.log_file is not None: log_file = args.log_file + else: timestamp = time.strftime("%Y%m%d-%H%M%S") - log_file = str(working_directory + "/" + "aidiva_" + timestamp + ".log") + log_file = str(working_directory + "/" + "aidiva_" + timestamp + ".txt") # set up logger - logging.basicConfig(filename=log_file, filemode="a", format=log_format, @@ -76,6 +88,7 @@ # parse input files if os.path.isfile(args.vcf): input_vcf = args.vcf + else: raise SystemExit("The given input file could not be opened!") @@ -83,17 +96,21 @@ if args.config is not None: try: config_file = open(args.config, "r") + except OSError: raise SystemExit("The given config file could not be opened!") + else: configuration = yaml.load(config_file, Loader=yaml.SafeLoader) config_file.close() + else: raise SystemExit("The parameter 'config' was not specified!") # parse output files if args.out_folder is not None: output_folder = args.out_folder + else: raise SystemExit("The parameter 'out_prefix' was not specified!") @@ -101,6 +118,7 @@ if args.filtered and args.filtered_folder is not None: filtered_folder = args.filtered_folder + else: logger.info("Pre-filtering will be performed and filtered files are saved to the output folder.") filtered_folder = output_folder @@ -108,6 +126,7 @@ # obtain number of threads to use during computation if args.threads is not None: num_cores = int(args.threads) + else: num_cores = 1 @@ -117,6 +136,10 @@ ref_path = configuration["Analysis-Input"]["ref-path"] + feature_list = configuration["Model-Features"]["feature-list"] + + transcript_path = configuration["Canonical-Transcripts"] + # convert 
splitted input data to vcf and annotate input_file = os.path.splitext(input_vcf)[0] input_filename = os.path.basename(input_file) @@ -124,7 +147,7 @@ logger.info("Starting VCF preparation...") # sorting and filtering step to remove unsupported variants - if not args.filtered: + if not is_filtered: annotate.left_normalize_and_sort_vcf(input_vcf, str(working_directory + "/" + input_filename + "_sorted.vcf"), annotation_dict, ref_path, inhouse_sample) annotate.annotate_consequence_information(str(working_directory + "/" + input_filename + "_sorted.vcf"), str(working_directory + "/" + input_filename + "_consequence.vcf"), annotation_dict, assembly_build, num_cores) filt_vcf.filter_coding_variants(str(working_directory + "/" + input_filename + "_consequence.vcf"), str(filtered_folder + "/" + input_filename + "_filtered.vcf"), "CONS") @@ -157,3 +180,16 @@ annotate.filter_regions(str(working_directory + "/" + input_filename + "_snp_vep_annotated_bed_bw.vcf"), str(output_folder + "/" + input_filename + "_snp_annotated.vcf"), annotation_dict) annotate.filter_regions(str(working_directory + "/" + input_filename + "_indel_vep_annotated_bed.vcf"), str(output_folder + "/" + input_filename + "_indel_annotated.vcf"), annotation_dict) + if output_table: + # Convert annotated VCF files to pandas dataframes + input_data_snp_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(output_folder + "/" + input_filename + "_snp_annotated.vcf"), False, False, transcript_path, num_cores) + input_data_indel_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(output_folder + "/" + input_filename + "_indel_annotated.vcf"), True, False, transcript_path, num_cores) + input_data_indel_expanded_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(output_folder + "/" + input_filename + "_indelExpanded_annotated.vcf"), True, True, transcript_path, num_cores) + + # Combine InDel variants + input_data_indel_combined_annotated = combine_expanded_indels.parallelized_indel_combination(input_data_indel_annotated, input_data_indel_expanded_annotated, feature_list, num_cores) + + # Create combined annotated CSV file + input_data_combined = pd.concat([input_data_snp_annotated, input_data_indel_combined_annotated]).sort_values(["#CHROM", "POS"], ascending=[True, True]) + input_data_combined.to_csv(str(output_folder + "/" + input_filename + "_annotated.csv"), index=False, sep="\t") + diff --git a/aidiva/run_annotation_and_aiDIVA.py b/aidiva/run_annotation_and_aiDIVA.py index 2ad77f5..147d5ad 100644 --- a/aidiva/run_annotation_and_aiDIVA.py +++ b/aidiva/run_annotation_and_aiDIVA.py @@ -17,9 +17,9 @@ if __name__=="__main__": - parser = argparse.ArgumentParser(description = "aiDIVA -- augmented intelligence-based Disease Variant Analysis") + parser = argparse.ArgumentParser(description = "AIdiva -- Augmented Intelligence Disease Variant Analysis") parser.add_argument("--vcf", type=str, dest="vcf", metavar="input.vcf(.gz)", required=True, help="VCF file with the variants to analyze [required]") - parser.add_argument("--config", type=str, dest="config", metavar="config.yaml", required=True, help="Config file specifying the parameters for AIdiva [required]") + parser.add_argument("--config", type=str, dest="config", metavar="config.yaml", required=True, help="Config file specifying the parameters for aiDIVA [required]") parser.add_argument("--out_prefix", type=str, dest="out_prefix", metavar="/output_path/aidiva_result", required=True, help="Prefix that is used to save the results [required]") 
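Note: the canonical-transcript file passed as transcript_path in run_annotation.py above (config entry Canonical-Transcripts, exposed as --transcript_ids in convert_vcf_to_csv.py) is read line by line with '#'-prefixed lines skipped, so it is expected to be plain text with one Ensembl transcript ID per line, for example (hypothetical IDs):

# MANE Select transcript IDs, one per line
ENST00000000001
ENST00000000002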
parser.add_argument("--workdir", type=str, dest="workdir", metavar="/tmp/aidiva_workdir/", required=False, help="Path to the working directory, here all intermediate files are saved (if not specified a temporary folder will be created and used)") parser.add_argument("--hpo_list", type=str, dest="hpo_list", metavar="hpo.txt", required=False, help="TXT file containing the HPO terms reported for the current patient") @@ -27,10 +27,11 @@ parser.add_argument("--family_file", type=str, dest="family_file", metavar="family.txt", required=False, help="TXT file showing the family relation of the current patient") parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO]") parser.add_argument("--skip_db_check", dest="skip_db_check", action="store_true", required=False, help="Flag to skip DB lookup of variants") - parser.add_argument("--only_top_results", dest="only_top_results", action="store_true", required=False, help="Report only the top 25 variants as result") + parser.add_argument("--only_top_results", dest="only_top_results", action="store_true", required=False, help="Report only the top ranking variants as result. The desired rank can be given as parameter with '--top_rank' (default: 25)") + parser.add_argument("--top_rank", type=str, dest="top_rank", metavar="25", required=False, help="Rank parameter for '--only_top_results' (default: 25)") parser.add_argument("--inhouse_sample", dest="inhouse_sample", action="store_true", required=False, help="Flag to indicate that we are annotating an inhouse sample (skips leftNormalize since it is already performed)") parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use. 
(default: 1)") - parser.add_argument("--log_file", type=str, dest="log_file", metavar="/tmp/aidiva_workdir/sample_id_aidiva.log", required=False, help="Custom log file, if not specified the log file is saved in the working directory") + parser.add_argument("--log_file", type=str, dest="log_file", metavar="/output_path/logs/aidiva_log.txt", required=False, help="Path plus name of the log file to be saved, if not specified the log file is saved in the working directory") parser.add_argument("--log_level", type=str, dest="log_level", metavar="INFO", required=False, help="Define logging level, if unsure just leave the default [DEBUG, INFO] (default: INFO)") parser.add_argument("--save_as_vcf", dest="save_as_vcf", action="store_true", required=False, help="Flag to create additional result files in VCF format.") args = parser.parse_args() @@ -38,6 +39,7 @@ # parse input files if os.path.isfile(args.vcf): input_vcf = args.vcf + else: raise SystemExit("The given input file could not be opened!") @@ -45,17 +47,21 @@ if args.config is not None: try: config_file = open(args.config, "r") + except OSError: raise SystemExit("The given config file could not be opened!") + else: configuration = yaml.load(config_file, Loader=yaml.SafeLoader) config_file.close() + else: raise SystemExit("The parameter 'config' was not specified!") # parse output files if args.out_prefix is not None: output_filename = args.out_prefix + else: raise SystemExit("The parameter 'out_prefix' was not specified!") @@ -64,6 +70,7 @@ if not working_directory.endswith("/"): working_directory = working_directory + "/" + else: workdir = tempfile.TemporaryDirectory(suffix="", prefix="aidiva_workdir_", dir=None) working_directory = workdir.name @@ -74,11 +81,13 @@ # parse disease and inheritance information if args.hpo_list is not None: hpo_file = args.hpo_list + else: hpo_file = None if args.gene_exclusion is not None: gene_exclusion_file = args.gene_exclusion + else: gene_exclusion_file = None @@ -87,15 +96,15 @@ if args.family_type is not None: family_type = args.family_type + else: family_type = "SINGLE" - + else: family_file = None family_type = "SINGLE" save_as_vcf = args.save_as_vcf - skip_db_check = args.skip_db_check only_top_results = args.only_top_results inhouse_sample = args.inhouse_sample @@ -103,41 +112,54 @@ # obtain number of threads to use during computation if args.threads is not None: num_cores = int(args.threads) + else: num_cores = 1 + if args.top_rank is not None: + top_rank = int(args.top_rank) + + else: + top_rank = 25 + # use log level INFO as default if args.log_level is not None: if args.log_level == "DEBUG": log_level = logging.DEBUG log_format = "%(asctime)s -- %(name)s - %(levelname)s - %(message)s" + elif args.log_level == "INFO": log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" - #elif args.log_level == "WARN": - # log_level = logging.WARN + + #elif args.log_level == "WARNING": + # log_level = logging.WARNING # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + #elif args.log_level == "ERROR": # log_level = logging.ERROR # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + #elif args.log_level == "CRITICAL": # log_level = logging.CRITICAL # log_format = "%(asctime)s -- %(levelname)s - %(message)s" + else: log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" + else: log_level = logging.INFO log_format = "%(asctime)s -- %(levelname)s - %(message)s" - + if args.log_file is not None: log_file = args.log_file + else: timestamp = 
time.strftime("%Y%m%d-%H%M%S") - log_file = str(working_directory + "/" + "aidiva_" + timestamp + ".log") + log_file = str(working_directory + "/" + "aidiva_" + timestamp + ".txt") # set up logger - logging.basicConfig(filename=log_file, filemode="a", format=log_format, @@ -145,16 +167,10 @@ level=log_level) logger = logging.getLogger() - logger.info("Running annotation and aiDIVA") + logger.info("Running annotation and AIdiva") logger.info("Start program") logger.info(f"Working directory: {working_directory}") - # load SNP ML model - #scoring_model_snp = configuration["Analysis-Input"]["scoring-model-snp"] - - # load InDel ML model - #scoring_model_indel = configuration["Analysis-Input"]["scoring-model-indel"] - # load ML model scoring_model = configuration["Analysis-Input"]["scoring-model"] @@ -168,6 +184,8 @@ ref_path = configuration["Analysis-Input"]["ref-path"] + transcript_path = configuration["Canonical-Transcripts"] + # convert splitted input data to vcf and annotate input_file = os.path.splitext(input_vcf)[0] input_filename = os.path.basename(input_file) @@ -208,9 +226,9 @@ annotate.filter_regions(str(working_directory + "/" + input_filename + "_indel_vep_annotated_bed.vcf"), str(working_directory + "/" + input_filename + "_indel_vep_annotated_bed_filtered.vcf"), annotation_dict) # convert annotated vcfs back to pandas dataframes - input_data_snp_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + "/" + input_filename + "_snp_vep_annotated_bed_bw_filtered.vcf"), False, False, num_cores) - input_data_indel_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + "/" + input_filename + "_indel_vep_annotated_bed_filtered.vcf"), True, False, num_cores) - input_data_indel_expanded_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + "/" + input_filename + "_indel_expanded_vep_annotated_bw.vcf"), True, True, num_cores) + input_data_snp_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + "/" + input_filename + "_snp_vep_annotated_bed_bw_filtered.vcf"), False, False, transcript_path, num_cores) + input_data_indel_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + "/" + input_filename + "_indel_vep_annotated_bed_filtered.vcf"), True, False, transcript_path, num_cores) + input_data_indel_expanded_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + "/" + input_filename + "_indel_expanded_vep_annotated_bw.vcf"), True, True, transcript_path, num_cores) if (not input_data_snp_annotated.dropna(how='all').empty) or ((not input_data_indel_annotated.dropna(how='all').empty) and (not input_data_indel_expanded_annotated.dropna(how='all').empty)): if ((not input_data_indel_annotated.empty) and (not input_data_indel_expanded_annotated.empty)): @@ -219,7 +237,7 @@ input_data_indel_combined_annotated = combine_expanded_indels.parallelized_indel_combination(input_data_indel_annotated, input_data_indel_expanded_annotated, feature_list, num_cores) else: - logger.warn("No InDel variants given move on to SNP processing!") + logger.warning("No InDel variants given move on to SNV processing!") input_data_indel_combined_annotated = pd.DataFrame() # predict pathogenicity score @@ -229,14 +247,14 @@ predicted_data_snp = predict.perform_pathogenicity_score_prediction(scoring_model, input_data_snp_annotated, allele_frequency_list, feature_list, num_cores) else: - logger.warn("No SNP variants, skip SNP prediction!") + logger.warning("No SNV variants, skip 
SNV prediction!") predicted_data_snp = pd.DataFrame() - + if not input_data_indel_combined_annotated.dropna(how='all').empty: predicted_data_indel = predict.perform_pathogenicity_score_prediction(scoring_model, input_data_indel_combined_annotated, allele_frequency_list, feature_list, num_cores) else: - logger.warn("No InDel variants, skip InDel prediction!") + logger.warning("No InDel variants, skip InDel prediction!") predicted_data_indel = pd.DataFrame() if (not predicted_data_snp.dropna(how='all').empty) and (not predicted_data_indel.dropna(how='all').empty): @@ -247,7 +265,7 @@ elif (predicted_data_snp.dropna(how='all').empty) and (not predicted_data_indel.dropna(how='all').empty): predicted_data = predicted_data_indel - + elif (predicted_data_indel.dropna(how='all').empty) and (not predicted_data_snp.dropna(how='all').empty): predicted_data = predicted_data_snp @@ -257,16 +275,19 @@ # prioritize and filter variants logger.info("Filter variants and finalize score...") prioritized_data = prio.prioritize_variants(predicted_data, internal_parameter_dict, ref_path, num_cores, assembly_build, feature_list, skip_db_check, family_file, family_type, hpo_file, gene_exclusion_file) + prioritized_data["AIDIVA_RANK"] = prioritized_data["FINAL_AIDIVA_SCORE"].rank(method='min', ascending=False) + ## TODO: create additional output files according to the inheritance information (only filtered data) if only_top_results: - prioritized_data[prioritized_data["FILTER_PASSED"] == 1].head(n=25).to_csv(str(output_filename + "_aidiva_result_filtered.tsv"), sep="\t", index=False) - logger.info("Only 25 best variants are reported as result!") + prioritized_data[prioritized_data["FILTER_PASSED"] == 1 & prioritized_data["AIDIVA_RANK"] <= top_rank].to_csv(str(output_filename + "_aidiva_result_filtered.tsv"), sep="\t", index=False) + logger.info(f"Only top ranking variants are reported as result (rank: {top_rank})!") else: if save_as_vcf: write_result.write_result_vcf(prioritized_data, str(output_filename + "_aidiva_result.vcf"), assembly_build, bool(family_type == "SINGLE")) write_result.write_result_vcf(prioritized_data[prioritized_data["FILTER_PASSED"] == 1], str(output_filename + "_aidiva_result_filtered.vcf"), assembly_build, bool(family_type == "SINGLE")) + #prioritized_data = prioritized_data.rename(columns={"CHROM": "#CHROM"}) # not needed anymore prioritized_data.to_csv(str(output_filename + "_aidiva_result.tsv"), sep="\t", index=False) prioritized_data[prioritized_data["FILTER_PASSED"] == 1].to_csv(str(output_filename + "_aidiva_result_filtered.tsv"), sep="\t", index=False) @@ -284,4 +305,4 @@ write_result.write_result_vcf(None, str(output_filename + "_aidiva_result.vcf"), assembly_build, bool(family_type == "SINGLE")) write_result.write_result_vcf(None, str(output_filename + "_aidiva_result_filtered.vcf"), assembly_build, bool(family_type == "SINGLE")) - logger.warn("The given input files were empty!") + logger.warning("The given input files were empty!") diff --git a/aidiva/variant_annotation/annotate_with_vep.py b/aidiva/variant_annotation/annotate_with_vep.py index fae03b3..223600e 100644 --- a/aidiva/variant_annotation/annotate_with_vep.py +++ b/aidiva/variant_annotation/annotate_with_vep.py @@ -16,46 +16,54 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di # set the correct paths to the needed perl modules if "PERL5LIB" in os.environ: - #os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + 
os.environ["PERL5LIB"] - os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:/mnt/storage2/share/opt/perl_cpan_ubuntu20.04/lib/perl5/:{vep_annotation_dict['vep-plugin-path']}:{os.environ['PERL5LIB']}" + os.environ["PERL5LIB"] = f"{os.environ['PERL5LIB']}:{vep_annotation_dict['vep']}/Bio/:{vep_annotation_dict['vep-plugin-path']}:{vep_annotation_dict['vep-cpan']}" + else: - #os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/" - os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:/mnt/storage2/share/opt/perl_cpan_ubuntu20.04/lib/perl5/:{vep_annotation_dict['vep-plugin-path']}" + os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:{vep_annotation_dict['vep-plugin-path']}:{vep_annotation_dict['vep-cpan']}" cache_path = vep_annotation_dict['vep-cache'] + "/" + plugin_path = vep_annotation_dict['vep-plugin-path'] + "/" # add essential parameters vep_command = f"{vep_command} --species homo_sapiens --assembly {build} " vep_command = f"{vep_command} --offline" vep_command = f"{vep_command} --cache" vep_command = f"{vep_command} --dir_cache {cache_path}" + vep_command = f"{vep_command} --dir_plugins {plugin_path}" vep_command = f"{vep_command} --gencode_basic" vep_command = f"{vep_command} --symbol" vep_command = f"{vep_command} --biotype" vep_command = f"{vep_command} --variant_class" - if not expanded: - pass # deprecated + #if not expanded: + # pass + # deprecated -> to remove # allele frequencies to include #vep_command = f"{vep_command} --af" #vep_command = f"{vep_command} --af_1kg" #vep_command = f"{vep_command} --af_esp" - if build == "GRCh37": - vep_command = f"{vep_command} --af_gnomad" - elif build == "GRCh38": - vep_command = f"{vep_command} --af_gnomadg" + #if build == "GRCh37": + # vep_command = f"{vep_command} --af_gnomad" + #elif build == "GRCh38": + # vep_command = f"{vep_command} --af_gnomadg" # vep plugins to use if not basic: vep_command = f"{vep_command} --sift s" vep_command = f"{vep_command} --polyphen s" - #vep_command = f"{vep_command} --plugin EVE,file={vep_annotation_dict['plugin-files']['EVE']}" vep_command = f"{vep_command} --plugin AlphaMissense,file={vep_annotation_dict['plugin-files']['AlphaMissense']}" vep_command = f"{vep_command} -i " + input_vcf_file + " " vep_command = f"{vep_command} -o " + output_vcf_file + " " - vep_command = f"{vep_command} --fork " + str(num_cores) + " " + + # the developers of VEP recommend to use 4 threads + if num_cores >= 4: + vep_command = f"{vep_command} --fork 4 " + + elif num_cores == 2: + vep_command = f"{vep_command} --fork 2 " + vep_command = f"{vep_command} --format vcf" + " " # we need this to prevent vep from not working if the VCF file has no variant entries vep_command = f"{vep_command} --vcf" + " " vep_command = f"{vep_command} --no_stats" + " " @@ -72,11 +80,10 @@ def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annota # set the correct paths to the needed perl modules if "PERL5LIB" in os.environ: - #os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + os.environ["PERL5LIB"] - os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:/mnt/storage2/share/opt/perl_cpan_ubuntu20.04/lib/perl5/:{vep_annotation_dict['vep-plugin-path']}:{os.environ['PERL5LIB']}" + os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:{vep_annotation_dict['vep-cpan']}:{vep_annotation_dict['vep-plugin-path']}:{os.environ['PERL5LIB']}" + else: - 
#os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/" - os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:/mnt/storage2/share/opt/perl_cpan_ubuntu20.04/lib/perl5/:{vep_annotation_dict['vep-plugin-path']}" + os.environ["PERL5LIB"] = f"{vep_annotation_dict['vep']}/Bio/:{vep_annotation_dict['vep-cpan']}:{vep_annotation_dict['vep-plugin-path']}" cache_path = vep_annotation_dict['vep-cache'] + "/" @@ -91,7 +98,14 @@ def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annota vep_command = f"{vep_command} -o {output_vcf_file}" vep_command = f"{vep_command} --fields Consequence" vep_command = f"{vep_command} --vcf_info_field CONS" - vep_command = f"{vep_command} --fork {num_cores}" + + # the developers of VEP recommend to use 4 threads + if num_cores >= 4: + vep_command = f"{vep_command} --fork 4 " + + elif num_cores == 2: + vep_command = f"{vep_command} --fork 2 " + vep_command = f"{vep_command} --vcf" vep_command = f"{vep_command} --no_stats" vep_command = f"{vep_command} --force_overwrite" @@ -112,9 +126,6 @@ def annotate_from_vcf(input_vcf_file, output_vcf_file, annotation_dict, expanded tmp.write(f"{vcf_annotation['EIGEN_PHRED']}\t\tEIGEN_PHRED\t\ttrue\n".encode()) tmp.write(f"{vcf_annotation['FATHMM_XF']}\t\tFATHMM_XF\t\ttrue\n".encode()) tmp.write(f"{vcf_annotation['MutationAssessor']}\t\tMutationAssessor\t\ttrue\n".encode()) - # add allele frequencies to gnomAD annotation (instead of annotation from VEP) - #tmp.write(f"{vcf_annotation['gnomAD']}\tgnomAD\tAN,Hom\t\ttrue\n".encode()) - #tmp.write(f"{vcf_annotation['gnomAD']}\tgnomAD\tAN,Hom,AFR_AF,AMR_AF,EAS_AF,NFE_AF,SAS_AF\t\ttrue\n".encode()) tmp.write(f"{vcf_annotation['CAPICE']}\t\tCAPICE\t\ttrue\n".encode()) tmp.write(f"{vcf_annotation['dbscSNV']}\t\tADA=ADA_SCORE,RF=RF_SCORE\t\ttrue\n".encode()) tmp.write(f"{vcf_annotation['CADD']}\t\tCADD\t\ttrue\n".encode()) @@ -129,7 +140,7 @@ def annotate_from_vcf(input_vcf_file, output_vcf_file, annotation_dict, expanded tmp.write(f"{vcf_annotation['hgmd']}\tHGMD\tCLASS,RANKSCORE\t\ttrue\n".encode()) else: - logger.warn("HGMD file is not found! Skip HGMD annotation!") + logger.warning("HGMD file is not found! 
Skip HGMD annotation!") # switch between SNV and InDel file if basic: @@ -157,17 +168,25 @@ def annotate_from_bed(input_vcf_file, output_vcf_file, annotation_dict, num_core tmp_segDup = tempfile.NamedTemporaryFile(mode="w+b", suffix="_segDup.vcf", delete=False) tmp_simpleRepeat = tempfile.NamedTemporaryFile(mode="w+b", suffix="_simpleRepeat.vcf", delete=False) tmp_oe_lof = tempfile.NamedTemporaryFile(mode="w+b", suffix="_oe_lof.vcf", delete=False) + #tmp_oe_mis = tempfile.NamedTemporaryFile(mode="w+b", suffix="_oe_mis.vcf", delete=False) ## currently not used + #tmp_oe_syn = tempfile.NamedTemporaryFile(mode="w+b", suffix="_oe_syn.vcf", delete=False) ## currently not used tmp_repeatmasker = tempfile.NamedTemporaryFile(mode="w+b", suffix="_repeatmasker.vcf", delete=False) # close temporary files to make them accessible tmp_segDup.close() tmp_simpleRepeat.close() tmp_oe_lof.close() + #tmp_oe_mis.close() ## currently not used + #tmp_oe_syn.close() ## currently not used tmp_repeatmasker.close() subprocess.run(f"{command} -bed {bed_annotation['segmentDuplication']} -name SegDup -sep '&' -in {input_vcf_file} -out {tmp_segDup.name} -threads {num_cores}", shell=True, check=True) subprocess.run(f"{command} -bed {bed_annotation['simpleRepeat']} -name SimpleRepeats -sep '&' -in {tmp_segDup.name} -out {tmp_simpleRepeat.name} -threads {num_cores}", shell=True, check=True) subprocess.run(f"{command} -bed {bed_annotation['oe_lof']} -name oe_lof -sep '&' -in {tmp_simpleRepeat.name} -out {tmp_oe_lof.name} -threads {num_cores}", shell=True, check=True) + + ## currently not used + #subprocess.run(f"{command} -bed {bed_annotation['oe_mis']} -name oe_mis -sep '&' -in {tmp_oe_lof.name} -out {tmp_oe_mis.name} -threads {num_cores}", shell=True, check=True) + #subprocess.run(f"{command} -bed {bed_annotation['oe_syn']} -name oe_syn -sep '&' -in {tmp_oe_mis.name} -out {tmp_oe_syn.name} -threads {num_cores}", shell=True, check=True) if os.path.isfile(f"{bed_annotation['omim']}"): subprocess.run(f"{command} -bed {bed_annotation['repeatMasker']} -name REPEATMASKER -sep '&' -in {tmp_oe_lof.name} -out {tmp_repeatmasker.name} -threads {num_cores}", shell=True, check=True) @@ -182,6 +201,8 @@ def annotate_from_bed(input_vcf_file, output_vcf_file, annotation_dict, num_core os.remove(tmp_segDup.name) os.remove(tmp_simpleRepeat.name) os.remove(tmp_oe_lof.name) + #os.remove(tmp_oe_mis.name) ## currently not used + #os.remove(tmp_oe_syn.name) ## currently not used os.remove(tmp_repeatmasker.name) @@ -287,7 +308,7 @@ def left_normalize_and_sort_vcf(input_vcf_file, output_vcf_file, vep_annotation_ tmp_bigwig_annot = tempfile.NamedTemporaryFile(mode="w+b", suffix="_bigwigAnnot.vcf", delete=False) # perform annotations - left_normalize_and_sort_vcf(input_vcf_file, tmp_sorted.name, annotation_dict, reference, inhouse_sample) + left_normalize_and_sort_vcf(input_vcf_file, tmp_sorted.name, annotation_dict, args.reference, inhouse_sample) call_vep_and_annotate_vcf(tmp_sorted.name, tmp_vep_annot.name, annotation_dict, assembly_build, basic_annotation, expanded_annotation, num_threads) annotate_from_vcf(tmp_vep_annot.name, tmp_vcf_annot.name, annotation_dict, expanded_annotation, basic_annotation, num_threads) diff --git a/aidiva/variant_prioritization/get_HPO_similarity_score.py b/aidiva/variant_prioritization/get_HPO_similarity_score.py index 43baf40..a5b3358 100644 --- a/aidiva/variant_prioritization/get_HPO_similarity_score.py +++ b/aidiva/variant_prioritization/get_HPO_similarity_score.py @@ -1,4 +1,3 @@ - import numpy as np import 
logging @@ -9,7 +8,7 @@ def find_mica_IC(HPO_term_a, HPO_term_b, ic_per_nodes, node_ancestor_mapping): hpo_nodes_shared_ancestors = node_ancestor_mapping[HPO_term_a].intersection(node_ancestor_mapping[HPO_term_b]) mica_ic = max([float(ic_per_nodes[node]) for node in hpo_nodes_shared_ancestors], default=0.0) - + return mica_ic @@ -17,19 +16,20 @@ def compute_similarity_between_nodes(hpo_term_a, hpo_term_b, ic_per_nodes, node_ node_a_ic = float(ic_per_nodes[hpo_term_a]) node_b_ic = float(ic_per_nodes[hpo_term_b]) mica_ic = find_mica_IC(hpo_term_a, hpo_term_b, ic_per_nodes, node_ancestor_mapping) - + if similarity_measure == "Lin": if mica_ic == 0.0 or (node_a_ic == 0.0 and node_b_ic == 0.0): nodes_similarity = 0.0 + else: nodes_similarity = (2.0 * mica_ic) / (node_a_ic + node_b_ic) - + elif similarity_measure == "Resnik": nodes_similarity = mica_ic - + elif similarity_measure == "Jiang-Conrath": nodes_similarity = (1 - (node_a_ic + node_b_ic - 2.0 * mica_ic)) - + elif similarity_measure == "Relevance": #nodes_similarity = ((2.0 * mica_ic) / (node_a_ic + node_b_ic)) * (1 - p(mica)) pass @@ -42,69 +42,78 @@ def compute_similarity_between_nodes(hpo_term_a, hpo_term_b, ic_per_nodes, node_ elif similarity_measure == "Wang": pass - + else: logger.error("An error occured while determining the similarity measure!") - + return nodes_similarity def calculate_hpo_set_similarity(hpo_graph, hpo_term_set_a, hpo_term_set_b, ic_per_nodes, node_ancestor_mapping, hpo_replacement_information): checked_term_set_a = [] checked_term_set_b = [] - + # alternatives and/or considerations are ignored for now alternatives = hpo_replacement_information["alternatives"] considerations = hpo_replacement_information["considerations"] replacements = hpo_replacement_information["replacements"] - + for term_a in hpo_term_set_a: if term_a not in hpo_graph: if term_a in replacements.keys(): checked_term_set_a.append(replacements[term_a]) logger.debug(f"{term_a} (sample) not in HPO graph! Replacement ({replacements[term_a]}) found will use this term instead!") + elif term_a in alternatives.keys(): #checked_term_set_a.extend(alternatives[term_a]) logger.debug(f"{term_a} (sample) not in HPO graph! Alternatives ({alternatives[term_a]}) found! HPO term will be skipped!") + elif term_a in considerations.keys(): #checked_term_set_a.extend(considerations[term_a]) logger.debug(f"{term_a} (sample) not in HPO graph! Considerations ({considerations[term_a]}) found! HPO term will be skipped!") + else: logger.debug(f"{term_a} (sample) not in HPO graph! HPO term will be skipped!") + else: checked_term_set_a.append(term_a) - + for term_b in hpo_term_set_b: if term_b not in hpo_graph: if term_b in replacements.keys(): checked_term_set_b.append(replacements[term_b]) logger.debug(f"{term_b} (gene) not in HPO graph! Replacement ({replacements[term_b]}) found will use this term instead!") + elif term_b in considerations.keys(): #checked_term_set_b.extend(considerations[term_b]) logger.debug(f"{term_b} (gene) not in HPO graph! Considerations ({considerations[term_b]}) found! HPO term will be skipped!") + elif term_b in alternatives.keys(): #checked_term_set_b.extend(alternatives[term_b]) logger.debug(f"{term_b} (gene) not in HPO graph! Alternatives ({alternatives[term_b]}) found! HPO term will be skipped!") + else: logger.debug(f"{term_b} (gene) not in HPO graph! 
HPO term will be skipped!") + else: checked_term_set_b.append(term_b) - + if checked_term_set_a and checked_term_set_b: similarities_a_to_b = [max([compute_similarity_between_nodes(term_a, term_b, ic_per_nodes, node_ancestor_mapping) for term_b in checked_term_set_b], default=0.0) for term_a in checked_term_set_a] #similarities_b_to_a = [max([compute_similarity_between_nodes(term_b, term_a, ic_per_nodes, node_ancestor_mapping) for term_a in checked_term_set_a], default=0.0) for term_b in checked_term_set_b] - + set_a_to_b_similarity = np.median(similarities_a_to_b) #set_b_to_a_similarity = np.median(similarities_b_to_a) #if set_a_to_b_similarity != 0.0 or set_b_to_a_similarity != 0.0: # hpo_set_similarity = (set_a_to_b_similarity + set_b_to_a_similarity) / 2 + #else: # hpo_set_similarity = 0.0 - + return set_a_to_b_similarity else: logger.debug(f"Sample HPO set ({hpo_term_set_a}) and/or Gene HPO set ({hpo_term_set_b}) was empty after checking if the terms are part of the used HPO graph! Maybe no supported term was in the set! Similarity set to 0.0!") - + return 0.0 diff --git a/aidiva/variant_prioritization/prioritize_variants.py b/aidiva/variant_prioritization/prioritize_variants.py index 34008db..31925a0 100644 --- a/aidiva/variant_prioritization/prioritize_variants.py +++ b/aidiva/variant_prioritization/prioritize_variants.py @@ -126,7 +126,7 @@ def parse_ped_file(family_file): else: logger.error("The specified family file %s is not a valid file" % (family_file)) - logger.warn("Inheritance assessment will be skipped!") + logger.warning("Inheritance assessment will be skipped!") return family_dict @@ -153,7 +153,7 @@ def parse_hpo_list(hpo_list_file): #logger.error("The specified HPO list %s is not a valid file" % (hpo_list_file)) else: - logger.warn("HPO score finalization will be skipped!") + logger.warning("HPO score finalization will be skipped!") return list(hpo_query) @@ -176,7 +176,7 @@ def parse_gene_list(gene_exclusion_file): else: logger.error("The specified gene exclusion list %s is not a valid file" % (gene_exclusion_file)) - logger.warn("No genes will be excluded during filtering!") + logger.warning("No genes will be excluded during filtering!") return genes2exclude @@ -283,6 +283,7 @@ def parallelized_variant_processing(skip_db_check, transcript_dict, family, fami if genotype_column: variant_data = check_inheritance(variant_data, family_type, family) + else: logger.info(f"Skip inheritance check!") @@ -315,6 +316,7 @@ def investigate_transcript_cds_position(variant, transcript_dict): if "-" in str(variant["CDS_position"]): cds_start = variant["CDS_position"].split("-")[0] + else: cds_start = variant["CDS_position"] @@ -327,14 +329,19 @@ def investigate_transcript_cds_position(variant, transcript_dict): if 0.95 <= start_percentage < 0.96: adjusted_score = float(predicted_score) - 0.01 + elif 0.96 <= start_percentage < 0.97: adjusted_score = float(predicted_score) - 0.02 + elif 0.97 <= start_percentage < 0.98: adjusted_score = float(predicted_score) - 0.03 + elif 0.98 <= start_percentage < 0.99: adjusted_score = float(predicted_score) - 0.04 + elif 0.99 <= start_percentage <= 1: adjusted_score = float(predicted_score) - 0.05 + else: adjusted_score = predicted_score @@ -365,11 +372,14 @@ def check_databases_for_pathogenicity_classification(variant): clinvar_classification = str(variant["CLINVAR_DETAILS"]).split("%")[0] if "(replaced" in clinvar_classification: clinvar_classification = clinvar_classification.split("(replaced")[0] + elif "/" in clinvar_classification: if 
clinvar_classification.lower() == "benign/likely_benign" or clinvar_classification.lower() == "likely_benign/benign": clinvar_classification = "likely_benign" + elif clinvar_classification.lower() == "pathogenic/likely_pathogenic" or clinvar_classification.lower() == "likely_pathogenic/pathogenic": clinvar_classification = "likely_pathogenic" + else: logger.debug(f"Found unknown ClinVar classification {clinvar_classification}") @@ -377,33 +387,44 @@ def check_databases_for_pathogenicity_classification(variant): if clinvar_classification.lower() == "benign": clinvar_score = 0.0 + elif clinvar_classification.lower() == "likely_benign": clinvar_score = 0.25 + elif clinvar_classification.lower() == "uncertain_significance": clinvar_score = 0.5 + elif clinvar_classification.lower() == "likely_pathogenic": clinvar_score = 0.75 + elif clinvar_classification.lower() == "pathogenic": clinvar_score = 1.0 + else: # ignore missing database scores (if both scores missing use unmodified AIDIVA_SCORE) clinvar_score = np.nan - + if hgmd_classification == "DM": hgmd_score = 1.0 #if float(variant["HGMD_RANKSCORE"]) > 0.5: # hgmd_score = 1.0 + elif hgmd_classification == "DM?": hgmd_score = 0.75 + #elif hgmd_classification == "DP": # hgmd_score = 0.5 + #elif hgmd_classification == "FP": # hgmd_score = 0.5 + #elif hgmd_classification == "DFP": # hgmd_score = 0.5 + # completely ignore removed entries database entries elif hgmd_classification == "R": hgmd_score = np.nan + else: # ignore missing database scores (if both scores missing use unmodified AIDIVA_SCORE) hgmd_score = np.nan @@ -411,24 +432,29 @@ def check_databases_for_pathogenicity_classification(variant): # log a warning message if hgmd score and clinvar_score are both not missing but differ if (hgmd_score != clinvar_score) and not (np.isnan(hgmd_score) or np.isnan(clinvar_score)): - logger.warn(f"The scores of the used databases (ClinVar, HGMD) are different: {clinvar_score}, {hgmd_score} (Variant: {variant['#CHROM']}:{variant['POS']}_{variant['REF']}_{variant['ALT']}) \n If the HGMD database was used during the annotation you may want to further investigate this matter, otherwise you can ignore this warning since we just use the predicted value as default value if the entry is missing.") + logger.warning(f"The scores of the used databases (ClinVar, HGMD) are different: {clinvar_score}, {hgmd_score} (Variant: {variant['#CHROM']}:{variant['POS']}_{variant['REF']}_{variant['ALT']}) \n If the HGMD database was used during the annotation you may want to further investigate this matter, otherwise you can ignore this warning since we just use the predicted value as default value if the entry is missing.") if np.isnan(clinvar_score) and not np.isnan(hgmd_score): database_score = hgmd_score + elif np.isnan(hgmd_score) and not np.isnan(clinvar_score): database_score = clinvar_score + elif not np.isnan(hgmd_score) and not np.isnan(clinvar_score): database_score = (hgmd_score + clinvar_score) / 2 + else: database_score = np.nan - + if not np.isnan(float(variant["AIDIVA_SCORE"])): if not np.isnan(database_score): # update the predicted score with known information from the databases improved_score = 0.8 * float(variant["AIDIVA_SCORE"]) + 0.2 * database_score + else: # use unmodified AIDIVA_SCORE if no database entries were found improved_score = float(variant["AIDIVA_SCORE"]) + else: improved_score = np.nan @@ -437,7 +463,6 @@ def check_databases_for_pathogenicity_classification(variant): def compute_hpo_relatedness_and_final_score(variant, genes2exclude, 
gene_2_HPO, hgnc_2_gene, gene_2_interacting, HPO_graph, HPO_query, ic_per_nodes, node_ancestor_mapping, hpo_replacement_information): if HPO_query: - #if np.isnan(variant["AIDIVA_SCORE"]) and ((str(variant["rf_score"]) == "nan") or (str(variant["rf_score"]) == "")) and ((str(variant["ada_score"]) == "nan") or (str(variant["ada_score"]) == "")): if np.isnan(variant["AIDIVA_SCORE"]) and ((str(variant["SpliceAI"]) == "nan") or (str(variant["SpliceAI"]) == "")): final_score = np.nan hpo_relatedness = np.nan @@ -445,22 +470,13 @@ def compute_hpo_relatedness_and_final_score(variant, genes2exclude, gene_2_HPO, else: if np.isnan(variant["AIDIVA_SCORE"]): - # if both scores are present use maximum to benefit of the strengths of each model - #if ((str(variant["rf_score"]) != "nan") and (str(variant["rf_score"]) != "")) and ((str(variant["ada_score"]) != "nan") and (str(variant["ada_score"]) != "")): - # pathogenictiy_prediction = max([float(variant["rf_score"]), float(variant["ada_score"])], default=np.nan) - #elif ((str(variant["rf_score"]) != "") and (str(variant["rf_score"]) != "nan")) and ((str(variant["ada_score"]) == "") or (str(variant["ada_score"]) == "nan")): - # pathogenictiy_prediction = float(variant["rf_score"]) - #elif ((str(variant["ada_score"]) != "") and (str(variant["ada_score"]) != "nan")) and ((str(variant["rf_score"]) == "") or (str(variant["rf_score"]) == "nan")): - # pathogenictiy_prediction = float(variant["ada_score"]) - #else: - # pathogenictiy_prediction = np.nan - if ((str(variant["SpliceAI"]) != "nan") and (str(variant["SpliceAI"]) != "")): pathogenictiy_prediction = float(variant["SpliceAI"]) + else: pathogenictiy_prediction = np.nan - - logger.debug("Using SpliceAI prediction instead of AIdiva prediction for splicing variant!") + + logger.debug("Using SpliceAI prediction instead of aiDIVA prediction for splicing variant!") else: pathogenictiy_prediction = float(variant["AIDIVA_SCORE"]) @@ -473,6 +489,7 @@ def compute_hpo_relatedness_and_final_score(variant, genes2exclude, gene_2_HPO, if variant_gene in gene_2_interacting.keys(): interacting_genes = [gene_interaction["interacting_gene"] for gene_interaction in gene_2_interacting[variant_gene]] + else: interacting_genes = [] @@ -523,26 +540,12 @@ def compute_hpo_relatedness_and_final_score(variant, genes2exclude, gene_2_HPO, else: if np.isnan(variant["AIDIVA_SCORE"]): - #if ((str(variant["rf_score"]) == "") or (str(variant["rf_score"]) == "nan")) and ((str(variant["ada_score"]) == "") or (str(variant["ada_score"]) == "nan")): if ((str(variant["SpliceAI"]) == "") or (str(variant["SpliceAI"]) == "nan")): pathogenictiy_prediction = np.nan else: - #if (str(variant["rf_score"]) != "") and (str(variant["rf_score"]) != "nan"): - # rf_score = float(variant["rf_score"]) - - #else: - # rf_score = np.nan - - #if (str(variant["ada_score"]) != "") and (str(variant["ada_score"]) != "nan"): - # ada_score = float(variant["ada_score"]) - - #else: - # ada_score = np.nan - - #pathogenictiy_prediction = max([rf_score, ada_score], default=np.nan) pathogenictiy_prediction = float(variant["SpliceAI"]) - logger.debug("Using SpliceAI prediction instead of AIdiva prediction for splicing variant!") + logger.debug("Using SpliceAI prediction instead of aiDIVA prediction for splicing variant!") else: pathogenictiy_prediction = float(variant["AIDIVA_SCORE"]) @@ -564,6 +567,7 @@ def check_inheritance(variant_data, family_type="SINGLE", family=None): for group in variant_data_grouped: check_compound_single(group, variant_columns) + variant_data = 
pd.concat(variant_data_grouped) elif family_type == "TRIO": @@ -585,19 +589,23 @@ def check_inheritance(variant_data, family_type="SINGLE", family=None): for name in family.keys(): if family[name] == 1: affected_child = name + elif family[name] == 0: if not parent_1: parent_1 = name continue + elif not parent_2: parent_2 = name continue + else: logger.error("There was a problem processing the loaded PED file") if affected_child and parent_1 and parent_2: for group in variant_data_grouped: check_compound(group, affected_child, parent_1, parent_2) + else: logger.error(f"Something went wrong while checking the TRIO for compound heterozygosity!") @@ -605,8 +613,8 @@ def check_inheritance(variant_data, family_type="SINGLE", family=None): else: logger.error("If family type (TRIO) is used a proper PED file defining the family relations is required!") - logger.warn("Inheritance assessment will be skipped!") - + logger.warning("Inheritance assessment will be skipped!") + elif (family_type == "MULTI") and (family is not None): if family is not None: variant_data["DOMINANT"] = variant_data.apply(lambda variant: check_dominant(variant, family), axis=1) @@ -615,11 +623,11 @@ def check_inheritance(variant_data, family_type="SINGLE", family=None): else: logger.error("If family type (MULTI) is used a proper PED file defining the family relations (if known) is required!") - logger.warn("Inheritance assessment will be skipped!") - + logger.warning("Inheritance assessment will be skipped!") + else: logger.error("Unsupported family type (%s) was given!" % family_type) - logger.warn("Inheritance assessment will be skipped!") + logger.warning("Inheritance assessment will be skipped!") variant_columns = variant_data.columns variant_data["INHERITANCE"] = variant_data.apply(lambda variant: add_inheritance_mode(variant, variant_columns), axis=1) @@ -710,7 +718,8 @@ def homopolymer_filter(sequence): def check_filters(variant, genes2exclude, HPO_query, reference): - variant_genes = re.sub("\(.*?\)", "", str(variant["SYMBOL"])) + #variant_genes = re.sub("\(.*?\)", "", str(variant["SYMBOL"])) + variant_genes = str(variant["SYMBOL"]) genenames = set(variant_genes.split(";")) in_fasta = pysam.FastaFile(reference) @@ -724,12 +733,14 @@ def check_filters(variant, genes2exclude, HPO_query, reference): try: seg_dup = float(variant[duplication_identifier]) + except Exception as e: logger.debug("Use 0.0 for missing segment duplication entries!") seg_dup = 0.0 try: maf = float(variant["MAX_AF"]) + except Exception as e: logger.debug("Allele frequency entry could not be identified, use 0.0 instead!") maf = 0.0 @@ -762,29 +773,31 @@ def check_filters(variant, genes2exclude, HPO_query, reference): filter_comment = "tandem repeat" return filter_passed, filter_comment - + # let variants with a high FINAL_AIDIVA_SCORE (>=0.7) pass to be more sensitive if (seg_dup > 0) and (float(variant["FINAL_AIDIVA_SCORE"]) < 0.7): filter_passed = 0 # segmental duplication filter_comment = "segmental duplication" return filter_passed, filter_comment - + # let variants with a high FINAL_AIDIVA_SCORE (>=0.7) pass to be more sensitive if ((len(variant["REF"]) > 1 or len(variant["ALT"]) > 1)) and (float(variant["FINAL_AIDIVA_SCORE"]) < 0.7): # make sure to use the correct internal chromsome notation (with chr) if "chr" in str(variant["#CHROM"]): chrom_id = str(variant["#CHROM"]) + else: chrom_id = "chr" + str(variant["#CHROM"]) - + # Get sequence context (vicinity) of a variant for homopolymer check (5 bases up- and down-stream) # Get fewer bases when 
variant is at the start or end of the sequence if "chr" in str(variant["#CHROM"]): chrom_id = str(variant["#CHROM"]) + else: chrom_id = "chr" + str(variant["#CHROM"]) - + num_bases = 5 pos_start = max(int(variant["POS"]) - (num_bases + 1), 1) pos_end = min(int(variant["POS"]) + num_bases, in_fasta.get_reference_length(chrom_id)) @@ -792,9 +805,10 @@ def check_filters(variant, genes2exclude, HPO_query, reference): try: sequence_context = in_fasta.fetch(chrom_id, pos_start, pos_end) sequence_context = sequence_context.upper() + except FileNotFoundError: sequence_context = '.' - + homopolymer_flag, low_complexity_flag = homopolymer_filter(sequence_context) if low_complexity_flag and homopolymer_flag: @@ -922,6 +936,7 @@ def check_denovo(variant, family): # sample info complete? if name in check_samples: check_samples[name] = 1 + else: logger.debug(f"It seems that your pedigree is incomplete. The following sample: {name} could not be found in the pedigree!") @@ -953,7 +968,7 @@ def check_denovo(variant, family): else: # reject missing genotype here or beforehand pass - + for vals in check_samples.values(): if vals == 0: judgement = 0 @@ -975,6 +990,7 @@ def check_dominant(variant, family): if name in check_samples: check_samples[name] = 1 + else: logger.debug(f"It seems that your pedigree is incomplete. The following sample: {name} could not be found in the pedigree!") @@ -1008,7 +1024,7 @@ def check_dominant(variant, family): elif zygosity == "1/1" and family[name] == 0: judgement = 0 break - + else: # reject missing genotype here or beforehand pass @@ -1044,6 +1060,7 @@ def check_recessive(variant, family, family_type): if name in check_samples: check_samples[name] = 1 + else: logger.debug(f"It seems that your pedigree is incomplete. The following sample: {name} could not be found in the pedigree!") @@ -1080,7 +1097,7 @@ def check_recessive(variant, family, family_type): else: # reject missing genotype here or beforehand pass - + for vals in check_samples.values(): if vals == 0: judgement = 0 @@ -1098,7 +1115,7 @@ def check_recessive_single(variant, variant_columns): # recessive variants have to be homozygous in affected samples if (variant_genotype == "1/1"): is_recessive = 1 - + return is_recessive @@ -1118,6 +1135,7 @@ def check_xlinked(variant, family): if name in check_samples: check_samples[name] = 1 + else: logger.debug(f"It seems that your pedigree is incomplete. The following sample: {name} could not be found in the pedigree!") @@ -1152,7 +1170,7 @@ def check_xlinked(variant, family): else: # reject missing genotype here or beforehand pass - + # sanity check het_checker = 0 hom_checker = 0 @@ -1161,13 +1179,14 @@ def check_xlinked(variant, family): # mother if values == "0/1": het_checker = 1 - + # father if values == "0/0": hom_checker = 1 if het_checker == 1 and hom_checker == 1: judgement = 1 + else: judgement = 0 @@ -1187,14 +1206,52 @@ def check_xlinked(variant, family): parser = argparse.ArgumentParser(description = "Filter variants and finalize the AIDIVA_SCORE based on the given HPO terms (if this information is present)") parser.add_argument("--in_file", type=str, dest="in_file", required=True, help="Tab separated input annotated and scored file [required]") parser.add_argument("--out_file", type=str, dest="out_filename", required=True, help="Name to save the results [required]") - parser.add_argument("--family", type=str, dest="family", required=False, help="Tab separated list of samples annotated with affection status. 
[required]") - parser.add_argument("--family_type", type=str, choices=["TRIO", "FAMILY", "SINGLE"], dest="family_type", required=False, help="Choose if the data you provide is a trio or a larger family [required]") + parser.add_argument("--family", type=str, dest="family", required=False, help="Tab separated list of samples annotated with affection status.") + parser.add_argument("--family_type", type=str, choices=["TRIO", "FAMILY", "SINGLE"], dest="family_type", required=False, help="Choose if the data you provide is a trio or a larger family") parser.add_argument("--gene_exclusion", type=str, dest="gene_exclusion_list", required=False, help="List of genes that should be excluded in the prioritization") parser.add_argument("--hpo_list", type=str, dest="hpo_list", default=None, required=False, help="List of HPO terms that are observed in the patient. These terms are used to adjust the AIDIVA_SCORE\n") - parser.add_argument("--hpo_resources", type=str, dest="hpo_resources", default="../../data/", required=True, help="Folder where the HPO resources (HPO_graph,...) are found\n") + #parser.add_argument("--hpo_resources", type=str, dest="hpo_resources", default="../../data/", required=True, help="Folder where the HPO resources (HPO_graph,...) are found\n") + parser.add_argument("--genome_build", type=str, dest="genome_build", default="GRCh38", required=True, help="Version of the genome build to use [GRCh37, GRCH38]\n") + parser.add_argument("--feature_list", type=str, dest="feature_list", required=True, help="Feature list used in the AIDIVA_SCORE prediction.\n") + parser.add_argument("--reference", type=str, dest="reference", required=True, help="Path to the refernce genome.\n") + parser.add_argument("--skip_db_check", action="store_true", required=False, help="Skip db check.\n") + parser.add_argument("--threads", type=str, dest="threads", default=1, required=False, help="Number of threads to use.\n") args = parser.parse_args() input_data = pd.read_csv(args.in_file, sep="\t", low_memory=False) - prioritized_variants = prioritize_variants(input_data, args.hpo_resources, args.family, args.family_type, args.hpo_list, args.gene_exclusion_list, args.low_conf_region) + if args.family: + family_file = args.family + + else: + family_file = None + + if args.family_type and args.family_file is not None: + family_type = args.family_type + + else: + family_type="SINGLE" + + if args.genome_build: + genome_build = args.genome_build + + else: + genome_build = "GRCh38" + + internal_parameter_dict = {"gene2hpo-mapping": "../../data/hpo_resources/gene2hpo.json", + "gene2interacting-mapping": "../../data/hpo_resources/gene2interacting.json", + "transcript2length-mapping": "../../data/hpo_resources/grch38transcript2length.json", + "hgnc2gene-mapping": "../../data/hpo_resources/hgnc2gene.json", + "hpo-graph": "../../data/hpo_resources/hpo_graph.gexf", + "hpo2replacement-mapping": "../../data/hpo_resources/hpo2replacement.json"} + + if args.threads: + num_threads = int(args.threads) + + else: + num_threads = 1 + + feature_list = args.feature_list.split(",") + + prioritized_variants = prioritize_variants(input_data, internal_parameter_dict, args.reference, num_threads, genome_build, feature_list, args.skip_db_check, family_file, family_type, args.hpo_list, args.gene_exclusion_list) prioritized_variants.to_csv(args.out_filename, sep="\t", index=False) diff --git a/aidiva/variant_scoring/score_variants.py b/aidiva/variant_scoring/score_variants.py index cd5cd3f..41480f2 100644 --- 
a/aidiva/variant_scoring/score_variants.py +++ b/aidiva/variant_scoring/score_variants.py @@ -27,9 +27,10 @@ "REVEL": 0.28019263637740743, "PolyPhen": 0.5169017014355943, "oe_lof": 0.53667483333333332, + "oe_mis": 0.8742450611581991, ## currently not used + "oe_syn": 1.0262797714053697, ## currently not used "CAPICE": 0.04487945928377704, "ALPHA_MISSENSE_SCORE": 0.4074365673385914} - #"EVE_SCORE": 0.4866676824933756} MEDIAN_DICT = {"MutationAssessor": 1.87, "CONDEL": 0.4805749233199981, @@ -40,9 +41,10 @@ "REVEL": 0.193, "PolyPhen": 0.547, "oe_lof": 0.48225, + "oe_mis": 0.87952, ## currently not used + "oe_syn": 1.004, ## currently not used "CAPICE": 0.0006, "ALPHA_MISSENSE_SCORE": 0.2509} - #"EVE_SCORE": 0.4946792317600748} SUPPORTED_CODING_VARIANTS = ["stop_gained", "stop_lost", @@ -66,7 +68,8 @@ "splice_donor_region_variant", "splice_polypyrimidine_tract_variant"] -random_seed = 14038 +RANDOM_SEED = 14038 + logger = logging.getLogger(__name__) @@ -87,9 +90,9 @@ def read_input_data(input_file): # make sure to use the same features, that were used for the training of the model -# fill SegDup missing values with -> 0 -# fill ABB_SCORE missing values with -> 0 # fill Allele Frequency missing values with -> 0 +# fill homAF missing values with -> 0 +# fill HIGH_IMPACT missing values with -> 0 # fill missing values from other features with -> median or mean def prepare_input_data(feature_list, allele_frequency_list, input_data): for feature in feature_list: @@ -106,39 +109,16 @@ def prepare_input_data(feature_list, allele_frequency_list, input_data): elif (feature == "IS_INDEL"): continue - #elif feature == "CAPICE": - # TODO: compute mean and median of CAPICE database - #input_data[feature] = input_data[feature].fillna(0.5) - elif "SIFT" == feature: input_data[feature] = input_data.apply(lambda row: min([float(value) for value in str(row[feature]).split("&") if ((value != ".") and (value != "nan") and (value != ""))], default=np.nan), axis=1) input_data[feature] = input_data[feature].fillna(MEDIAN_DICT["SIFT"]) - elif feature == "oe_lof": + elif feature == "oe_lof" or feature == "oe_mis" or feature == "oe_syn": input_data[feature] = input_data.apply(lambda row: min([float(value) for value in str(row[feature]).split("&") if ((value != ".") and (value != "nan") and (value != "") and (":" not in value) and ("-" not in value))], default=np.nan), axis=1) - input_data[feature] = input_data[feature].fillna(MEDIAN_DICT["oe_lof"]) - - #elif feature == "REVEL": - # if input_data["IMPACT"] == "HIGH": - # input_data[feature] = input_data[feature].fillna(1.0) - # else: - # input_data[feature] = input_data[feature].fillna(1.0) - - #elif feature == "EVE_SCORE": - # if input_data["IMPACT"] == "HIGH": - # input_data[feature] = input_data[feature].fillna(1.0) - - # else: - # input_data[feature] = input_data[feature].fillna(MEDIAN_DICT["EVE_SCORE"]) - - #elif feature == "ALPHA_MISSENSE_SCORE": - # if input_data["IMPACT"] == "HIGH" and not "splice" in input_data["Consequence"]: - # input_data[feature] = input_data[feature].fillna(1.0) - - # else: - # input_data[feature] = input_data[feature].fillna(MEDIAN_DICT["ALPHA_MISSENSE_SCORE"]) + input_data[feature] = input_data[feature].fillna(MEDIAN_DICT[feature]) - elif feature == "REVEL" or feature == "ALPHA_MISSENSE_SCORE": #or feature == "EVE_SCORE": + # REVEL and ALPHA_MISSENSE score only missense variants so for missing high impact variants we fill with 1 instead of median/mean value + elif feature == "REVEL" or feature == "ALPHA_MISSENSE_SCORE": 
input_data[feature] = input_data.apply(lambda row: max([float(value) for value in str(row[feature]).split("&") if ((value != ".") & (value != "nan") & (value != "") & (not ":" in value) & (not "-" in value))], default=np.nan), axis=1) input_data.loc[(~(input_data["MOST_SEVERE_CONSEQUENCE"].str.contains("|".join(SPLICE_VARIANTS))) & (input_data["IMPACT"] == "HIGH") & (input_data[feature].isna())), feature] = 1.0 input_data[feature] = input_data[feature].fillna(MEDIAN_DICT[feature]) @@ -194,7 +174,7 @@ def parallelize_dataframe_processing(dataframe, function, num_cores): return dataframe -def perform_pathogenicity_score_prediction(rf_model, input_data, allele_frequency_list, feature_list, num_cores): +def perform_pathogenicity_score_prediction(rf_model, input_data, allele_frequency_list, feature_list, num_cores=1): rf_model = import_model(rf_model) # compute maximum Minor Allele Frequency (MAF) from population frequencies if MAX_AF not present @@ -218,18 +198,12 @@ def perform_pathogenicity_score_prediction(rf_model, input_data, allele_frequenc # set splicing donor/acceptor variants to NaN if not additionally a supported consequence is reported for the variant # add filter for splice_region variants - # later in the pipeline the score from dbscSNV for splicing variants will be used - # TODO: change to use SpliceAI instead of dbscSNV for splice variants - #predicted_data.loc[(~(any(term for term in SUPPORTED_CODING_VARIANTS if term in predicted_data["Consequence"])) & (~(predicted_data["rf_score"].isna()) | ~(predicted_data["ada_score"].isna()))), "AIDIVA_SCORE"] = np.nan - #predicted_data.loc[(~(any(term for term in SUPPORTED_CODING_VARIANTS if term in predicted_data["Consequence"])) & (~(predicted_data["SpliceAI"].isna()))), "AIDIVA_SCORE"] = np.nan - #predicted_data.loc[(~(predicted_data["Consequence"].str.contains("|".join(SUPPORTED_CODING_VARIANTS))) & (~(predicted_data["SpliceAI"].isna()))), "AIDIVA_SCORE"] = np.nan + # later in the pipeline the score from SpliceAI for splicing variants will be used predicted_data.loc[(~(predicted_data["MOST_SEVERE_CONSEQUENCE"].str.contains("|".join(SUPPORTED_CODING_VARIANTS))) & (predicted_data["MOST_SEVERE_CONSEQUENCE"].str.contains("|".join(SPLICE_VARIANTS)))), "AIDIVA_SCORE"] = np.nan - #predicted_data.loc[((predicted_data["Consequence"].str.contains("splice")) & (~(predicted_data["SpliceAI"].isna()))), "AIDIVA_SCORE"] = np.nan # set synonymous variants to 0.0 if they are not at a splicing site # TODO check how to handle overlapping consequences - #predicted_data.loc[(predicted_data["Consequence"].str.contains("synonymous") & ~(predicted_data["Consequence"].str.contains("splice")) & ~(any(term for term in SUPPORTED_CODING_VARIANTS if term in predicted_data["Consequence"]))), "AIDIVA_SCORE"] = 0.0 predicted_data.loc[(predicted_data["MOST_SEVERE_CONSEQUENCE"].str.contains("|".join(SYNONYMOUS_VARIANTS)) & ~(predicted_data["MOST_SEVERE_CONSEQUENCE"].str.contains("|".join(SUPPORTED_CODING_VARIANTS))) & ~(predicted_data["MOST_SEVERE_CONSEQUENCE"].str.contains("|".join(SPLICE_VARIANTS)))), "AIDIVA_SCORE"] = 0.0 return predicted_data @@ -242,6 +216,7 @@ def perform_pathogenicity_score_prediction(rf_model, input_data, allele_frequenc parser.add_argument("--model", type=str, dest="model", metavar="model.pkl", required=True, help="Specifies the name of the trained model to import\n") parser.add_argument("--feature_list", type=str, dest="feature_list", metavar="feature1,feature2,feature3", required=True, help="Comma separated list of the features used to train 
the model\n") parser.add_argument("--allele_frequency_list", type=str, dest="allele_frequency_list", metavar="frequency1,frequecy2,frequency3", required=False, help="Comma separated list of allele frequency sources that should be used as basis to get the maximum allele frequency\n") + parser.add_argument("--threads", type=str, dest="threads", metavar="1", required=False, help="Number of threads to use.\n") args = parser.parse_args() input_data = read_input_data(args.in_data) @@ -253,5 +228,11 @@ def perform_pathogenicity_score_prediction(rf_model, input_data, allele_frequenc else: allele_frequency_list = [] - predicted_data = perform_pathogenicity_score_prediction(args.model, input_data, allele_frequency_list, feature_list) + if args.threads: + num_threads = int(args.threads) + + else: + num_threads = 1 + + predicted_data = perform_pathogenicity_score_prediction(args.model, input_data, allele_frequency_list, feature_list, num_threads) predicted_data.to_csv(args.out_data, index=False, sep="\t", na_rep="NA") diff --git a/data/aiDIVA_example_configuration_annotated.yaml b/data/aiDIVA_example_configuration_annotated.yaml index c85321b..0acab26 100644 --- a/data/aiDIVA_example_configuration_annotated.yaml +++ b/data/aiDIVA_example_configuration_annotated.yaml @@ -1,18 +1,20 @@ -## aiDIVA -- augmented intelligence-based Disease Variant Analysis +## aiDIVA -- augmented intelligence-based disease variant analysis # Configuration file specifiying all the parameters that are needed to run aiDIVA in the different modes # if you modify the key names make sure to also update them in the run_aiDIVA.py script --- -Assembly-Build: GRCh38 +Assembly-Build: GRCh37 +Canonical-Transcripts: "/mnt/storage2/users/ahboced1/phd_project/mane_grch37_v13_transcript_ids.txt" Analysis-Input: # path to the reference assembly used during the expansion of the indels - # instead of GRCh38 you can also use hg38 the differences between the two should not matter here - ref-path: /GRCh38.fa + # instead of GRCh37 you can also use hg19 the differences between the two should not matter here + ref-path: /GRCh37.fa - # trained scoring model (random forest) used to predict the pathogenicity score + # trained scoring models used to predict the pathogenicity score # if no trained model is present you can use the train_model.py script to train a new custom model - scoring-model: /rf_model_combined.pkl + scoring-model-snp: /rf_model_snp.pkl + scoring-model-indel: /rf_model_indel.pkl prioritization-information: # Identifier to get the score from the annotated file @@ -22,16 +24,23 @@ Analysis-Input: Model-Features: - # List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) + # List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) if not wanted use a empty list "[]" instead + # NOTE: currently not used, we use directly the MAX_AF annotation from VEP allele-frequency-list: - gnomAD_AFR_AF - - gnomAD_AMR_AF + - gnomAD_ASJ_AF - gnomAD_EAS_AF + - gnomAD_FIN_AF - gnomAD_NFE_AF + - gnomAD_OTH_AF - gnomAD_SAS_AF - #- gnomAD_ASJ_AF - #- gnomAD_FIN_AF - #- gnomAD_OTH_AF + - AA_AF + - EA_AF + - AFR_AF + - AMR_AF + - EAS_AF + - EUR_AF + - SAS_AF # List containing the names of the features used for the model training # the exact order of the features is crucial (make sure to have the exact same order as in the training step) @@ -54,9 +63,6 @@ Model-Features: - oe_lof - homAF - CAPICE 
- ALPHA_MISSENSE_SCORE - HIGH_IMPACT - IS_INDEL # DO NOT CHANGE IF YOU DON'T KNOW WHAT YOU DO @@ -64,7 +70,7 @@ Model-Features: Internal-Parameters: gene2hpo-mapping: ../../data/hpo_resources/gene2hpo.json gene2interacting-mapping: ../../data/hpo_resources/gene2interacting.json - transcript2length-mapping: ../../data/hpo_resources/grch38transcript2length.json + transcript2length-mapping: ../../data/hpo_resources/grch37transcript2length.json hgnc2gene-mapping: ../../data/hpo_resources/hgnc2gene.json hpo-graph: ../../data/hpo_resources/hpo_graph.gexf hpo2replacement-mapping: ../../data/hpo_resources/hpo2replacement.json diff --git a/data/aiDIVA_example_configuration_annotated_grch38.yaml b/data/aiDIVA_example_configuration_annotated_grch38.yaml new file mode 100644 index 0000000..066e057 --- /dev/null +++ b/data/aiDIVA_example_configuration_annotated_grch38.yaml @@ -0,0 +1,71 @@ +## aiDIVA -- augmented intelligence-based disease variant analysis +# Configuration file specifying all the parameters that are needed to run aiDIVA in the different modes +# if you modify the key names make sure to also update them in the run_aiDIVA.py script +--- +Assembly-Build: GRCh38 +Canonical-Transcripts: "/mnt/storage2/users/ahboced1/phd_project/mane_grch38_v13_transcript_ids.txt" + + +Analysis-Input: + # path to the reference assembly used during the expansion of the indels + # instead of GRCh38 you can also use hg38 the differences between the two should not matter here + ref-path: /GRCh38.fa + + # trained scoring model (random forest) used to predict the pathogenicity score + # if no trained model is present you can use the train_model.py script to train a new custom model + scoring-model: /rf_model_combined.pkl + + prioritization-information: + # Identifier to get the score from the annotated file + cadd-identifier: CADD_PHRED + repeat-identifier: simpleRepeat + duplication-identifier: segmentDuplication + + +Model-Features: + # List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) + allele-frequency-list: + - gnomAD_AFR_AF + - gnomAD_AMR_AF + - gnomAD_EAS_AF + - gnomAD_NFE_AF + - gnomAD_SAS_AF + #- gnomAD_ASJ_AF + #- gnomAD_FIN_AF + #- gnomAD_OTH_AF + + # List containing the names of the features used for the model training + # the exact order of the features is crucial (make sure to have the exact same order as in the training step) + feature-list: + - SIFT + - PolyPhen + - CADD_PHRED + - REVEL + - MAX_AF + - EIGEN_PHRED + - CONDEL + - FATHMM_XF + - MutationAssessor + - phastCons_mammal + - phastCons_primate + - phastCons_vertebrate + - phyloP_mammal + - phyloP_primate + - phyloP_vertebrate + - oe_lof + - homAF + - CAPICE + - ALPHA_MISSENSE_SCORE + - HIGH_IMPACT + - IS_INDEL + + +# DO NOT CHANGE IF YOU DON'T KNOW WHAT YOU DO +# In case you change the following parameters (e.g. 
if you recomputed the hpo resources) make sure to specify the full path to the newly created resources +Internal-Parameters: + gene2hpo-mapping: ../../data/hpo_resources/gene2hpo.json + gene2interacting-mapping: ../../data/hpo_resources/gene2interacting.json + transcript2length-mapping: ../../data/hpo_resources/grch38transcript2length.json + hgnc2gene-mapping: ../../data/hpo_resources/hgnc2gene.json + hpo-graph: ../../data/hpo_resources/hpo_graph.gexf + hpo2replacement-mapping: ../../data/hpo_resources/hpo2replacement.json diff --git a/data/aiDIVA_example_configuration_with_annotation.yaml b/data/aiDIVA_example_configuration_with_annotation.yaml index b96b872..5ea5ac0 100644 --- a/data/aiDIVA_example_configuration_with_annotation.yaml +++ b/data/aiDIVA_example_configuration_with_annotation.yaml @@ -1,18 +1,20 @@ -## aiDIVA -- augmented intelligence-based Disease Variant Analysis +## aiDIVA -- augmented intelligence-based disease variant analysis # Configuration file specifiying all the parameters that are needed to run aiDIVA in the different modes # if you modify the key names make sure to also update them in the run_aiDIVA.py script --- -Assembly-Build: GRCh38 +Assembly-Build: GRCh37 +Canonical-Transcripts: "/mnt/storage2/users/ahboced1/phd_project/mane_grch37_v13_transcript_ids.txt" Analysis-Input: # path to the reference assembly used during the expansion of the indels - # instead of GRCh38 you can also use hg38 the differences between the two should not matter here - ref-path: /GRCh38.fa + # instead of GRCh37 you can also use hg19 the differences between the two should not matter here + ref-path: /GRCh37.fa - # trained scoring model (random forest) used to predict the pathogenicity score + # trained scoring models used to predict the pathogenicity score # if no trained model is present you can use the train_model.py script to train a new custom model - scoring-model: /rf_model_combined.pkl + scoring-model-snp: /rf_model_snp.pkl + scoring-model-indel: /rf_model_indel.pkl prioritization-information: # Identifier to get the score from the annotated file @@ -23,15 +25,22 @@ Analysis-Input: Model-Features: # List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) + # NOTE: currently not used, we use directly the MAX_AF annotation from VEP allele-frequency-list: - gnomAD_AFR_AF - - gnomAD_AMR_AF + - gnomAD_ASJ_AF - gnomAD_EAS_AF + - gnomAD_FIN_AF - gnomAD_NFE_AF + - gnomAD_OTH_AF - gnomAD_SAS_AF - #- gnomAD_ASJ_AF - #- gnomAD_FIN_AF - #- gnomAD_OTH_AF + - AA_AF + - EA_AF + - AFR_AF + - AMR_AF + - EAS_AF + - EUR_AF + - SAS_AF # List containing the names of the features used for the model training # the exact order of the features is crucial (make sure to have the exact same order as in the training step) @@ -54,69 +63,62 @@ Model-Features: - oe_lof - homAF - CAPICE - - ALPHA_MISSENSE_SCORE - - HIGH_IMPACT - - IS_INDEL Annotation-Resources: # VEP: path to the VEP executable - vep: /ensembl-vep-release-109.3/ + vep: /ensembl-vep-release-103.1/vep - # path to the direcrory where the additional perl libraries are installed during the installation step - vep-cpan: /perl_cpan/ - - # VEP Plugin direcrory - vep-plugin-path: /vep_plugins/ - - # VEP Cache directory - vep-cache: /ensembl-vep-109/ + # Cache directory and plugin directory + vep-cache: /ensembl-vep-103/cache # ngs-bits install path - ngs-bits: /ngs-bits-2023_06/ + ngs-bits: /ngs-bits/ # low-confidence filter file - low-confidence: 
/grch38_low_conf_region.bed + low-confidence: /grch37_low_conf_regions.bed # BED like files here the key of the dictionary is used as the name to present the feature in the annotated file bed-files: - simpleRepeat: /hg38_simpleRepeat.bed - segmentDuplication: /hg38_genomicSuperDups.bed - oe_lof: /gnomAD_OE_grch38_sorted.bed - repeatMasker: /RepeatMasker_GRCh38.bed - - plugin-files: - AlphaMissense: /AlphaMissense_hg38.tsv.gz + simpleRepeat: /grch37_simpleRepeat.bed + segmentDuplication: /grch37_genomicSuperDups.bed + oe_lof: /gnomAD_oe_lof_sorted.bed + oe_mis: /gnomAD_oe_mis_grch38_sorted.bed + oe_syn: /gnomAD_oe_syn_grch38_sorted.bed + repeatMasker: /grch37_repeatmasker.bed + + # OPTIONAL: only used if DB check is not skipped + # NOTE: currently not implemented + # NOTE: the following annotation needs a valid license + omim: /omim.bed # VCF files the key of the dictionary is used to identify the feature in the INFO column of hte VCF file vcf-files: - EIGEN_PHRED: /grch38liftOver_precomputed_Eigen-phred_coding_chrom1-22.vcf.gz - CONDEL: /grch38liftOver_precomputed_Condel.vcf.gz - FATHMM_XF: /hg38_fathmm_xf_coding.vcf.gz - MutationAssessor: /grch38liftOver_precomputed_MutationAssessor.vcf.gz - gnomAD: /gnomAD_genome_v3.1.2_GRCh38.vcf.gz - CADD: /CADD_SNVs_1.6_GRCh38.vcf.gz - dbscSNV: /dbscSNV1.1_GRCh38.vcf.gz - CAPICE: /grch38liftOver_capice_v1_snvs.vcf.gz - REVEL: /REVEL_1.3.vcf.gz - SpliceAI-SNV: /spliceai_scores.masked.snv.hg38.vcf.gz - SpliceAI-InDel: /spliceai_scores.masked.indel.hg38.vcf.gz - + EIGEN_PHRED: /grch37_eigen_phred_coding.vcf.gz + CONDEL: /grch37_fannsDB_Condel.vcf.gz + FATHMM_XF: /grch37_fathmm_xf_coding.vcf.gz + MutationAssessor: /grch37_precomputed_MutationAssessor.vcf.gz + gnomAD: /grch37_gnomAD_genomes_r211.vcf.gz + CADD: /grch37_CADD_snvs_v16.vcf.gz + dbscSNV: /grch37_dbscSNV_scores.vcf.gz + CAPICE: /capice_v1_grch37_snvs.vcf.gz + REVEL: /grch37_revel_v13.vcf.gz + # OPTIONAL: only used if DB check is not skipped - clinvar: /clinvar_GRCh38.vcf.gz - + clinvar: /grch37_clinvar.vcf.gz + # OPTIONAL: only used if DB check is not skipped # NOTE: the following annotation needs a valid license hgmd: /HGMD.vcf.gz # Bigwig files the key of the dictionary is used as the name to present the feature in the annotated file bigwig-files: - phastCons_mammal: /hg38_phastCons17way_primate.bw - phastCons_primate: /hg38_phastCons30way_mammal.bw - phastCons_vertebrate: /hg38_phastCons100way_vertebrate.bw - phyloP_mammal: /hg38_phyloP17way_primate.bw - phyloP_primate: /hg38_phyloP30way_mammal.bw - phyloP_vertebrate: /hg38_phyloP100way_vertebrate.bw + phastCons_mammal: /primates.phastCons46way.bw + phastCons_primate: /placentalMammals.phastCons46way.bw + phastCons_vertebrate: /vertebrate.phastCons46way.bw + phyloP_mammal: /primates.phyloP46way.bw + phyloP_primate: /placentalMammals.phyloP46way.bw + phyloP_vertebrate: /vertebrate.phyloP46way.bw # DO NOT CHANGE THE FOLLOWING IF YOU DON'T KNOW WHAT YOU DO @@ -124,7 +126,7 @@ Annotation-Resources: Internal-Parameters: gene2hpo-mapping: ../../data/hpo_resources/gene2hpo.json gene2interacting-mapping: ../../data/hpo_resources/gene2interacting.json - transcript2length-mapping: ../../data/hpo_resources/grch38transcript2length.json + transcript2length-mapping: ../../data/hpo_resources/grch37transcript2length.json hgnc2gene-mapping: ../../data/hpo_resources/hgnc2gene.json hpo-graph: ../../data/hpo_resources/hpo_graph.gexf hpo2replacement-mapping: ../../data/hpo_resources/hpo2replacement.json diff --git 
a/data/aiDIVA_example_configuration_with_annotation_grch38.yaml b/data/aiDIVA_example_configuration_with_annotation_grch38.yaml new file mode 100644 index 0000000..bc04c6f --- /dev/null +++ b/data/aiDIVA_example_configuration_with_annotation_grch38.yaml @@ -0,0 +1,133 @@ +## aiDIVA -- augmented intelligence-based disease variant analysis +# Configuration file specifying all the parameters that are needed to run aiDIVA in the different modes +# if you modify the key names make sure to also update them in the run_aiDIVA.py script +--- +Assembly-Build: GRCh38 +Canonical-Transcripts: "/mnt/storage2/users/ahboced1/phd_project/mane_grch38_v13_transcript_ids.txt" + + +Analysis-Input: + # path to the reference assembly used during the expansion of the indels + # instead of GRCh38 you can also use hg38 the differences between the two should not matter here + ref-path: /GRCh38.fa + + # trained scoring model (random forest) used to predict the pathogenicity score + # if no trained model is present you can use the train_model.py script to train a new custom model + scoring-model: /rf_model_combined.pkl + + prioritization-information: + # Identifier to get the score from the annotated file + cadd-identifier: CADD_PHRED + repeat-identifier: simpleRepeat + duplication-identifier: segmentDuplication + + +Model-Features: + # List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) + allele-frequency-list: + - gnomAD_AFR_AF + - gnomAD_AMR_AF + - gnomAD_EAS_AF + - gnomAD_NFE_AF + - gnomAD_SAS_AF + #- gnomAD_ASJ_AF + #- gnomAD_FIN_AF + #- gnomAD_OTH_AF + + # List containing the names of the features used for the model training + # the exact order of the features is crucial (make sure to have the exact same order as in the training step) + feature-list: + - SIFT + - PolyPhen + - CADD_PHRED + - REVEL + - MAX_AF + - EIGEN_PHRED + - CONDEL + - FATHMM_XF + - MutationAssessor + - phastCons_mammal + - phastCons_primate + - phastCons_vertebrate + - phyloP_mammal + - phyloP_primate + - phyloP_vertebrate + - oe_lof + - homAF + - CAPICE + - ALPHA_MISSENSE_SCORE + - HIGH_IMPACT + - IS_INDEL + + +Annotation-Resources: + # VEP: path to the VEP executable + vep: /ensembl-vep-release-109.3/ + + # path to the directory where the additional perl libraries are installed during the installation step + vep-cpan: /perl_cpan/ + + # VEP Plugin directory + vep-plugin-path: /vep_plugins/ + + # VEP Cache directory + vep-cache: /ensembl-vep-109/ + + # ngs-bits install path + ngs-bits: /ngs-bits-2023_06/ + + # low-confidence filter file + low-confidence: /grch38_low_conf_region.bed + + # BED like files here the key of the dictionary is used as the name to present the feature in the annotated file + bed-files: + simpleRepeat: /hg38_simpleRepeat.bed + segmentDuplication: /hg38_genomicSuperDups.bed + oe_lof: /gnomAD_oe_lof_grch38_sorted.bed + oe_mis: /gnomAD_oe_mis_grch38_sorted.bed + oe_syn: /gnomAD_oe_syn_grch38_sorted.bed + repeatMasker: /RepeatMasker_GRCh38.bed + + plugin-files: + AlphaMissense: /AlphaMissense_hg38.tsv.gz + + # VCF files the key of the dictionary is used to identify the feature in the INFO column of the VCF file + vcf-files: + EIGEN_PHRED: /grch38liftOver_precomputed_Eigen-phred_coding_chrom1-22.vcf.gz + CONDEL: /grch38liftOver_precomputed_Condel.vcf.gz + FATHMM_XF: /hg38_fathmm_xf_coding.vcf.gz + MutationAssessor: /grch38liftOver_precomputed_MutationAssessor.vcf.gz + gnomAD: /gnomAD_genome_v3.1.2_GRCh38.vcf.gz + CADD: 
/CADD_SNVs_1.6_GRCh38.vcf.gz + dbscSNV: /dbscSNV1.1_GRCh38.vcf.gz + CAPICE: /grch38liftOver_capice_v1_snvs.vcf.gz + REVEL: /REVEL_1.3.vcf.gz + SpliceAI-SNV: /spliceai_scores.masked.snv.hg38.vcf.gz + SpliceAI-InDel: /spliceai_scores.masked.indel.hg38.vcf.gz + + # OPTIONAL: only used if DB check is not skipped + clinvar: /clinvar_GRCh38.vcf.gz + + # OPTIONAL: only used if DB check is not skipped + # NOTE: the following annotation needs a valid license + hgmd: /HGMD.vcf.gz + + # Bigwig files the key of the dictionary is used as the name to present the feature in the annotated file + bigwig-files: + phastCons_mammal: /hg38_phastCons17way_primate.bw + phastCons_primate: /hg38_phastCons30way_mammal.bw + phastCons_vertebrate: /hg38_phastCons100way_vertebrate.bw + phyloP_mammal: /hg38_phyloP17way_primate.bw + phyloP_primate: /hg38_phyloP30way_mammal.bw + phyloP_vertebrate: /hg38_phyloP100way_vertebrate.bw + + +# DO NOT CHANGE THE FOLLOWING IF YOU DON'T KNOW WHAT YOU DO +# In case you change the following parameters (e.g. if you recomputed the hpo resources) make sure to specify the full path to the newly created resources +Internal-Parameters: + gene2hpo-mapping: ../../data/hpo_resources/gene2hpo.json + gene2interacting-mapping: ../../data/hpo_resources/gene2interacting.json + transcript2length-mapping: ../../data/hpo_resources/grch38transcript2length.json + hgnc2gene-mapping: ../../data/hpo_resources/hgnc2gene.json + hpo-graph: ../../data/hpo_resources/hpo_graph.gexf + hpo2replacement-mapping: ../../data/hpo_resources/hpo2replacement.json diff --git a/doc/prepare_annotation_resources.md b/doc/prepare_annotation_resources.md index 0acf71e..b3c18ad 100644 --- a/doc/prepare_annotation_resources.md +++ b/doc/prepare_annotation_resources.md @@ -6,7 +6,7 @@ For some of the annotation sources there are no GRCh38/hg38 files available in t ## Reference Genome Assembly -Note hg19 and GRCh37 differ in the mitochondrial DNA, but since mitchondrial variants are not supported by aiDIVA it should not matter which of the two assemblies is used as reference. +Note that hg19 and GRCh37 differ in the mitochondrial DNA, but since mitochondrial variants are not supported by aiDIVA it does not matter which of the two assemblies is used as reference. GRCh37:
@@ -32,9 +32,9 @@ samtools faidx hg38.fa.gz ``` ## Annotation Databases -INFO: Please be adviced that some of the annotation sources aiDIVA uses are only free for non-commercial and academic use!!! +INFO: Please be advised that some of the annotation sources aiDIVA uses are only free for non-commercial and academic use! -If for some reason you choose to exclude one or more of the Annotation sources shown here you have to make sure to modify the source code accordingly. If an annotation source is used as a feature for the random forest and you exclude it in your analysis you have to train a new model before you can use aiDIVA. +If for some reason you choose to exclude one or more of the annotation sources shown here, make sure to modify the source code accordingly. If an annotation source is used as a feature for the random forest and you exclude it from your analysis, you have to train a new model before you can use aiDIVA. ### CADD @@ -280,7 +280,7 @@ https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes (https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.vcf.bgz) \ \ -From the complete gnomAD database we are only interested in a few entries from the VCF files, therefor we suggest to prepare the VCF and remove the unnecessary stuff from the gnomAD annotation VCF. The following code snippet uses the `prepare_gnomAD_vcf.py` script to reduce it to a minimal set of INFO entries that we need for the aiDIVA annotations: +From the complete gnomAD database we are only interested in a few entries from the VCF files, therefore we suggest preparing the VCF and removing the unnecessary entries from the gnomAD annotation VCF. The following code snippet uses the `prepare_gnomAD_vcf.py` script to reduce it to a minimal set of INFO entries that we need for the aiDIVA annotations: ``` wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.vcf.bgz @@ -311,7 +311,12 @@ tabix -p vcf grch38_gnomAD_genomes_r211.vcf.gz ### \[optional\] HGMD (needs license) -The possibility to use HGMD in aiDIVA is optional due to the fact that you need a license for it. If you choose to include HGMD in the aiDIVA analysis you can use the public/professional version of HGMD. +Using HGMD in aiDIVA is optional because it requires a license. If you choose to include HGMD in the aiDIVA analysis you can use the public/professional version of HGMD. + + +### \[optional\] OMIM (needs license) +Using OMIM in aiDIVA is optional because it requires a license. 
+**Currently not implemented!** ### Low Confidence Regions diff --git a/scripts/generate_HPO_resources.py b/scripts/generate_HPO_resources.py index b14ac29..94168f7 100644 --- a/scripts/generate_HPO_resources.py +++ b/scripts/generate_HPO_resources.py @@ -29,6 +29,7 @@ def generate_gene2hpo_dict(gene2phenotype_list, gene2hpo_dict): for line in rd: if line.startswith("#"): pass + else: splitted_line = line.strip().split("\t") # Format: HPO-idHPO labelentrez-gene-identrez-gene-symbolAdditional Info from G-D sourceG-D sourcedisease-ID for link @@ -41,6 +42,7 @@ def generate_gene2hpo_dict(gene2phenotype_list, gene2hpo_dict): with open(gene2hpo_dict, "w") as gene2hpo_f: json.dump(gene_2_HPO, gene2hpo_f) + logger.info(f"Gene to HPO mapping successfully generated and saved as {gene2hpo_dict}") @@ -72,25 +74,31 @@ def create_hpo_graph(hpo_ontology, phenotype_hpoa, hpo_graph_file, hpo_replaceme elif line.startswith("is_a:"): parents.append(line.strip().split("is_a: ")[1].split(" !")[0]) + elif line.startswith('is_obsolete:'): obsolete =True + elif line.startswith('replaced_by:'): replacement = line.strip().split('replaced_by: ')[1] obsolete =False + elif line.startswith('alt_id:'): alt = line.strip().split('alt_id: ')[1] if hpo_term in alternatives.keys(): current_alternatives = alternatives[hpo_term] current_alternatives.append(alt) alternatives[hpo_term] = current_alternatives + else: alternatives[hpo_term] = [alt] + elif line.startswith('consider:'): consider = line.strip().split('consider: ')[1] if hpo_term in considerations.keys(): current_considerations = considerations[hpo_term] current_considerations.append(consider) considerations[hpo_term] = current_considerations + else: considerations[hpo_term] = [consider] @@ -99,7 +107,7 @@ def create_hpo_graph(hpo_ontology, phenotype_hpoa, hpo_graph_file, hpo_replaceme hpo_edges[hpo_term] = parents if replacement != "": replacements[hpo_term] = replacement - + hpo_edges["replacements"] = replacements logger.info("Generate HPO graph...") @@ -116,17 +124,6 @@ def create_hpo_graph(hpo_ontology, phenotype_hpoa, hpo_graph_file, hpo_replaceme counts_dict[hpo_term] = count_value total_counts += count_value - #total_counts = 0 - #with open(hpo_counts, "r") as count_file: - # for line in count_file: - # if line.startswith("HPO-ID"): - # continue - # splitted_line = line.strip().split("\t") - # hpo_term = splitted_line[0] - # count = int(splitted_line[1]) - # counts_dict[hpo_term] = count - # total_counts += count - # get replacements of obsolete nodes replacements = dict(hpo_edges.get("replacements", [])) @@ -139,40 +136,49 @@ def create_hpo_graph(hpo_ontology, phenotype_hpoa, hpo_graph_file, hpo_replaceme hpo_replacement_information = {"alternatives": alternatives, "considerations": considerations, "replacements": replacements} # TODO: remove support for networkx v1.x and get rid of the duplicated if-else conditions (networkx v2.x and v3.x use the same syntax for the graphs) - for node in hpo_edges.keys(): hpo_graph.add_node(node) if str(nx.__version__).startswith("1."): hpo_graph.node[node]["count"] = 0.0 + elif str(nx.__version__).startswith("2."): hpo_graph.nodes[node]["count"] = 0.0 + elif str(nx.__version__).startswith("3."): hpo_graph.nodes[node]["count"] = 0.0 + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") ancestors = [(x,node) for x in hpo_edges[node]] hpo_graph.add_edges_from(ancestors) + if node in replacements.keys(): if 
str(nx.__version__).startswith("1."): hpo_graph.node[node]['replaced_by'] = replacements[node] hpo_graph.node[node]['IC'] = -math.log(1.0 / total_counts) + elif str(nx.__version__).startswith("2."): hpo_graph.nodes[node]['replaced_by'] = replacements[node] hpo_graph.nodes[node]['IC'] = -math.log(1.0 / total_counts) + elif str(nx.__version__).startswith("3."): hpo_graph.nodes[node]["replaced_by"] = replacements[node] hpo_graph.nodes[node]["IC"] = -math.log(1.0 /total_counts) + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") for node in hpo_graph.nodes(): if str(nx.__version__).startswith("1."): hpo_graph.node[node]["count"] = 0.0 + elif str(nx.__version__).startswith("2."): hpo_graph.nodes[node]["count"] = 0.0 + elif str(nx.__version__).startswith("3."): hpo_graph.nodes[node]["count"] = 0.0 + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") @@ -180,53 +186,69 @@ def create_hpo_graph(hpo_ontology, phenotype_hpoa, hpo_graph_file, hpo_replaceme for node in counts_dict.keys(): if str(nx.__version__).startswith("1."): hpo_graph.node[node]["count"] = counts_dict[node] + elif str(nx.__version__).startswith("2."): hpo_graph.nodes[node]["count"] = counts_dict[node] + elif str(nx.__version__).startswith("3."): hpo_graph.nodes[node]["count"] = counts_dict[node] + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") hpo_graph.nodes(data="count") - # now fill it with the actual value. + # now fill it with the actual value. for node in hpo_edges.keys(): descendants = nx.descendants(hpo_graph,node) if str(nx.__version__).startswith("1."): count = hpo_graph.node[node]["count"] + elif str(nx.__version__).startswith("2."): count = hpo_graph.nodes[node]["count"] + elif str(nx.__version__).startswith("3."): count = hpo_graph.nodes[node]["count"] + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") for descendant in descendants: if str(nx.__version__).startswith("1."): count += hpo_graph.node[descendant]["count"] + elif str(nx.__version__).startswith("2."): count += hpo_graph.nodes[descendant]["count"] + elif str(nx.__version__).startswith("3."): count += hpo_graph.nodes[descendant]["count"] + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") if count > 0 : if str(nx.__version__).startswith("1."): hpo_graph.node[node]["IC"] = -math.log(float(count) / total_counts) + elif str(nx.__version__).startswith("2."): hpo_graph.nodes[node]["IC"] = -math.log(float(count) / total_counts) + elif str(nx.__version__).startswith("3."): hpo_graph.nodes[node]["IC"] = -math.log(float(count) / total_counts) + else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") + else : if str(nx.__version__).startswith("1."): hpo_graph.node[node]["IC"] = -math.log(1.0 / total_counts) # missing nodes, set as rare as possible + elif str(nx.__version__).startswith("2."): hpo_graph.nodes[node]["IC"] = -math.log(1.0 / total_counts) # missing nodes, set as rare as possible + elif str(nx.__version__).startswith("3."): hpo_graph.nodes[node]["IC"] = -math.log(1.0 / total_counts) # missing nodes, set as rare as possible + 
else: logger.error("There seems to be a problem with your installation of NetworkX, make sure that you have either v1, v2 or v3 installed!") @@ -289,13 +311,16 @@ def create_gene2interacting_mapping(string_mapping, string_db_links, string_inte if ((protein_a in string2name.keys()) and (protein_b in string2name.keys())) and ((confidence_experimental >= 900) or (confidence_database >= 900)): gene_name = string2name[protein_a] interacting_gene = string2name[protein_b] + if gene_name in string_interaction_mapping.keys(): string_interaction_mapping[gene_name].append({"interacting_gene": interacting_gene, "confidence_experimental": confidence_experimental, "confidence_database": confidence_database}) + else: string_interaction_mapping[gene_name] = [{"interacting_gene": interacting_gene, "confidence_experimental": confidence_experimental, "confidence_database": confidence_database}] with open(string_interactions, "w") as string_interactions_f: json.dump(string_interaction_mapping, string_interactions_f) + logger.info(f"Gene to interacting gene mapping successfully generated and saved as {string_interactions}") # this file can be obtained through ensembl biomart @@ -308,6 +333,7 @@ def create_transcript_length_mapping(transcript_lengths, transcript_information) for line in transc: if line.startswith("Gene") or line.startswith("#"): continue + else: # header: # #Gene stable ID [TAB] Transcript stable ID [TAB] CDS Length [TAB] Transcript length (including UTRs and CDS) [TAB] Strand @@ -320,11 +346,12 @@ def create_transcript_length_mapping(transcript_lengths, transcript_information) with open(transcript_lengths, "w") as transcript_lengths_f: json.dump(transcript_mapping, transcript_lengths_f) + logger.info(f"Transcript to CDS length mapping successfully generated and saved as {transcript_lengths}") if __name__=="__main__": - parser = argparse.ArgumentParser("Script to generate the HPO resources needed in the prioritization step of AIdiva") + parser = argparse.ArgumentParser("Script to generate the HPO resources needed in the prioritization step of aiDIVA") parser.add_argument("--hpo_ontology", type=str, dest="hpo_ontology", metavar="hp.obo", required=False, help="File containing the HPO ontology\n") parser.add_argument("--gene_phenotype", type=str, dest="gene_phenotype", metavar="phenotype_to_genes.txt", required=False, help="File that contains information about the genes and the associated phenotypes\n") parser.add_argument("--gene_hpo", type=str, dest="gene_hpo", metavar="gene2hpo.json", required=False, help="File to save the generated gene2hpo_dict\n") @@ -345,12 +372,12 @@ def create_transcript_length_mapping(transcript_lengths, transcript_information) if args.hpo_ontology and args.phenotype_hpoa and args.hpo_graph and args.hpo_replacements: create_hpo_graph(args.hpo_ontology, args.phenotype_hpoa, args.hpo_graph, args.hpo_replacements) - + if args.hgnc_symbols and args.hgnc_gene: create_gene2hgnc_mapping(args.hgnc_symbols, args.hgnc_gene) - + if args.string_mapping and args.string_links and args.gene_interacting: create_gene2interacting_mapping(args.string_mapping, args.string_links, args.gene_interacting) - + if args.transcript_information and args.transcript_length_mapping: create_transcript_length_mapping(args.transcript_length_mapping, args.transcript_information) diff --git a/scripts/prepare_CADD_vcf.py b/scripts/prepare_CADD_vcf.py index d1c79e7..89aee71 100644 --- a/scripts/prepare_CADD_vcf.py +++ b/scripts/prepare_CADD_vcf.py @@ -9,14 +9,15 @@ writer.write("##fileformat=VCFv4.1\n") 
writer.write("##INFO=\n") writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n") - + for line in infile: if line.startswith("#"): continue split = line.replace("\n", "").split("\t") - + if str(split[4]) != "": writer.write(f"{split[0]}\t{split[1]}\t.\t{split[2]}\t{split[3]}\t.\t.\tCADD_PHRED={str(split[5])}\n") + else: print("WARNING: Skip missing CADD phred score!") diff --git a/scripts/prepare_Capice_vcf.py b/scripts/prepare_Capice_vcf.py index ca0f5c0..9254b1f 100644 --- a/scripts/prepare_Capice_vcf.py +++ b/scripts/prepare_Capice_vcf.py @@ -9,14 +9,15 @@ writer.write("##fileformat=VCFv4.1\n") writer.write("##INFO=\n") writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n") - + for line in infile: if line.startswith("#"): continue split = line.replace("\n", "").split("\t") - + if str(split[4]) != "": writer.write(f"{split[0]}\t{split[1]}\t.\t{split[2]}\t{split[3]}\t.\t.\tCAPICE={str(split[4])}\n") + else: print("WARNING: Skip missing Capice score!") diff --git a/scripts/prepare_ClinVar_vcf.py b/scripts/prepare_ClinVar_vcf.py index 8bd4f90..9ca5478 100644 --- a/scripts/prepare_ClinVar_vcf.py +++ b/scripts/prepare_ClinVar_vcf.py @@ -9,7 +9,7 @@ writer.write("##fileformat=VCFv4.1\n") writer.write("##INFO=\n") writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n") - + for line in infile: if line.startswith("#"): continue @@ -20,7 +20,7 @@ continue info_fields = split[7].split(";") - + disease_name = "" disease_name_incl = "" sig_info = "" @@ -30,20 +30,26 @@ for field in info_fields: if field.startswith("CLNDN="): disease_name = field.split("=")[1] + elif field.startswith("CLNDNINCL="): disease_name_incl = field.split("=")[1] + elif field.startswith("CLNSIG="): sig_info = field.split("=")[1] + elif field.startswith("CLNSIGCONF="): sig_info_conf = field.split("=")[1] + elif field.startswith("CLNSIGINCL="): sig_info_incl = field.split("=")[1] + else: continue - + # TODO: resolve conflicting interpretations of pathogenicity if possible and include them in the resulting file if str(sig_info) != "" and sig_info != "Conflicting_interpretations_of_pathogenicity": writer.write(f"{split[0]}\t{split[1]}\t{split[2]}\t{split[3]}\t{split[4]}\t{split[5]}\t{split[6]}\tDETAILS={str(sig_info)}%20{str(disease_name)}%3D{str(disease_name_incl)}\n") + else: print("WARNING: Skip conflicting or empty significance entry!") \ No newline at end of file diff --git a/scripts/prepare_Condel_vcf.py b/scripts/prepare_Condel_vcf.py index 55aa245..7ccf70f 100644 --- a/scripts/prepare_Condel_vcf.py +++ b/scripts/prepare_Condel_vcf.py @@ -16,5 +16,6 @@ for row in reader: if str(row["CONDEL"]) != "": writer.write(f"{row['CHR']}\t{row['START']}\t.\t{row['REF']}\t{row['ALT']}\t.\t.\tEIGEN_PHRED={str(row['CONDEL'])}\n") + else: print("WARNING: Skip missing Condel score!") diff --git a/scripts/prepare_EigenPhred_vcf.py b/scripts/prepare_EigenPhred_vcf.py index 46b7992..8cabd36 100644 --- a/scripts/prepare_EigenPhred_vcf.py +++ b/scripts/prepare_EigenPhred_vcf.py @@ -16,5 +16,6 @@ for row in reader: if str(row["Eigen-phred"]) != "": writer.write(f"{row['chr']}\t{row['position']}\t.\t{row['ref']}\t{row['alt']}\t.\t.\tEIGEN_PHRED={str(row['Eigen-phred'])}\n") + else: print("WARNING: Skip missing Eigen-phred score!") diff --git a/scripts/prepare_FathmmXF_vcf.py b/scripts/prepare_FathmmXF_vcf.py index 3cae935..604004a 100644 --- a/scripts/prepare_FathmmXF_vcf.py +++ b/scripts/prepare_FathmmXF_vcf.py @@ -16,5 +16,6 @@ # skip missing scores to keep the annotation file small if str(row["FATHMM_XF"]) != "": 
writer.write(f"{row['#CHROM']}\t{row['POS']}\t.\t{row['REF']}\t{row['ALT']}\t.\t.\FATHMM_XF={str(row['FATHMM_XF'])}\n") + else: print("WARNING: Skip missing FathmmXF score!") \ No newline at end of file diff --git a/scripts/prepare_MutationAssessor_vcf.py b/scripts/prepare_MutationAssessor_vcf.py index 5e338e6..03ea219 100644 --- a/scripts/prepare_MutationAssessor_vcf.py +++ b/scripts/prepare_MutationAssessor_vcf.py @@ -17,5 +17,6 @@ # skip missing scores to keep the annotation file small if str(row["FI score"]) != "": writer.write(f"{mutation[1]}\t{mutation[2]}\t.\t{mutation[3]}\t{mutation[4]}\t.\t.\tMutationAssessor={str(row['FI score'])}\n") + else: print("WARNING: Skip missing MutationAssessor score!") \ No newline at end of file diff --git a/scripts/prepare_REVEL_vcf.py b/scripts/prepare_REVEL_vcf.py index e96b471..4880e54 100644 --- a/scripts/prepare_REVEL_vcf.py +++ b/scripts/prepare_REVEL_vcf.py @@ -24,7 +24,9 @@ grch37_writer.write(f"{row['chr']}\t{row['hg19_pos']}\t.\t{row['ref']}\t{row['alt']}\t.\t.\tREVEL={str(row['REVEL'])}\n") if row["grch38_pos"] != ".": grch38_writer.write(f"{row['chr']}\t{row['grch38_pos']}\t.\t{row['ref']}\t{row['alt']}\t.\t.\tREVEL={str(row['REVEL'])}\n") + else: print("WARNING: Skip GRCh38 missing REVEL score!") + else: print("WARNING: Skip missing REVEL score!") diff --git a/scripts/prepare_RepeatMasker_bed.py b/scripts/prepare_RepeatMasker_bed.py index 5ca95e1..3bfd8bd 100644 --- a/scripts/prepare_RepeatMasker_bed.py +++ b/scripts/prepare_RepeatMasker_bed.py @@ -7,7 +7,7 @@ outfile = sys.argv[2] with gzip.open(infile, "rt") as infile, open(outfile, "w") as writer: - + for line in infile: current_line = line.lstrip().replace("\n", "") current_line = re.sub(r"\s+", " ", current_line) @@ -22,8 +22,9 @@ continue split = current_line.split(" ") - + if str(split[9]) != "" and str(split[10]) != "": writer.write(f"{split[4]}\t{str(int(split[5])-1)}\t{split[6]}\t{split[9]} ({split[10]})\n") + else: print("WARNING: Skip missing repeat!") \ No newline at end of file diff --git a/scripts/prepare_dbscSNV_vcf.py b/scripts/prepare_dbscSNV_vcf.py index 5f4ce67..60970f0 100644 --- a/scripts/prepare_dbscSNV_vcf.py +++ b/scripts/prepare_dbscSNV_vcf.py @@ -1,3 +1,4 @@ +## TODO: Delete file it is not needed anymore import csv import sys @@ -26,7 +27,9 @@ writer_grch37.write(f"{row['chr']}\t{row['pos']}\t.\t{row['ref']}\t{row['alt']}\t.\t.\tADA_SCORE={str(row['ada_score'])};RF_SCORE={str(row['rf_score'])}\n") if row["hg38_pos"] != ".": writer_grch38.write(f"{row['hg38_chr']}\t{row['hg38_pos']}\t.\t{row['ref']}\t{row['alt']}\t.\t.\tADA_SCORE={str(row['ada_score'])};RF_SCORE={str(row['rf_score'])}\n") + else: print("WARNING: Skip GRCh38 missing dbscSNV score!") + else: print("WARNING: Skip missing scores!") \ No newline at end of file diff --git a/scripts/prepare_gnomAD_vcf.py b/scripts/prepare_gnomAD_vcf.py index 7cc4178..219a1a0 100644 --- a/scripts/prepare_gnomAD_vcf.py +++ b/scripts/prepare_gnomAD_vcf.py @@ -1,6 +1,4 @@ import gzip -from re import S -from statistics import quantiles import sys @@ -13,35 +11,42 @@ if line.startswith("#"): if line.startswith("##fileformat="): prepared_gnomad.write(line) + elif line.startswith("##hailversion="): prepared_gnomad.write(line) + elif line.startswith("##INFO=\n") + elif line.startswith("##INFO=") + elif line.startswith("##INFO=") + elif line.startswith("##INFO=") + elif line.startswith("##INFO=") + elif line.startswith("##INFO=") + elif line.startswith("##contig="): prepared_gnomad.write(line) + elif line.startswith("#CHROM"): 
prepared_gnomad.write(line) + else: continue - - - - - else: splitted_line = line.replace("\n", "").split("\t") @@ -59,20 +64,28 @@ for entry in info_fields: if entry.startswith("AN="): an_entry = entry.split("=")[1] + elif entry.startswith("AF="): af_entry = entry.split("=")[1] + elif entry.startswith("nhomalt="): hom_entry = entry.split("=")[1] + elif entry.startswith("AFR_AF="): afr_af_entry = entry.split("=")[1] + elif entry.startswith("AMR_AF="): amr_af_entry = entry.split("=")[1] + elif entry.startswith("EAS_AF="): eas_af_entry = entry.split("=")[1] + elif entry.startswith("NFE_AF="): nfe_af_entry = entry.split("=")[1] + elif entry.startswith("SAS_AF="): sas_af_entry = entry.split("=")[1] + else: continue @@ -87,5 +100,6 @@ if str(an_entry) != "" and str(af_entry) != "" and str(hom_entry) != "": prepared_gnomad.write(f"{chrom}\t{pos}\t{id}\t{ref}\t{alt}\t{qual}\t{filt}\t{info}\n") + else: print("WARNING: Skip missing gnomAD entry!")
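All of the `prepare_*` helper scripts touched above, including `prepare_gnomAD_vcf.py`, follow the same pattern: read the raw annotation source, keep only the fields aiDIVA actually needs, and write a slim VCF that can then be bgzipped and tabix-indexed for the annotation step. A minimal standalone sketch of the INFO-reduction idea used for gnomAD is shown below; the input/output file names and the exact key whitelist are illustrative assumptions, not values taken from the repository.

```
# Sketch only: reduce a gnomAD VCF to a whitelist of INFO keys.
# File names and the key set are placeholders, adjust them to your setup.
import gzip

KEEP_KEYS = {"AN", "AF", "nhomalt", "AFR_AF", "AMR_AF", "EAS_AF", "NFE_AF", "SAS_AF"}

def reduce_info(info_column):
    # keep only whitelisted key=value pairs and preserve their original order
    kept = [entry for entry in info_column.split(";") if entry.split("=", 1)[0] in KEEP_KEYS]
    return ";".join(kept) if kept else "."

with gzip.open("gnomad.genomes.sites.vcf.bgz", "rt") as raw_vcf, open("gnomAD_reduced.vcf", "w") as reduced_vcf:
    for line in raw_vcf:
        if line.startswith("#"):
            # header lines would additionally need their ##INFO definitions filtered,
            # as done in the prepare script above; they are copied unchanged here
            reduced_vcf.write(line)
            continue

        fields = line.rstrip("\n").split("\t")
        fields[7] = reduce_info(fields[7])  # column 8 of a VCF is the INFO column
        reduced_vcf.write("\t".join(fields) + "\n")
```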