prepare release 1.0.3 (#28)
* Prepend 'chr' to the chrom_id if missing to prevent problems when the input uses the short form of chromosome identifiers

* - Change logging behaviour to prevent the logfile from bloating when not in DEBUG mode
- The log file can now be saved at a custom location
- The workdir now points to a temporary directory if not specified

* Prevent problems if empty lines are present in the family, HPO, and exclusion files

* Update variant consequence terms; a few new terms were added to the databases. Added a TODO to handle such cases generically so that the program does not break when new terms are introduced.

* Add TODO mark

* Fix a bug in OMIM annotation and change PERL5LIB path

* Update consequence terms and add a condition to handle unknown consequence terms
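
A minimal sketch of how such a fallback for unknown terms could look; the ranking table and default value here are hypothetical illustrations, not taken from the repository:

    import logging

    logger = logging.getLogger(__name__)

    # hypothetical severity ranking; any term missing from the table gets a
    # neutral default instead of crashing the program
    CONSEQUENCE_RANK = {"stop_gained": 1, "missense_variant": 2, "synonymous_variant": 3}

    def rank_consequence(term, default_rank=99):
        if term not in CONSEQUENCE_RANK:
            logger.warning("Unknown consequence term %r, using default rank!", term)
        return CONSEQUENCE_RANK.get(term, default_rank)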

* Allow HPO terms to be passed directly as a list instead of a file
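
A minimal sketch of how both input forms could be normalized; the helper name and the file format (one HPO term per line) are assumptions for illustration:

    def normalize_hpo_input(hpo_input):
        # accept either a list of HPO terms or a path to a file with one term per line;
        # empty entries are skipped in both cases
        if isinstance(hpo_input, list):
            return [term.strip() for term in hpo_input if term.strip()]
        with open(hpo_input, "r") as hpo_file:
            return [line.strip() for line in hpo_file if line.strip()]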

* Introduce a new column (GSVAR_VARIANT) in the result files with the GSvar variant representation to make comparisons with in-house samples easier.

* Introduce change to handle newer networkx versions (3.x) (TEST AND VERIFICATION NEEDED)
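
One way such a compatibility break can be bridged, sketched under the assumption that a pickled graph (e.g. the HPO ontology) is loaded; networkx 3.0 removed read_gpickle()/write_gpickle(), and plain pickle is the documented replacement:

    import pickle

    import networkx as nx

    def load_graph(path):
        # networkx < 3.0 still ships read_gpickle(); newer versions use plain pickle,
        # which also reads graphs that were written with write_gpickle()
        if hasattr(nx, "read_gpickle"):
            return nx.read_gpickle(path)
        with open(path, "rb") as handle:
            return pickle.load(handle)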

* Bugfix: variable was not correctly renamed

* Add flag to make the VCF result file optional and add additional result files according to the inheritance mode in case of a TRIO sample analysis

* Add flag to indicate opposing SIFT and PolyPhen scores. Furthermore, add functionality to obtain the percentage of missing features per variant.
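
A minimal sketch of both computations; the column names and the cutoffs (SIFT <= 0.05 deleterious, PolyPhen >= 0.446 at least possibly damaging) are assumptions for illustration, not values taken from the repository:

    import pandas as pd

    def flag_opposing_predictions(df, sift_col="SIFT", polyphen_col="PolyPhen"):
        # SIFT is damaging when LOW, PolyPhen when HIGH; flag rows where they disagree
        sift_damaging = df[sift_col] <= 0.05
        polyphen_damaging = df[polyphen_col] >= 0.446
        both_present = df[sift_col].notna() & df[polyphen_col].notna()
        return (sift_damaging != polyphen_damaging) & both_present

    def missing_feature_percentage(df, feature_cols):
        # percentage of missing (NaN) feature values per variant (row-wise)
        return df[feature_cols].isna().mean(axis=1) * 100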

* Bugfix

* Fix typo in variable name

* Bugfix

* Suppress the runtime warning that appears when the mean of an empty slice is computed (this warning is expected to occur occasionally)
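
A sketch of the usual way to silence this, assuming the warning is numpy's "RuntimeWarning: Mean of empty slice" from np.nanmean() and that the filter is applied only around the offending call:

    import warnings

    import numpy as np

    values = np.array([np.nan, np.nan])

    with warnings.catch_warnings():
        # np.nanmean() on an all-NaN slice emits "RuntimeWarning: Mean of empty slice";
        # this is expected for variants without any prediction, so it is silenced locally
        warnings.filterwarnings("ignore", message="Mean of empty slice", category=RuntimeWarning)
        mean_value = np.nanmean(values)  # returns nan without polluting the log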

* Update script to handle the updated files

* Rename variable to match the new name

* Update HPO resources

* Various improvements to fine-tune the results

* Make the tool more modular. Introduced a separate run script to run only the annotation.

* Remove unused imports

* Move gnomAD AF annotation away from VEP. Change some internal representations. Optimize weighting in the prioritization step.

* Bugfix: Fix wrong column name

* Fix splicing annotation. Add additional annotation (SpliceAI).

* Add handling for SpliceAI annotation

* Use SpliceAI instead of dbscSNV

* Fix missing bracket

* Fix bug where too many predictions were set to NaN

* - ADDED new features (AlphaMissense, EVE)
- ADDED binary feature indicating the impact class of the variant
- IMPROVED various filter steps in the prioritization and scoring part
- EXCHANGED the splicing scores (dbscSNV) with SpliceAI
- IMPROVED missing value imputation
- IMPROVED InDel combination

* Add missing liftOver instruction

* ADD option to skip prefiltering in the annotation script (if already done). Update documentation.

* Add missing feature handling

* Fix feature (IS_INDEL)

* Change naming of the tool to match the current state

* Change tool naming

* Update README

* Update README

* Bugfix to handle the situation where the VEP annotation for a variant is empty.

* Update scripts and documentation

* Add flag to indicate the use of in-house samples

* FIXED all merge conflicts.
ADD new parameter --top_rank to specify the maximum rank when only the top results are reported (see the sketch below).
Refactoring was done.
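
A hedged sketch of how the new parameter could be wired up with argparse; the default value and help text are illustrative assumptions, as the commit only names the parameter:

    import argparse

    parser = argparse.ArgumentParser(description="report only the top-ranked variants")
    # hypothetical default; the commit does not state the actual value
    parser.add_argument("--top_rank", type=int, default=25,
                        help="maximum rank to include when only the top results are reported")
    args = parser.parse_args()

    # downstream, result rows with a rank greater than args.top_rank would be dropped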

* ADD rank column to the result file. Fully integrate canonical transcript filtering when choosing the right feature annotation if multiple transcripts were annotated for a variant.
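
A minimal sketch of canonical-transcript selection, assuming VEP-style per-transcript annotations where the canonical transcript carries CANONICAL=YES; the parsed-entry structure is hypothetical:

    def choose_transcript_annotation(transcript_entries):
        # transcript_entries: list of dicts parsed from the per-transcript annotations;
        # prefer the canonical transcript, fall back to the first entry otherwise
        for entry in transcript_entries:
            if entry.get("CANONICAL") == "YES":
                return entry
        return transcript_entries[0] if transcript_entries else None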

* Update code and remove currently unused annotation!

* Update README

---------

Co-authored-by: Dominic Boceck <dominic.boceck@gmail.com>
dboceck and Dominic Boceck committed Jul 9, 2024
1 parent 03d3b92 commit 164f065
Showing 30 changed files with 920 additions and 379 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ The following additional libraries need to be installed in order to use the program
 + scipy (v1.10.1)
 + scikit-learn (v1.3.0)
 
-If a newer scikit-learn version is used the models provided should still work (they were created using the version v0.19.1 and the import was tested with v0.21.3 and v0.22.2).
+If a newer scikit-learn version is used, it is advised to create a new model with the newer scikit-learn version.
 
 
 ## Annotation resources and tools
30 changes: 19 additions & 11 deletions aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py
@@ -5,27 +5,31 @@
 import random
 
 
+RANDOM_SEED = 14038
+
 logger = logging.getLogger(__name__)
 
 
 def write_data_information_to_file(input_data, outfile, ref_sequence, header):
-    data_grouped = [group for key, group in input_data.groupby("CHROM")]
+    data_grouped = [group for key, group in input_data.groupby("#CHROM")]
     in_fasta = pysam.FastaFile(ref_sequence)
-    random.seed(14038)
+    random.seed(RANDOM_SEED)
 
     for line in header:
         if line.startswith("#CHROM"):
             outfile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
 
         else:
             outfile.write(line)
 
     for group in data_grouped:
-        if "chr" in str(group["CHROM"].iloc[0]):
-            chrom_id = str(group["CHROM"].iloc[0])
+        if "chr" in str(group["#CHROM"].iloc[0]):
+            chrom_id = str(group["#CHROM"].iloc[0])
 
         else:
-            chrom_id = "chr" + str(group["CHROM"].iloc[0])
+            chrom_id = "chr" + str(group["#CHROM"].iloc[0])
 
-        for row in group.itertuples():
+        for row in group.itertuples(index=False):
             # make sure that the window starts at the beginning of the reference sequence
             window_start = max(int(row.POS) - 3, 1)
@@ -37,47 +41,51 @@ def write_data_information_to_file(input_data, outfile, ref_sequence, header):
                 alt_variant = ""
                 if (extended_ref_seq[i] == "A") or (extended_ref_seq[i] == "T"):
                     alt_variant = random.choice(["G", "C"])
 
                 elif (extended_ref_seq[i] == "G") or (extended_ref_seq[i] == "C"):
                     alt_variant = random.choice(["A", "T"])
 
                 elif (extended_ref_seq[i] == "N"):
                     logger.debug("Reference base was skipped because it was 'N'!")
                     continue
 
                 else:
                     logger.error("The given reference sequence seems to be corrupted!")
 
-                outfile.write(str(row.CHROM).strip() + "\t" + str(window_start + i + 1).strip() + "\t" + "." + "\t" + str(extended_ref_seq[i]).strip() + "\t" + str(alt_variant).strip() + "\t" + "." + "\t" + "." + "\t" + str(row.INFO).strip() + "\n")
+                outfile.write(str(chrom_id).strip() + "\t" + str(window_start + i + 1).strip() + "\t" + "." + "\t" + str(extended_ref_seq[i]).strip() + "\t" + str(alt_variant).strip() + "\t" + "." + "\t" + "." + "\t" + str(row.INFO).strip() + "\n")
 
 
 def import_vcf_data(in_data):
     header_line = ""
     comment_lines = []
 
-    # WARNING: reset file pointer to begin after reading header to start processing from the start again (is done by closing the file before reading again)
     with open(in_data, "r") as input_vcf:
         # extract header from vcf file
         for line in input_vcf:
             if line.strip().startswith("##"):
                 comment_lines.append(line)
 
-            if line.strip().startswith("#CHROM"):
+            elif line.strip().startswith("#CHROM"):
                 header_line = line.strip()
                 comment_lines.append(line)
                 break # now the variant entries are coming
 
             else:
                 continue
 
+    if header_line == "":
+        logger.error("The VCF seems to be corrupted, missing header line!")
+
+    # reset file pointer to begin reading at the beginning (is done by closing the file before reading again)
 
     data = pd.read_csv(in_data, names=header_line.split("\t"), sep="\t", comment="#", low_memory=False)
     data.fillna(".", inplace=True)
-    data = data.rename(columns={"#CHROM": "CHROM"})
 
     return data, comment_lines
 
 
 def convert_indel_vcf_to_expanded_indel_vcf(in_data, out_data, ref_folder):
     input_data, header = import_vcf_data(in_data)
 
     with open(out_data, "w", newline="") as outfile:
         write_data_information_to_file(input_data, outfile, ref_folder, header)
