From 32fe0895a9eec2c257d725efd2924f078f8bd49d Mon Sep 17 00:00:00 2001 From: Incardona Pietro Date: Sat, 9 Jul 2022 02:03:02 +0200 Subject: [PATCH] Closes: #568 Related-Issue: #568 Projected-Results-Impact: none --- HISTORY.rst | 1 + geneinfo/tests/factories.py | 3 + variants/file_export.py | 21 +++- variants/models.py | 91 +++++++++++++++- variants/queries.py | 14 ++- variants/tests/factories.py | 5 +- variants/tests/test_file_export.py | 162 +++++++++++++++++++++++++---- variants/views.py | 37 +------ 8 files changed, 277 insertions(+), 57 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index cbc9d8e99..a7bb5b7e7 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -12,6 +12,7 @@ End-User Summary - Starting with development of Bollonaster (VarFish v2) - Documenting problem with extra annotations in ``20210728` data release (#450). Includes instructions on how to apply patch to get ``20210728b``. +- Add Transcripts GnomadAD constraints and clinvar reports in the export(#568) - Extra annotations in export completed and tested (#495). - Removing problematic username modification behaviour on login page (#459). - Displaying login page text from settings again (#458). diff --git a/geneinfo/tests/factories.py b/geneinfo/tests/factories.py index a1700be16..820720fb2 100644 --- a/geneinfo/tests/factories.py +++ b/geneinfo/tests/factories.py @@ -176,6 +176,9 @@ class Meta: obs_lof = factory.Sequence(lambda n: n) lof_z = factory.Sequence(lambda n: 1 / 2 ** (n % 12)) oe_lof = factory.Sequence(lambda n: 1 / 2 ** (n % 12)) + pLI = factory.Sequence(lambda n: 1 / 2 ** (n % 12) + 1.234) + oe_lof_upper = factory.Sequence(lambda n: 1 / 3 ** (n % 12)) + oe_lof_lower = factory.Sequence(lambda n: 1 / 0.75 ** (n % 12)) class GeneIdToInheritanceFactory(factory.django.DjangoModelFactory): diff --git a/variants/file_export.py b/variants/file_export.py index 217b09884..b181be37e 100644 --- a/variants/file_export.py +++ b/variants/file_export.py @@ -25,6 +25,7 @@ annotate_with_phenotype_scores, annotate_with_pathogenicity_scores, annotate_with_joint_scores, + annotate_with_transcripts, unroll_extra_annos_result, prioritize_genes, VariantScoresFactory, @@ -104,6 +105,12 @@ def to_str(val): ("gene_name", "Gene Name", str), ("gene_family", "Gene Family", str), ("pubmed_id", "Gene Pubmed ID", str), + ("gnomad_pLI", "Gnomad constrains pLI", float), + ("gnomad_mis_z", "Gnomad constrains z-score missense", float), + ("gnomad_oe_lof", "Gnomad constrains lof observed/expected", float), + ("gnomad_oe_lof_upper", "Gnomad constrains lof observed/expected upper", float), + ("gnomad_oe_lof_lower", "Gnomad constrains lof observed/expected lower", float), + ("pathogenicity_summary", "ClinVar pathogenicity summary", str), ) if settings.KIOSK_MODE: HEADER_FIXED = tuple(filter(lambda x: not x[0].startswith("inhouse_"), HEADER_FIXED)) @@ -120,6 +127,8 @@ def to_str(val): ("pathogenicity_rank", "Pathogenicity Rank", int), ) +HEADERS_TRANSCRIPTS = (("transcripts", "Transcript ids", str),) + #: Names of the joint scoring header columns. HEADERS_JOINT_SCORES = ( ("joint_score", "Pheno+Patho Score", float), @@ -229,6 +238,9 @@ def __getitem__(self, key): return self.genotype return self.__wrapped__.__getitem__(key) +def _is_jannovar_enabled(): + """Return if jannover is enabled for exporting transcripts.""" + return settings.VARFISH_ENABLE_JANNOVAR class CaseExporterBase: """Base class for export of (filtered) case data from single case or all cases of a project. @@ -339,6 +351,8 @@ def _yield_columns(self, members): header += HEADERS_PHENO_SCORES if self._is_pathogenicity_enabled(): header += HEADERS_PATHO_SCORES + if _is_jannovar_enabled(): + header += HEADERS_TRANSCRIPTS if self._is_prioritization_enabled() and self._is_pathogenicity_enabled(): header += HEADERS_JOINT_SCORES if self.query_args["export_flags"]: @@ -364,6 +378,8 @@ def _yield_smallvars(self): with contextlib.closing(self.query.run(self.query_args)) as result: self.job.add_log_entry("Executing phenotype score query...") _result = list(result) + if _is_jannovar_enabled(): + _result = annotate_with_transcripts(_result, self.query_args["database_select"]) if self._is_prioritization_enabled(): gene_scores = self._fetch_gene_scores([entry.entrez_id for entry in _result]) _result = annotate_with_phenotype_scores(_result, gene_scores) @@ -536,7 +552,10 @@ def _write_variants_data(self): if column["name"] == "chromosome": row.append("chr" + getattr(small_var, "chromosome")) elif column["fixed"]: - row.append(getattr(small_var, column["name"])) + if column["name"] == "transcripts": + row.append(getattr(small_var, column["name"]).replace("\n", "|")) + else: + row.append(getattr(small_var, column["name"])) else: member, field = column["name"].rsplit(".", 1) if field == "aaf": diff --git a/variants/models.py b/variants/models.py index c704c2442..4e917aee3 100644 --- a/variants/models.py +++ b/variants/models.py @@ -60,7 +60,6 @@ from variants.helpers import get_meta from projectroles.app_settings import AppSettingAPI - app_settings = AppSettingAPI() #: Django user model. @@ -87,6 +86,41 @@ } +def load_molecular_impact(kwargs): + """Load molecular impact from Jannovar REST API if configured.""" + if not settings.VARFISH_ENABLE_JANNOVAR: + return [] + + url_tpl = ( + "%(base_url)sannotate-var/%(database)s/%(genome)s/%(chromosome)s/%(position)s/%(reference)s/" + "%(alternative)s" + ) + genome = {"GRCh37": "hg19", "GRCh38": "hg38"}.get(kwargs["release"], "hg19") + url = url_tpl % { + "base_url": settings.VARFISH_JANNOVAR_REST_API_URL, + "database": kwargs["database"], + "genome": genome, + "chromosome": kwargs["chromosome"], + "position": kwargs["start"], + "reference": kwargs["reference"], + "alternative": kwargs["alternative"], + } + try: + res = requests.request(method="get", url=url) + if not res.status_code == 200: + raise ConnectionError( + "ERROR: Server responded with status {} and message {}".format( + res.status_code, res.text + ) + ) + else: + return res.json() + except requests.ConnectionError as e: + raise ConnectionError( + "ERROR: Server at {} not responding.".format(settings.VARFISH_JANNOVAR_REST_API_URL) + ) from e + + def only_source_name(full_name): """Helper function that strips SNAPPY suffixes for samples.""" if full_name.count("-") >= 3: @@ -2237,6 +2271,38 @@ def __getitem__(self, key): return self.__wrapped__.__getitem__(key) +class RowWithTranscripts(wrapt.ObjectProxy): + """Wrap a result row and add members for phenotype score and rank.""" + + def __init__(self, obj, database): + super().__init__(obj) + self._self_transcripts = None + self._self_database = database + + @property + def transcripts(self): + return self._self_transcripts + + @property + def database(self): + return self._self_database + + @transcripts.setter + def transcripts(self, value): + self._self_transcripts = value + + def __getattr__(self, item): + return self.__getitem__(item) + + def __getitem__(self, key): + if key == "transcripts": + return self.transcripts + elif key == "database": + return self.database + else: + return self.__wrapped__.__getitem__(key) + + def annotate_with_phenotype_scores(rows, gene_scores): """Annotate the results in ``rows`` with phenotype scores stored in ``small_variant_query``. @@ -2264,6 +2330,29 @@ def annotate_with_phenotype_scores(rows, gene_scores): return rows +def annotate_with_transcripts(rows, database): + """Annotate the results in ``rows`` with transcripts (RefSeq or Ensembl) + + """ + rows = [RowWithTranscripts(row, database) for row in rows] + for row in rows: + transcripts = load_molecular_impact(row) + row.transcripts = "\n".join( + [ + t["transcriptId"] + + ";" + + ",".join(t["variantEffects"]) + + ";" + + t["hgvsProtein"] + + ";" + + t["hgvsNucleotides"] + for t in transcripts + ] + ) + + return rows + + # TODO: Improve wrapper so we can assign obj.pathogenicity_rank and score class RowWithPathogenicityScore(wrapt.ObjectProxy): """Wrap a result row and add members for pathogenicity score and rank.""" diff --git a/variants/queries.py b/variants/queries.py index a42b6f359..9035cb8f9 100644 --- a/variants/queries.py +++ b/variants/queries.py @@ -1344,7 +1344,17 @@ def extend_selectable(self, query_parts): class ExtendQueryPartsGnomadConstraintsJoin(ExtendQueryPartsBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.fields = ["pLI", "mis_z", "syn_z", "oe_lof", "oe_lof_upper", "oe_lof_lower"] + self.fields = [ + "pLI", + "mis_z", + "syn_z", + "oe_mis", + "oe_mis_upper", + "oe_mis_lower", + "oe_lof", + "oe_lof_upper", + "oe_lof_lower", + ] self.subquery = ( select( [ @@ -1478,6 +1488,7 @@ class CaseExportTableQueryPartsBuilder(QueryPartsBuilder): ExtendQueryPartsHgncAndConservationJoin, ExtendQueryPartsAcmgJoin, ExtendQueryPartsMgiJoin, + ExtendQueryPartsGnomadConstraintsJoin, ] @@ -1518,6 +1529,7 @@ class ProjectExportTableQueryPartsBuilder(QueryPartsBuilder): ExtendQueryPartsGeneSymbolJoin, ExtendQueryPartsAcmgJoin, ExtendQueryPartsMgiJoin, + ExtendQueryPartsGnomadConstraintsJoin, ] diff --git a/variants/tests/factories.py b/variants/tests/factories.py index 557bed0eb..e252ef4b4 100644 --- a/variants/tests/factories.py +++ b/variants/tests/factories.py @@ -655,6 +655,9 @@ class Meta: state = "active" +CHROMOSOME_LIST_TESTING = [ + str(chrom) for chrom in sorted(list(range(1, 23)) + list(range(1, 23))) + ["X", "X", "Y", "Y"] +] CHROMOSOME_MAPPING = {str(chrom): i + 1 for i, chrom in enumerate(list(range(1, 23)) + ["X", "Y"])} CHROMOSOME_MAPPING.update({f"chr{chrom}": i for chrom, i in CHROMOSOME_MAPPING.items()}) @@ -671,7 +674,7 @@ class Params: genotypes = default_genotypes release = "GRCh37" - chromosome = factory.Iterator(list(CHROMOSOME_MAPPING.keys())) + chromosome = factory.Iterator(CHROMOSOME_LIST_TESTING) chromosome_no = factory.LazyAttribute(lambda o: CHROMOSOME_MAPPING[o.chromosome]) start = factory.Sequence(lambda n: (n + 1) * 100) end = factory.LazyAttribute(lambda o: o.start + len(o.reference) - len(o.alternative)) diff --git a/variants/tests/test_file_export.py b/variants/tests/test_file_export.py index 74cabbcab..6df6fe8d1 100644 --- a/variants/tests/test_file_export.py +++ b/variants/tests/test_file_export.py @@ -6,13 +6,17 @@ import tempfile from unittest.mock import patch +import django from django.utils import timezone import openpyxl +from requests_mock import Mocker from test_plus.test import TestCase from timeline.models import ProjectEvent +from clinvar.tests.factories import ClinvarFactory from cohorts.tests.factories import TestCohortBase from extra_annos.tests.factories import ExtraAnnoFieldFactory, ExtraAnnoFactory +from geneinfo.tests.factories import GnomadConstraintsFactory from variants.tests.factories import ( SmallVariantFactory, ResubmitFormDataFactory, @@ -72,6 +76,58 @@ def setUp(self): file_type="xlsx", ) + GnomadConstraintsFactory.reset_sequence() + + # Create two entries for first variant that is in clinvar (second variant in total) + for small_var in self.small_vars: + ClinvarFactory( + release=small_var.release, + chromosome=small_var.chromosome, + start=small_var.start, + end=small_var.end, + bin=small_var.bin, + reference=small_var.reference, + alternative=small_var.alternative, + pathogenicity="pathogenic", + pathogenicity_summary="uncertain significance", + ) + GnomadConstraintsFactory(ensembl_gene_id=small_var.ensembl_gene_id) + + def _set_janno_mocker(self, database, mock_): + if database == "refseq": + transcript_prefix = "NM" + else: + transcript_prefix = "ENST" + + for small_var in self.small_vars: + mock_.get( + "https://jannovar.example.com/annotate-var/%s/hg19/%s/%s/%s/%s" + % ( + database, + small_var.chromosome, + small_var.start, + small_var.reference, + small_var.alternative, + ), + status_code=200, + json=[ + { + "transcriptId": transcript_prefix + "_058167.2", + "variantEffects": ["three_prime_utr_exon_variant"], + "isCoding": True, + "hgvsProtein": "p.(=)", + "hgvsNucleotides": "c.*60G>A", + }, + { + "transcriptId": transcript_prefix + "_194315.1", + "variantEffects": ["three_prime_utr_exon_variant"], + "isCoding": True, + "hgvsProtein": "p.(=)", + "hgvsNucleotides": "c.*60G>A", + }, + ], + ) + class CaseExporterTest(ExportTestBase): def setUp(self): @@ -81,16 +137,54 @@ def setUp(self): ResubmitFormDataFactory(submit="download", names=self.case.get_members()) ) - def test_export_tsv(self): + def _test_export_xlsx(self, database, mock_): + self._set_janno_mocker(database, mock_) + + self.export_job.query_args["database_select"] = database + with file_export.CaseExporterXlsx(self.export_job, self.export_job.case) as exporter: + result = exporter.generate() + with tempfile.NamedTemporaryFile(suffix=".xlsx") as temp_file: + temp_file.write(result) + temp_file.flush() + workbook = openpyxl.load_workbook(temp_file.name) + # TODO: also test without commments + self.assertEquals(workbook.sheetnames, ["Variants", "Comments", "Metadata"]) + variants_sheet = workbook["Variants"] + arrs = [[cell.value for cell in row] for row in variants_sheet.rows] + self._test_tabular(arrs, False, django.conf.settings.VARFISH_ENABLE_JANNOVAR, database) + + def _test_export_tsv(self, database, mock_): + self._set_janno_mocker(database, mock_) + + self.export_job.query_args["database_select"] = database with file_export.CaseExporterTsv(self.export_job, self.export_job.case) as exporter: result = str(exporter.generate(), "utf-8") arrs = [line.split("\t") for line in result.split("\n")] - self._test_tabular(arrs, True) + self._test_tabular(arrs, True, django.conf.settings.VARFISH_ENABLE_JANNOVAR, database) - def _test_tabular(self, arrs, has_trailing): + @Mocker() + def test_export_tsv(self, mock_): + self._test_export_tsv("refseq", mock_) + + @patch("django.conf.settings.VARFISH_ENABLE_JANNOVAR", True) + @patch("django.conf.settings.VARFISH_JANNOVAR_REST_API_URL", "https://jannovar.example.com/") + @Mocker() + def test_export_tsv_refseq(self, mock_): + self._test_export_tsv("refseq", mock_) + + @patch("django.conf.settings.VARFISH_ENABLE_JANNOVAR", True) + @patch("django.conf.settings.VARFISH_JANNOVAR_REST_API_URL", "https://jannovar.example.com/") + @Mocker() + def test_export_tsv_ensembl(self, mock_): + self._test_export_tsv("ensembl", mock_) + + def _test_tabular(self, arrs, has_trailing, janno_enable, database): self.assertEquals(len(arrs), 4 + int(has_trailing)) # TODO: also test without flags and comments - self.assertEquals(len(arrs[0]), 50) + if not janno_enable: + self.assertEquals(len(arrs[0]), 56) + else: + self.assertEquals(len(arrs[0]), 57) self.assertSequenceEqual(arrs[0][:3], ["Chromosome", "Position", "Reference bases"]) self.assertSequenceEqual( arrs[0][-5:], @@ -127,6 +221,35 @@ def _test_tabular(self, arrs, has_trailing): self.assertSequenceEqual(arrs[i + 1][-6:-5], ["9.89"]) elif i == 2: self.assertSequenceEqual(arrs[i + 1][-6:-5], ["10.78"]) + self.assertSequenceEqual( + [ + arrs[i + 1][31][0:6], + arrs[i + 1][32][0:6], + arrs[i + 1][33][0:6], + arrs[i + 1][34][0:6], + arrs[i + 1][35][0:6], + ], + [ + str(1 / 2 ** (i % 12) + 1.234)[0:6], + str(1 / 2 ** (i % 12))[0:6], + str(1 / 2 ** (i % 12))[0:6], + str(1 / 3 ** (i % 12))[0:6], + str((1 / 0.75) ** (i % 12))[0:6], + ], + ) + if janno_enable: + if database == "refseq": + self.assertEquals( + arrs[i + 1][37].replace("\n", "|"), + "NM_058167.2;three_prime_utr_exon_variant;p.(=);c.*60G>A|NM_194315.1;three_prime_utr_exon_variant;p.(=);c.*60G>A", + ) + else: + self.assertEquals( + arrs[i + 1][37].replace("\n", "|"), + "ENST_058167.2;three_prime_utr_exon_variant;p.(=);c.*60G>A|ENST_194315.1;three_prime_utr_exon_variant;p.(=);c.*60G>A", + ) + + self.assertEquals(arrs[i + 1][36], "uncertain significance") if has_trailing: self.assertSequenceEqual(arrs[4], [""]) @@ -170,18 +293,21 @@ def test_export_vcf(self): ) self.assertEquals(content[3], "") - def test_export_xlsx(self): - with file_export.CaseExporterXlsx(self.export_job, self.export_job.case) as exporter: - result = exporter.generate() - with tempfile.NamedTemporaryFile(suffix=".xlsx") as temp_file: - temp_file.write(result) - temp_file.flush() - workbook = openpyxl.load_workbook(temp_file.name) - # TODO: also test without commments - self.assertEquals(workbook.sheetnames, ["Variants", "Comments", "Metadata"]) - variants_sheet = workbook["Variants"] - arrs = [[cell.value for cell in row] for row in variants_sheet.rows] - self._test_tabular(arrs, False) + @Mocker() + def test_export_xlsx(self, mock): + self._test_export_xlsx("refseq", mock) + + @patch("django.conf.settings.VARFISH_ENABLE_JANNOVAR", True) + @patch("django.conf.settings.VARFISH_JANNOVAR_REST_API_URL", "https://jannovar.example.com/") + @Mocker() + def test_export_xlsx_refseq(self, mock): + self._test_export_xlsx("refseq", mock) + + @patch("django.conf.settings.VARFISH_ENABLE_JANNOVAR", True) + @patch("django.conf.settings.VARFISH_JANNOVAR_REST_API_URL", "https://jannovar.example.com/") + @Mocker() + def test_export_xlsx_ensembl(self, mock): + self._test_export_xlsx("ensembl", mock) class ProjectExportTest(TestCase): @@ -235,7 +361,7 @@ def test_export_tsv(self): def _test_tabular(self, arrs, has_trailing): self.assertEquals(len(arrs), 5 + int(has_trailing)) # TODO: also test without flags and comments - self.assertEquals(len(arrs[0]), 50) + self.assertEquals(len(arrs[0]), 56) self.assertSequenceEqual(arrs[0][:3], ["Sample", "Chromosome", "Position"]) self.assertEqual(arrs[0][-1], "sample Alternate allele fraction") members = sorted(self.project.get_members()) @@ -439,7 +565,7 @@ def test_export_tsv_as_contributor_for_cohort_by_superuser(self): def _test_tabular(self, arrs, ref, has_trailing, smallvars): self.assertEquals(len(arrs), ref + int(has_trailing)) # TODO: also test without flags and comments - self.assertEquals(len(arrs[0]), 50) + self.assertEquals(len(arrs[0]), 56) self.assertSequenceEqual(arrs[0][:3], ["Sample", "Chromosome", "Position"]) self.assertEqual(arrs[0][-1], "sample Alternate allele fraction") for i, small_var in enumerate(sorted(smallvars, key=lambda x: (x.chromosome_no, x.start))): diff --git a/variants/views.py b/variants/views.py index 2043c24b7..edcb29486 100644 --- a/variants/views.py +++ b/variants/views.py @@ -116,6 +116,7 @@ ClearOldKioskCasesBgJob, ClearInactiveVariantSetsBgJob, ClearExpiredExportedFilesBgJob, + load_molecular_impact, ) from .forms import ( ExportFileResubmitForm, @@ -3052,40 +3053,6 @@ def _load_small_var(self, kwargs): alternative=kwargs["alternative"], ).first() - def _load_molecular_impact(self, kwargs): - """Load molecular impact from Jannovar REST API if configured.""" - if not settings.VARFISH_ENABLE_JANNOVAR: - return [] - - url_tpl = ( - "%(base_url)sannotate-var/%(database)s/%(genome)s/%(chromosome)s/%(position)s/%(reference)s/" - "%(alternative)s" - ) - genome = {"GRCh37": "hg19", "GRCh38": "hg38"}.get(kwargs["release"], "hg19") - url = url_tpl % { - "base_url": settings.VARFISH_JANNOVAR_REST_API_URL, - "database": kwargs["database"], - "genome": genome, - "chromosome": kwargs["chromosome"], - "position": kwargs["start"], - "reference": kwargs["reference"], - "alternative": kwargs["alternative"], - } - try: - res = requests.request(method="get", url=url) - if not res.status_code == 200: - raise ConnectionError( - "ERROR: Server responded with status {} and message {}".format( - res.status_code, res.text - ) - ) - else: - return res.json() - except requests.ConnectionError as e: - raise ConnectionError( - "ERROR: Server at {} not responding.".format(settings.VARFISH_JANNOVAR_REST_API_URL) - ) from e - def _get_population_freqs(self, kwargs): if kwargs.get("chromosome") == "MT": return {} @@ -3285,7 +3252,7 @@ def get_context_data(self, object): result["clinvar"] = self._load_clinvar(self.kwargs) result["knowngeneaa"] = self._load_knowngene_aa(self.kwargs) result["small_var"] = self._load_small_var(self.kwargs) - result["effect_details"] = self._load_molecular_impact(self.kwargs) + result["effect_details"] = load_molecular_impact(self.kwargs) result["extra_annos"] = self.get_extra_annos(self.kwargs) if self.request.GET.get("render_full", "no").lower() in ("yes", "true"): result["base_template"] = "projectroles/base.html"