Thormann A, Halachev M, McLaren W, et al. **Flexible and scalable diagnostic filtering of genomic variants using G2P with Ensembl VEP**. Nature Communications. 2019 May;10(1):2373. DOI: 10.1038/s41467-019-10016-3. PMID: 31147538; PMCID: PMC6542828. 

### TODO

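* Use all Gene2Phenotype (G2P) panels instead of just the Cancer panel. (The download cell below already loops over the Cancer, Cardiac, DD, Eye, Skeletal, and Skin panels; confirm that covers every panel, then remove this item.)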

In [1]:
from retraction_audit.audit import RetractionLookup
import os

# Register the same contact address under both environment variables
# (presumably read by retraction_audit when it contacts Retraction Watch / NCBI — confirm).
os.environ.update(
    {
        "RETRACTION_WATCH_EMAIL": "james.sharpsteen@gmail.com",
        "NCBI_AUTH_EMAIL": "james.sharpsteen@gmail.com",
    }
)

# Shared lookup client used by the per-PMID retraction checks later in this notebook.
# NOTE(review): the positional `True` is an opaque flag from here — confirm its
# meaning against RetractionLookup's signature (a keyword argument would be clearer).
lookup = RetractionLookup(True)

In [3]:
from wags_tails import CustomData
from wags_tails.utils.downloads import handle_gzip, download_http
from wags_tails.utils.versioning import DATE_VERSION_PATTERN
import requests
import datetime
import json
import polars as pl
from datetime import datetime, timezone

def _latest_version_cb() -> str:
    """Return the current UTC date/time formatted with ``DATE_VERSION_PATTERN``.

    Passed to ``CustomData`` as its latest-version callback, so the "latest
    version" of the G2P data is simply whatever the clock says right now.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime(DATE_VERSION_PATTERN)

_tqdm_params = {
    "disable": False,
    "unit": "B",
    "ncols": 80,
    "unit_divisor": 1024,
    "unit_scale": True,
}


def _make_download_cb(datatype):
    """Build a download callback bound to a single G2P panel.

    The callback used to be defined directly inside the loop and closed over
    the loop variable, which only works while it is invoked during the same
    iteration (closures look the variable up when called, not when defined).
    Binding ``datatype`` through this factory makes each callback independent
    of when it is eventually run.

    Args:
        datatype: Panel name as it appears in the download URL (e.g. ``"Cancer"``).

    Returns:
        A ``(version, outfile_path) -> None`` callable suitable for ``CustomData``.
    """

    def _download_cb(version, outfile_path) -> None:
        # `version` is accepted to match the callback signature but is not
        # used: the download URL below carries no version component.
        download_http(
            f"https://www.ebi.ac.uk/gene2phenotype/downloads/{datatype}G2P.csv.gz",
            outfile_path,
            tqdm_params=_tqdm_params,
            handler=handle_gzip,
        )

    return _download_cb


# One local CSV per G2P panel, in the order listed here.
data_files = []
for datatype in ("Cancer", "Cardiac", "DD", "Eye", "Skeletal", "Skin"):
    data_fetcher = CustomData(
        "gene2phenotype",
        "csv",
        _latest_version_cb,
        _make_download_cb(datatype),
        file_name=f"gene2phenotype_{datatype.lower()}",
    )

    # Only the first element of get_latest()'s result is kept; it is later
    # passed straight to pl.read_csv (presumably the local file path — confirm
    # against the wags-tails documentation).
    g2ph = data_fetcher.get_latest()[0]
    data_files.append(g2ph)

Downloading CardiacG2P.csv.gz...


100%|██████████████████████████████████████| 35.2k/35.2k [00:00<00:00, 91.0kB/s]


Downloading DDG2P.csv.gz...


100%|█████████████████████████████████████████| 237k/237k [00:01<00:00, 217kB/s]


Downloading EyeG2P.csv.gz...


100%|███████████████████████████████████████| 73.0k/73.0k [00:00<00:00, 213kB/s]


Downloading SkeletalG2P.csv.gz...


100%|███████████████████████████████████████| 44.2k/44.2k [00:00<00:00, 410kB/s]


Downloading SkinG2P.csv.gz...


100%|███████████████████████████████████████| 54.6k/54.6k [00:00<00:00, 133kB/s]


In [10]:
# Collect the distinct PubMed IDs cited across every downloaded G2P file.
#
# Rows with no publications (null `pmids`) and stray empty fragments (e.g. from
# a trailing ";") are skipped. Previously they survived split/explode as
# None / "" entries, were looked up as the literal string "None", and inflated
# the total PMID count reported alongside the results.
unique_pmids = set()

for file in data_files:
    data = pl.read_csv(file, null_values=["No gene mim", "No disease mim"])
    # Cast to string first so the split also works if a file's `pmids` column
    # happened to be inferred as an integer column.
    exploded_pmids = data["pmids"].cast(pl.Utf8).str.split(";").explode()
    for raw_pmid in exploded_pmids.drop_nulls().to_list():
        cleaned_pmid = raw_pmid.strip()
        if cleaned_pmid:
            unique_pmids.add(cleaned_pmid)

# Sorted so the list (and therefore the lookup order) is the same on every run.
pmids = sorted(unique_pmids)

In [18]:
# Query the retraction lookup once per PMID and keep only the truthy results,
# i.e. the PMIDs for which a retraction record came back.
retracted_ev = [
    record
    for record in (lookup.get_retraction_by_pmid(str(candidate)) for candidate in pmids)
    if record
]

In [19]:
# Summarise this run: number of retracted records found vs. PMIDs checked.
n_retracted = len(retracted_ev)
n_checked = len(pmids)
print(n_retracted)

from tools import store_results, get_latest_results

# Record the Gene2Phenotype counts, then show the latest stored results
# (the final bare call is what the notebook displays).
store_results("Gene2Phenotype", n_retracted, retracted_ev, n_checked)
get_latest_results()

4


[('CIViC', 1, 3667, '2024-03-15T00:03:17.308166+00:00'),
 ('DGIDB', 20, 11403, '2024-03-15T00:08:13.144921+00:00'),
 ('PMKB', 0, 1562, '2024-03-15T01:45:21.760435+00:00'),
 ('PharmGKB', 2, 17150, '2024-03-15T13:35:38.913493+00:00'),
 ('DOKB', 2, 4318, '2024-03-15T14:38:21.085659+00:00'),
 ('MOAlmanac', 0, 166, '2024-03-17T00:31:14.361317+00:00'),
 ('COSMIC', 15, 20600, '2024-03-19T13:15:43.634143+00:00'),
 ('Gene2Phenotype', 4, 8369, '2024-03-19T13:24:01.323836+00:00')]

In [20]:
# Display the retraction records collected above for manual inspection.
retracted_ev

[RetractionData(pmid='21665000', doi='10.1016/j.ajhg.2011.05.009', article_title='A BLOC-1 Mutation Screen Reveals that PLDN Is Mutated in Hermansky-Pudlak Syndrome Type 9', journal='American Journal of Human Genetics', authors=['Andrew R Cullinane', 'James A Curry', 'Carmelo Carmona-Rivera', 'C Gail Summers', 'Carla Ciccone', 'Nicholas D Cardillo', 'Heidi M Dorward', 'Richard A Hess', 'James G White', 'David R Adams', 'Marjan Huizing', 'William A Gahl'], retraction_watch_refs=['http://retractionwatch.com/2017/05/16/journal-retracts-paper-eight-months-u-s-feds-announce-findings-misconduct/', 'http://retractionwatch.com/2016/08/30/former-nih-postdoc-doctored-data/'], retraction_doi='10.1016/j.ajhg.2017.04.011', retraction_pmid='28475864', retraction_reasons=['Duplication of Image', 'Falsification/Fabrication of Results', 'Investigation by ORI', 'Misconduct - Official Investigation/Finding', 'Misconduct by Author'], retraction_type=<FlagType.RETRACTED: 'Retraction'>),
 RetractionData(pmi