# Extracting evidence that is already cited in GGPONC

In [1]:
import sys
# Put the parent directory first on the module search path.
# NOTE(review): presumably this is what lets the `integration.*` imports
# further down resolve when the notebook runs from its own folder - confirm.
sys.path.insert(0, '..')

In [2]:
import pandas as pd

In [3]:
# Release tag that is interpolated into the input paths under
# ../data/ggponc/ and into the name of the CSV written at the end.
ggponc_release_eval = 'v2.1_2023_03_30'

In [4]:
# Read the guideline translation table for the selected release and expose
# its "id" column under the name "guideline_id".
translations_csv = f"../data/ggponc/{ggponc_release_eval}/guideline_translations.csv"
df_ggponc_translations = pd.read_csv(translations_csv)
df_ggponc_translations = df_ggponc_translations.rename(
    columns={"id": "guideline_id"}
)

In [5]:
# Read the tab-separated literature index for the selected release. Its
# "title" column is renamed to "german_name", the key used when joining with
# the translation table.
literature_tsv = f"../data/ggponc/{ggponc_release_eval}/xml/literature_index.tsv"
df_ggponc_literature = pd.read_csv(literature_tsv, sep="\t")
df_ggponc_literature = df_ggponc_literature.rename(
    columns={"title": "german_name"}
)

In [6]:
# Inner join on the German guideline name: literature rows without a matching
# translation row are not carried over.
df_ggponc = pd.merge(
    df_ggponc_literature,
    df_ggponc_translations,
    how="inner",
    on="german_name",
)

In [7]:
# The citation text in "id" wraps a span in <i>...</i>; store that span as
# "title". The pattern is a raw string without the former `\/` escape, which is
# not a valid Python string escape and triggers a SyntaxWarning on Python 3.12+.
# The match stays greedy, so the captured text is the same as before.
# expand=False returns a Series (one capture group) instead of a one-column
# DataFrame.
df_ggponc["title"] = df_ggponc["id"].str.extract(r"<i>(.+)</i>", expand=False)

In [8]:
# Take the PubMed ID from citations that contain a pubmed.ncbi.nlm.nih.gov
# link. PMIDs are not fixed-width: the former `\d{8}` pattern skipped shorter
# IDs entirely and would have truncated anything longer, so any run of digits
# is accepted. The dots in the host name are escaped so they only match a
# literal ".". Rows without a link stay <NA> in the nullable Int64 column.
df_ggponc["pm_id"] = (
    df_ggponc["id"]
    .str.extract(r"https://pubmed\.ncbi\.nlm\.nih\.gov/(\d+)", expand=False)
    .astype("Int64")
)

### Retrieving IDs

In [9]:
# Add empty placeholder columns (all <NA>) for the identifiers that are not
# extracted directly from the citation text.
for _placeholder_col in ("nct_id", "doi", "cn_id"):
    df_ggponc[_placeholder_col] = pd.NA

In [10]:
from integration.config import load_config
from integration.db import get_engine
from sqlalchemy.orm import sessionmaker

# Build the config file name from the release tag instead of repeating the
# literal, so the database configuration follows `ggponc_release_eval` like
# every other path in this notebook. For the current tag this resolves to the
# same file as before.
cfg = load_config(f"../config_{ggponc_release_eval}.ini")

# Engine for the database URL stored under [DB] url in the config.
engine = get_engine(cfg["DB"]["url"])

# One session for the cells below.
# NOTE(review): it is never closed in the visible part of the notebook.
session = sessionmaker(bind=engine)()

In [11]:
from integration.citation_utils import (
    get_title_to_id_mapping_pubmed,
    get_title_to_id_mapping_clinicaltrials,
)

# Fetch the two lookup tables through the open DB session; both are handed to
# `retrieve_all_identifiers` as `df_pm` and `df_ct`.
# NOTE(review): judging by the helper names these map titles to PubMed and
# ClinicalTrials identifiers - confirm against integration.citation_utils.
df_pm, df_ct = (
    get_title_to_id_mapping_pubmed(session),
    get_title_to_id_mapping_clinicaltrials(session),
)

In [12]:
from tqdm.auto import tqdm
from integration.citation_utils import retrieve_all_identifiers
import os

# Register `progress_apply` on pandas objects with a labelled progress bar.
tqdm.pandas(desc="Retrieving available identifiers")


def _identifiers_for_row(row):
    """Run `retrieve_all_identifiers` on one citation row.

    The Entrez credentials are read from the PUBMED_USER and PUBMED_API_KEY
    environment variables and are None when those are unset. The four JSON
    paths are passed as the helper's cache arguments, and `df_pm` / `df_ct`
    are the lookup tables loaded earlier.
    """
    return retrieve_all_identifiers(
        row=row,
        entrez_email=os.environ.get("PUBMED_USER"),
        entrez_api_key=os.environ.get("PUBMED_API_KEY"),
        doi_pm_id_cache="../data/literature/doi_to_pm_id.json",
        cn_pm_id_cache="../data/literature/cn_id_to_pm_id.json",
        title_nct_id_cache="../data/literature/fuzzy_title_to_nct_id.json",
        title_pm_id_cache="../data/literature/fuzzy_title_to_pm_id.json",
        df_pm=df_pm,
        df_ct=df_ct,
    )


# Row-wise application; the result replaces `df_ggponc`.
df_ggponc = df_ggponc.progress_apply(_identifiers_for_row, axis=1)  # type: ignore

# Cast every column whose name contains "pm_id" (case-insensitive) to the
# nullable Int64 dtype so missing IDs stay <NA>.
pm_id_columns = [name for name in df_ggponc.columns if "pm_id" in name.lower()]
df_ggponc = df_ggponc.astype(dict.fromkeys(pm_id_columns, "Int64"))

Retrieving available identifiers:   0%|          | 0/26149 [00:00<?, ?it/s]

In [13]:
# Display only the citations that have a PubMed ID; `df_ggponc` itself is not
# modified.
df_ggponc.loc[df_ggponc["pm_id"].notna()]

Unnamed: 0,id,german_name,num,ref_id,ref,guideline_id,english_name,title,pm_id,nct_id,doi,cn_id,pm_id_doi,pm_id_cn,nct_id_cn,pmc_id_cn,nct_id_db,pm_id_db
22,"Perea-Milla Lopez, E., Minarro-Del Moral, R. M...",Mundhöhlenkarzinom,23,117432,22,mundhoehlenkarzinom,Oral cavity cancer,"Lifestyles, environmental and phenotypic facto...",12771984,,,,,,,,,12771984
26,"Perea-Milla Lopez, E., Minarro-Del Moral, R. M...",Mundhöhlenkarzinom,27,153991,22,mundhoehlenkarzinom,Oral cavity cancer,"Lifestyles, environmental and phenotypic facto...",12771984,,,,,,,,,12771984
44,"Chung, C. H., Zhang, Q., Kong, C. S., Harris, ...",Mundhöhlenkarzinom,45,117795,13,mundhoehlenkarzinom,Oral cavity cancer,p16 protein expression and human papillomaviru...,25267748,,,,,,,,,25267748
76,"Syrjanen, S., Lodi, G., von Bultzingslowen, I....",Mundhöhlenkarzinom,77,117849,67,mundhoehlenkarzinom,Oral cavity cancer,Human papillomaviruses in oral carcinoma and o...,21382139,,,,,,,,,21382139
86,"Humphris, G. M., Freeman, R., Clarke, H. M., <...",Mundhöhlenkarzinom,87,154027,53,mundhoehlenkarzinom,Oral cavity cancer,Risk perception of oral cancer in smokers atte...,15380170,,,,,,,,,15380170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26098,"Judson, I., Morden, J. P., Kilburn, L., Leahy,...",Adulte Weichgewebesarkome,742,304426,742,adulte-weichgewebesarkome,Adult soft tissue sarcomas,Cediranib in patients with alveolar soft-part ...,31160249,,,,,,,,,31160249
26100,"Wilky, B. A., Trucco, M. M., Subhawong, T. K.,...",Adulte Weichgewebesarkome,744,304428,744,adulte-weichgewebesarkome,Adult soft tissue sarcomas,Axitinib plus pembrolizumab in patients with a...,31078463,,,,,,,,,31078463
26103,"Schoffski, P., Sufliarsky, J., Gelderblom, H.,...",Adulte Weichgewebesarkome,747,304431,747,adulte-weichgewebesarkome,Adult soft tissue sarcomas,"Crizotinib in patients with advanced, inoperab...",29669701,,,,,,,,,29669701
26106,"Bissler, J. J., Kingswood, J. C., Radzikowska,...",Adulte Weichgewebesarkome,750,304435,750,adulte-weichgewebesarkome,Adult soft tissue sarcomas,Everolimus for angiomyolipoma associated with ...,23312829,,,,,,,,,23312829


In [14]:
# Write guideline ID, extracted title and PubMed ID for every row (rows
# without a pm_id included) to the screening folder, without the index.
out_file = f"../data/screening/ggponc_{ggponc_release_eval}_literature.csv"
export_columns = ["guideline_id", "title", "pm_id"]
df_ggponc.loc[:, export_columns].to_csv(out_file, index=False)

# Show the path that was written.
out_file

'../data/screening/ggponc_v2.1_2023_03_30_literature.csv'