# Statistics of Database Content


In [1]:
from sqlalchemy import func, select

In [2]:
from api.app import app
from fastapi.testclient import TestClient
import os

# The TestClient context is entered and left immediately; nothing is requested
# through `client`. Presumably this runs the app's startup logic so that the
# attributes read from `app.state` below exist — TODO confirm.
with TestClient(app) as client:
    # Initialize state
    pass    

# Session used by every query in this notebook, created by calling `app.state.session`.
session = app.state.session()

# GGPONC

In [3]:
from integration.orm import ggponc

In [4]:
# Number of GGPONC guidelines. The count is computed by the database
# (SELECT COUNT(*)) instead of loading every ORM row just to call len() on it.
session.scalar(select(func.count()).select_from(ggponc.Guideline))

34

In [5]:
import pandas as pd

def top_10(cuis):
    """Return the ten most frequent CUIs together with their UMLS text.

    The result is a DataFrame indexed by CUI. Its first column holds the
    occurrence counts (most frequent first) and the ``name`` column holds
    whatever the application's UMLS parser returns for each CUI.
    """
    counts = pd.Series(cuis).value_counts().head(10)
    lookup_name = app.state.concept_parser.umls_parser.get_umls_text
    table = pd.DataFrame(counts)
    table['name'] = table.index.map(lookup_name)
    return table

def assert_not_null(cuis):
    """Assert that no entry of ``cuis`` is ``None``.

    Args:
        cuis: Iterable of CUI values (the notebook passes lists of query results).

    Raises:
        AssertionError: If at least one entry is ``None``; the message reports
            how many such entries were found.
    """
    # Identity check rather than `!= None`: the latter dispatches to each
    # element's own __ne__, which need not return a plain bool.
    null_count = sum(1 for c in cuis if c is None)
    assert null_count == 0, f"{null_count} CUI value(s) are None"

In [6]:
# CUIs read from ggponc.Population and from ggponc.SubPopulation, fetched separately.
top_pop = session.scalars(select(ggponc.Population.cui)).fetchall()
all_pops = session.scalars(select(ggponc.SubPopulation.cui)).fetchall()
# Combined list; note that `all_pops` itself holds only the SubPopulation CUIs.
ggponc_pops = top_pop + all_pops
# Fail early if any fetched CUI is None.
assert_not_null(ggponc_pops)
# (combined, Population, SubPopulation) entry counts
len(ggponc_pops), len(top_pop), len(all_pops)

(17530, 39, 17491)

In [7]:
# (total CUI entries, distinct CUIs) for the GGPONC populations
len(ggponc_pops), len(set(ggponc_pops))

(17530, 12005)

In [8]:
# CUIs of GGPONC entities whose type is one of the three listed below;
# these are used as the GGPONC interventions in the following cells.
ggponc_intv = session.scalars(
    select(ggponc.Entity.cui).where(
        ggponc.Entity.type_.in_(['Clinical_Drug', 'Therapeutic', 'Diagnostic'])
    )
).fetchall()
# Fail early if any fetched CUI is None.
assert_not_null(ggponc_intv)

In [9]:
# (total CUI entries, distinct CUIs) for the GGPONC interventions
len(ggponc_intv), len(set(ggponc_intv))

(129119, 15852)

In [10]:
# Ten most frequent GGPONC intervention CUIs with their UMLS text.
top_10(ggponc_intv)

Unnamed: 0,0,name
C0087111,4329,Therapeutic Procedure
C1522449,2618,Radiation Therapy
C3665472,2329,Chemotherapy
C0184661,1427,Intervention or Procedure
C0543467,1374,Surgical Procedure
C0011900,1192,Diagnosis
C0436307,1014,Chemoradiotherapy
C0030231,938,Palliative Therapy
C0008838,719,Cisplatin
C1699633,691,Positron Emission Tomography and Computed Tomo...


In [11]:
# Related concepts of CUI C0033578, requested in the 'broad2narrow' direction with max_depth=1.
# NOTE(review): judging by the recorded output, C0033578 is a prostate-neoplasm concept — confirm.
cuis_l1 = app.state.relationship_mapper.get_related_concepts('C0033578', direction='broad2narrow', max_depth=1)

In [12]:
# Print every related CUI next to the text the UMLS parser returns for it.
for c in cuis_l1:
    print(c, app.state.concept_parser.umls_parser.get_umls_text(c))

C2931456 Familial prostate cancer
C0376358 Prostate cancer
C0282612 Prostatic Intraepithelial Neoplasia
C2677772 Prostate Cancer, Hereditary, 14
C1335409 Prostate Phyllodes Tumor
C2678047 Prostate Cancer, Hereditary, X-Linked 2
C2678479 Prostate Cancer, Hereditary, 12
C1837595 PROSTATE CANCER, HEREDITARY, 3
C1837593 Prostate Cancer, Hereditary, 4
C1836005 Prostate Cancer, Hereditary, 6
C1970192 Prostate Cancer, Hereditary, 10
C1864472 PROSTATE CANCER, HEREDITARY, 8
C1335515 Prostate Neuroendocrine Neoplasm
C2677821 Prostate Cancer, Hereditary, 13
C1970250 Prostate Cancer, Hereditary, 9
C1836436 Prostate Cancer, Hereditary, 5
C4759295 Non-metastatic prostate cancer
C0496923 Neoplasm of uncertain behavior of prostate
C1334615 Malignant Prostate Phyllodes Tumor
C2677771 Prostate Cancer, Hereditary, 15
C2677773 Prostate Cancer, Hereditary, 11
C1332353 Atypical Small Acinar Proliferation of the Prostate Gland
C1853195 Prostate Cancer, Hereditary, 7
C0154088 Carcinoma in situ of prostate
C01

In [13]:
# Same lookup for CUI C0376358, with the same direction and depth arguments.
cuis_l2 = app.state.relationship_mapper.get_related_concepts('C0376358', direction='broad2narrow', max_depth=1)
# The recorded output shows `None` as the text for some of these CUIs.
for c in cuis_l2:
    print(c, app.state.concept_parser.umls_parser.get_umls_text(c))

C1297952 Malignant tumor involving prostate by direct extension from bladder
C5205935 Prostate Wilms Tumor
C2931456 Familial prostate cancer
C0349672 Prostate Ductal Adenocarcinoma
C5205936 Extrarenal Rhabdoid Tumor of the Prostate
C0347001 Metastatic Malignant Neoplasm in the Prostate Gland
C1302530 Prostate Squamous Cell Carcinoma
C0238393 Prostate Sarcoma
C1328504 Castration-Resistant Prostate Carcinoma
C3898877 Hormone-Resistant Prostate Carcinoma
C0279882 None
C4721208 Metastatic castration-resistant prostate cancer
C5231122 Infiltrating duct carcinoma of prostate
C1335516 Prostate Non-Hodgkin Lymphoma
C0280280 None
C1335514 Prostate Myeloid Sarcoma
C1300585 Small cell carcinoma of prostate
C1330959 Primary malignant neoplasm of prostate
C1334615 Malignant Prostate Phyllodes Tumor
C1335512 Prostate Lymphoma
C0007112 Prostate Adenocarcinoma
C5205910 Prostate Malignant Solitary Fibrous Tumor
C4303101 Recurrent malignant neoplasm of prostate
C5231082 Acinar cell cystadenocarcinoma of

# PubMed

In [14]:
from integration.orm import pubmed

In [15]:
# Number of PubMed trials. The count is computed by the database
# (SELECT COUNT(*)) instead of loading every ORM row just to call len() on it.
session.scalar(select(func.count()).select_from(pubmed.Trial))

840116

In [18]:
# All CUI values stored in pubmed.UmlsPopulation (no filtering, duplicates kept).
pubmed_pops = session.scalars(select(pubmed.UmlsPopulation.cui)).fetchall()

In [19]:
# Fail early if any fetched CUI is None.
assert_not_null(pubmed_pops)
# (total CUI entries, distinct CUIs) for the PubMed populations
len(pubmed_pops), len(set(pubmed_pops))

(7457589, 52866)

In [20]:
# Ten most frequent PubMed population CUIs with their UMLS text.
top_10(pubmed_pops)

Unnamed: 0,0,name
C0030705,515952,Patient
C0439234,130447,Year
C0001779,125652,Age
C0043210,85865,Woman
C0679646,83530,Participant
C0001675,83247,Adult
C0162574,82525,Advanced Glycation End Product
C0001792,68599,Elderly
C0027361,66310,People
C0008972,62115,Clinical Study


In [21]:
# All CUI values stored in pubmed.UmlsIntervention (no filtering, duplicates kept).
pubmed_intv = session.scalars(select(pubmed.UmlsIntervention.cui)).fetchall()

In [22]:
# Fail early if any fetched CUI is None.
assert_not_null(pubmed_intv)
# (total CUI entries, distinct CUIs) for the PubMed interventions
len(pubmed_intv), len(set(pubmed_intv))

(5719729, 69108)

In [23]:
# Ten most frequent PubMed intervention CUIs with their UMLS text.
top_10(pubmed_intv)

Unnamed: 0,0,name
C0032042,145160,Placebo
C0184661,60850,Intervention or Procedure
C0009932,56675,Control Group
C0039798,46308,therapeutic aspects
C0087111,43896,Therapeutic Procedure
C1533734,32564,Administration
C0441833,30356,Group
C1522704,29750,Exercise Pain Management
C1522326,29154,Treat
C1705169,28003,Biomaterial Treatment


# ClinicalTrials.gov

In [24]:
from integration.orm import aact

In [25]:
# Number of ClinicalTrials.gov (AACT) trials. The count is computed by the database
# (SELECT COUNT(*)) instead of loading every ORM row just to call len() on it.
session.scalar(select(func.count()).select_from(aact.Trial))

514167

In [26]:
# All CUI values stored in aact.MeshCondition; these serve as the AACT populations below.
aact_pops = session.scalars(select(aact.MeshCondition.cui)).fetchall()

In [27]:
# Fail early if any fetched CUI is None (the query above does not filter them out).
assert_not_null(aact_pops)
# (total CUI entries, distinct CUIs) for the AACT conditions
len(aact_pops), len(set(aact_pops))

(3491957, 4475)

In [28]:
# Ten most frequent AACT condition CUIs with their UMLS text.
top_10(aact_pops)

Unnamed: 0,0,name
C0027651,96131,NEOPLASM
C0030660,88020,Pathologic Process
C0027653,63528,Neoplasm by Site
C0027765,62545,Nervous System Disorder
C0007222,59516,Cardiovascular Disorder
C0080276,46908,Genitourinary System Disorder
C0035242,46272,Respiratory Tract Diseases
C3714514,44888,Infection
C0027652,43604,Neoplasms by Histologic Type
C0012242,42218,Disorder of digestive system


In [29]:
# CUIs of the AACT MeSH interventions; rows whose CUI is NULL are excluded in SQL.
aact_intv = session.scalars(
    select(aact.MeshIntervention.cui).where(
        aact.MeshIntervention.cui.is_not(None)
    )
).fetchall()

In [30]:
# Sanity check only: the query that built `aact_intv` already excludes NULL CUIs.
assert_not_null(aact_intv)
# (total CUI entries, distinct CUIs) for the AACT interventions
len(aact_intv), len(set(aact_intv))

(1570195, 4000)

In [31]:
# Ten most frequent AACT intervention CUIs with their UMLS text.
top_10(aact_intv)

Unnamed: 0,0,name
C1258063,103526,Physiological Effects of Drugs
C1258062,98733,Molecular Mechanisms of Pharmacological Action
C0003392,52591,Antineoplastic Agent
C0014432,48122,Enzyme inhibitor
C0243049,33059,Peripheral Nervous System Agents
C0021054,25448,Immunologic substance
C0243051,24928,Neurotransmitter Agents
C0003204,24797,Anti-Infective Agent
C0243058,21018,Sensory System Agents
C0007681,19233,Central Nervous System Depressants


# CIViC

In [32]:
from integration.orm import civic

In [33]:
# Number of CIViC evidence items. The count is computed by the database
# (SELECT COUNT(*)) instead of loading every ORM row just to call len() on it.
session.scalar(select(func.count()).select_from(civic.Evidence))

10939

In [34]:
# Disease CUIs attached to CIViC evidence items; NULL values are excluded in SQL.
civic_diseases = session.scalars(
    select(civic.Evidence.disease_cui).where(
        civic.Evidence.disease_cui.is_not(None)
    )
).fetchall()
# Phenotype CUIs, filtered the same way.
civic_phenotypes = session.scalars(
    select(civic.Phenotype.cui).where(civic.Phenotype.cui.is_not(None))
).fetchall()
# The CIViC "populations" are the diseases followed by the phenotypes.
civic_pops = civic_diseases + civic_phenotypes

In [35]:
# Sanity check only: the queries that built `civic_pops` already exclude NULL CUIs.
assert_not_null(civic_pops)
# (total CUI entries, distinct CUIs) for the CIViC diseases and phenotypes
len(civic_pops), len(set(civic_pops))

(16180, 548)

In [36]:
# Ten most frequent CIViC disease/phenotype CUIs with their UMLS text.
top_10(civic_pops)

Unnamed: 0,0,name
C0019562,2822,Von Hippel Lindau Syndrome
C0007134,885,Renal cell carcinoma
C0206734,828,Hemangioblastoma
C0007131,808,Non-small cell lung carcinoma
C0730303,793,Retinal capillary hemangioma
C0009404,708,Neoplasm of the large intestine
C0031511,686,Pheochromocytoma
C0023473,504,
C0023467,401,Acute myeloid leukemia
C0030283,398,Pancreatic cysts


In [37]:
# CUIs of the CIViC therapies; rows whose CUI is NULL are excluded in SQL.
civic_intv = session.scalars(
    select(civic.Therapy.cui).where(civic.Therapy.cui.is_not(None))
).fetchall()

In [38]:
# Sanity check only: the query that built `civic_intv` already excludes NULL CUIs.
assert_not_null(civic_intv)
# (total CUI entries, distinct CUIs) for the CIViC therapies
len(civic_intv), len(set(civic_intv))

(6442, 433)

In [39]:
# Ten most frequent CIViC therapy CUIs with their UMLS text.
top_10(civic_intv)

Unnamed: 0,0,name
C0995188,387,Cetuximab
C1455147,302,Dasatinib
C0935989,262,Imatinib
C1135135,260,Erlotinib
C2974289,260,Crizotinib
C3192263,254,Vemurafenib
C1122962,180,Gefitinib
C1176020,159,Sunitinib
C2697961,138,Trametinib
C0728747,126,Trastuzumab
