# PKG 2.0 — Explore All Tables

One cell per table. Each cell loads from local `data/pkg2/*.tsv.gz`, shows row count, schema, and a sample.

In [4]:
import gzip
import io
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data/pkg2")

def read_truncated_gzip(path: Path) -> pd.DataFrame:
    """Read a possibly-truncated .tsv.gz, recovering all complete rows."""
    raw = path.read_bytes()
    try:
        text = gzip.decompress(raw).decode("utf-8", errors="replace")
    except EOFError:
        # Decompress as much as possible from the truncated stream
        d = gzip.GzipFile(fileobj=io.BytesIO(raw))
        chunks = []
        while True:
            try:
                chunk = d.read(64 * 1024 * 1024)  # 64 MB
                if not chunk:
                    break
                chunks.append(chunk)
            except EOFError:
                break
        text = b"".join(chunks).decode("utf-8", errors="replace")
        # Drop the last (likely incomplete) line
        text = text[:text.rfind("\n")]
    return pd.read_csv(io.StringIO(text), sep="\t", low_memory=False)

def preview(filename: str, nrows: int = 10) -> pd.DataFrame:
    """Load a TSV.gz (handling truncated gzip), print stats, return full df."""
    path = DATA_DIR / filename
    df = read_truncated_gzip(path)
    print(f"File:  {filename}")
    print(f"Rows:  {len(df):,}")
    print(f"Cols:  {len(df.columns)}")
    print()
    print("Schema:")
    for col in df.columns:
        print(f"  {col:30s} {df[col].dtype}")
    print()
    display(df.head(nrows))
    return df

---
## C23 — BioEntities
Biomedical entities (genes, diseases, drugs, proteins, pathways) — the nodes of the graph.

In [5]:
df_c23 = preview("C23_BioEntities.tsv.gz")

File:  C23_BioEntities.tsv.gz
Rows:  332,743
Cols:  3

Schema:
  EntityId                       object
  Type                           object
  Mention                        object



Unnamed: 0,EntityId,Type,Mention
0,CHEBI10003,drug,repromicin
1,CHEBI100147,drug,nalidixic acid
2,CHEBI10015,drug,votacin
3,CHEBI10016,drug,vobtusine
4,CHEBI10017,drug,volemitol
5,CHEBI10018,drug,volkensin
6,CHEBI10023,drug,VCZ
7,CHEBI100241,drug,ciprofloxacine
8,CHEBI100246,drug,NFLX
9,CHEBI10036,drug,wax esters


---
## C01 — Papers
Paper metadata with publication year, citation counts, and clinical relevance flag.

In [6]:
df_c01 = preview("C01_Papers.tsv.gz")

File:  C01_Papers.tsv.gz
Rows:  16,214,968
Cols:  17

Schema:
  id                             int64
  PMID                           int64
  PubYear                        int64
  ArticleTitle                   object
  AuthorNum                      int64
  MedlineCitation_Status         object
  CitedCount                     int64
  StdCitedCount                  float64
  CitedCount_ClinicalArticle     int64
  CitedCount_ClinicalTrailStudy  int64
  CitedCount_Patent              int64
  IsClinicalArticle              int64
  IsResearchArticle              int64
  Human                          float64
  Animal                         float64
  MolecularCellular              float64
  APT                            float64



Unnamed: 0,id,PMID,PubYear,ArticleTitle,AuthorNum,MedlineCitation_Status,CitedCount,StdCitedCount,CitedCount_ClinicalArticle,CitedCount_ClinicalTrailStudy,CitedCount_Patent,IsClinicalArticle,IsResearchArticle,Human,Animal,MolecularCellular,APT
0,1,1,1975,Formate assay in body fluids: application in m...,4,MEDLINE,109,0.5531,0,3,0,0,1,0.17,0.33,0.5,0.25
1,2,2,1975,Delineation of the intimate details of the bac...,2,MEDLINE,54,0.2236,0,0,0,0,1,0.0,0.0,1.0,0.25
2,3,3,1975,Metal substitutions incarbonic anhydrase: a ha...,2,MEDLINE,20,0.0199,1,0,0,0,1,0.14,0.29,0.57,0.25
3,4,4,1975,Effect of chloroquine on cultured fibroblasts:...,3,MEDLINE,73,0.3374,0,0,0,0,1,0.2,0.0,0.8,0.05
4,5,5,1975,Atomic models for the polypeptide backbones of...,2,MEDLINE,22,0.0319,0,0,0,0,1,0.0,0.67,0.33,0.05
5,6,6,1975,Studies of oxygen binding energy to hemoglobin...,3,MEDLINE,12,-0.028,0,0,0,0,1,0.25,0.0,0.75,0.25
6,7,7,1975,Maturation of the adrenal medulla--IV. Effects...,2,MEDLINE,26,0.0559,0,0,0,0,1,0.25,0.75,0.0,0.05
7,8,9,1975,Radiochemical assay of glutathione S-epoxide t...,2,MEDLINE,57,0.2416,0,1,0,0,1,0.0,0.67,0.33,0.05
8,9,8,1975,Comparison between procaine and isocarboxazid ...,2,MEDLINE,44,0.1637,1,0,0,0,0,0.0,0.33,0.67,0.05
9,10,10,1975,Digitoxin metabolism by rat liver microsomes.,3,MEDLINE,68,0.3075,0,0,0,0,1,0.0,0.5,0.5,0.25


---
## C06 — Link Papers ↔ BioEntities
Linkages between papers and biomedical entities with mention positions.

In [7]:
df_c06 = preview("C06_Link_Papers_BioEntities.tsv.gz")

File:  C06_Link_Papers_BioEntities.tsv.gz
Rows:  6,962,085
Cols:  17

Schema:
  id                             int64
  PMID                           int64
  StartPosition                  int64
  EndPosition                    int64
  Mention                        object
  Type                           object
  is_neural_normalized           int64
  prob                           float64
  EntityId                       object
  mesh                           object
  mim                            float64
  CL                             float64
  cellosaurus                    float64
  NCBITaxon                      float64
  NCBIGene                       float64
  CHEBI                          float64
  FileName                       object



Unnamed: 0,id,PMID,StartPosition,EndPosition,Mention,Type,is_neural_normalized,prob,EntityId,mesh,mim,CL,cellosaurus,NCBITaxon,NCBIGene,CHEBI,FileName
0,1,1,0,7,Formate,drug,0,0.999329,CHEBI52343,,,,,,,52343.0,pubmed22n0001.json
1,2,1,45,53,methanol,drug,0,0.99972,meshD000432,D000432,,,,,,,pubmed22n0001.json
2,3,2,68,87,pyridine nucleotide,drug,0,0.99644,CHEBI36980,,,,,,,36980.0,pubmed22n0001.json
3,4,3,44,50,halide,drug,0,0.999488,CHEBI16042,,,,,,,16042.0,pubmed22n0001.json
4,5,4,10,21,chloroquine,drug,0,0.999821,meshD002738,D002738,,,,,,,pubmed22n0001.json
5,6,4,25,45,cultured fibroblasts,cell_line,0,0.94629,CUI-less,,,,,,,,pubmed22n0001.json
6,7,4,58,78,lysosomal hydrolases,gene,1,0.995118,NCBIGene3988,,,,,,3988.0,,pubmed22n0001.json
7,8,5,47,61,myohemerythrin,gene,1,0.973337,NCBIGene3048,,,,,,3048.0,,pubmed22n0001.json
8,9,5,66,77,hemerythrin,gene,1,0.98046,NCBIGene7248,,,,,,7248.0,,pubmed22n0001.json
9,10,6,11,17,oxygen,drug,0,0.999151,meshD010100,D010100,,,,,,,pubmed22n0001.json


---
## C11 — Clinical Trials
Clinical trial data with NCT IDs, titles, and start dates.

In [8]:
df_c11 = preview("C11_ClinicalTrials.tsv.gz")

ParserError: Error tokenizing data. C error: Expected 18 fields in line 10544, saw 24


---
## C13 — Link Clinical Trials ↔ BioEntities
Links between clinical trials and biomedical entities.

In [None]:
df_c13 = preview("C13_Link_ClinicalTrials_BioEntities.tsv.gz")

---
## C15 — Patents
Patent data with grant dates, titles, and abstracts.

In [None]:
df_c15 = preview("C15_Patents.tsv.gz")

---
## C18 — Link Patents ↔ BioEntities
Links between patents and biomedical entities with mention positions.

In [None]:
df_c18 = preview("C18_Link_Patents_BioEntities.tsv.gz")

---
## C21 — Bioentity Relationships
Core edges of the knowledge graph — relationships between bioentities grounded in papers.

In [9]:
df_c21 = preview("C21_Bioentity_Relationships.tsv.gz")

File:  C21_Bioentity_Relationships.tsv.gz
Rows:  23,208,487
Cols:  5

Schema:
  PMID                           int64
  entity_id1                     object
  entity_id2                     object
  relation_id                    int64
  relation_type                  object



Unnamed: 0,PMID,entity_id1,entity_id2,relation_id,relation_type
0,167,meshD006978,meshD006977,7305,disease_disease
1,167,meshD007676,meshD006973,10572,disease_disease
2,232,meshD001651,meshD002779,6875,disease_disease
3,273,meshD034721,meshD008415,8033,disease_disease
4,349,meshD058186,meshD051437,6249,disease_disease
5,349,meshD014511,meshD051437,8587,disease_disease
6,540,meshD014511,meshD051437,8587,disease_disease
7,643,meshD012640,meshD004827,6255,disease_disease
8,702,meshD004673,meshD004679,5905,disease_disease
9,750,meshD019586,meshD001927,6104,disease_disease


---
## A01 — Articles
Article metadata with titles, PMIDs, and dates.

In [None]:
df_a01 = preview("A01_Articles.tsv.gz")

---
## A03 — Keyword List
Keywords extracted from articles.

In [None]:
df_a03 = preview("A03_KeywordList.tsv.gz")

---
## A04 — Abstracts
Article abstracts — context for relationships and evidence.

In [None]:
df_a04 = preview("A04_Abstract.tsv.gz")

---
## A06 — MeSH Heading List
MeSH headings from articles — disease and therapeutic area filtering.

In [None]:
df_a06 = preview("A06_MeshHeadingList.tsv.gz")