# PKG 2.0 — Explore All Tables

One cell per table. Each cell loads one file from `../data/pkg2/*.tsv.gz` (relative to the notebook's working directory), prints the row count, column count, and schema, and displays a sample of rows.

In [None]:
import gzip
import io
import pandas as pd
from pathlib import Path

# Directory containing the PKG 2.0 *.tsv.gz files. The path is relative, so it
# is resolved against the current working directory when a file is opened.
DATA_DIR = Path("../data/pkg2")

def read_truncated_gzip(path: Path) -> pd.DataFrame:
    """Read a possibly-truncated .tsv.gz, recovering all complete rows.

    An intact archive is decompressed and parsed in full. If the gzip stream
    ends before its end-of-stream marker, every byte that can still be
    inflated is kept and then cut back to the last newline, so the final
    (likely incomplete) row is discarded.

    Args:
        path: Location of the tab-separated, gzip-compressed file.

    Returns:
        The parsed table; the first line of the file supplies the column names.

    Raises:
        pandas.errors.EmptyDataError: If the file holds no complete line.
    """
    import zlib  # local import: only needed on the recovery path

    raw = path.read_bytes()
    try:
        data = gzip.decompress(raw)
    except EOFError:
        # A chunked GzipFile.read() raises EOFError at the truncation point
        # and can throw away what that call had already inflated, losing the
        # tail of the file (or all of a small file). A zlib decompress object
        # instead returns every byte it managed to inflate.
        pieces = []
        pending = raw
        while pending:
            inflater = zlib.decompressobj(16 + zlib.MAX_WBITS)  # gzip framing
            pieces.append(inflater.decompress(pending))
            if not inflater.eof:
                break  # this member is the truncated one
            # Complete member: continue with the next one, skipping any
            # zero padding between members.
            pending = inflater.unused_data.lstrip(b"\x00")
        data = b"".join(pieces)
        # Keep whole lines only. Cutting the bytes (rather than the decoded
        # text) also copes with a multi-byte character split by the
        # truncation, and leaves nothing when there is no newline at all.
        data = data[: data.rfind(b"\n") + 1]
    text = data.decode("utf-8", errors="replace")
    return pd.read_csv(io.StringIO(text), sep="\t", low_memory=False)

def preview(filename: str, nrows: int = 10) -> pd.DataFrame:
    """Summarise one table from DATA_DIR and hand back the whole DataFrame.

    The file is read with read_truncated_gzip. The file name, row count,
    column count and per-column dtypes are printed, and the first `nrows`
    rows are shown with display().

    Args:
        filename: Name of the .tsv.gz file inside DATA_DIR.
        nrows: Number of leading rows to display.

    Returns:
        The complete DataFrame, not just the displayed sample.
    """
    frame = read_truncated_gzip(DATA_DIR / filename)

    # Header section: one item per output line, blank line before the schema.
    print(
        f"File:  {filename}",
        f"Rows:  {len(frame):,}",
        f"Cols:  {len(frame.columns)}",
        "",
        "Schema:",
        sep="\n",
    )
    for column_name, column_dtype in frame.dtypes.items():
        print(f"  {column_name:30s} {column_dtype}")
    print()

    sample = frame.head(nrows)
    display(sample)
    return frame

---
## C23 — BioEntities
Biomedical entities (genes, diseases, drugs, proteins, pathways) — the nodes of the graph.

In [2]:
df_c23 = preview("C23_BioEntities.tsv.gz")

File:  C23_BioEntities.tsv.gz
Rows:  332,743
Cols:  3

Schema:
  EntityId                       object
  Type                           object
  Mention                        object



Unnamed: 0,EntityId,Type,Mention
0,CHEBI10003,drug,repromicin
1,CHEBI100147,drug,nalidixic acid
2,CHEBI10015,drug,votacin
3,CHEBI10016,drug,vobtusine
4,CHEBI10017,drug,volemitol
5,CHEBI10018,drug,volkensin
6,CHEBI10023,drug,VCZ
7,CHEBI100241,drug,ciprofloxacine
8,CHEBI100246,drug,NFLX
9,CHEBI10036,drug,wax esters


---
## C01 — Papers
Paper metadata with publication year, citation counts, and clinical relevance flag.

In [3]:
df_c01 = preview("C01_Papers.tsv.gz")

EOFError: Compressed file ended before the end-of-stream marker was reached

---
## C06 — Link Papers ↔ BioEntities
Linkages between papers and biomedical entities with mention positions.

In [None]:
df_c06 = preview("C06_Link_Papers_BioEntities.tsv.gz")

---
## C11 — Clinical Trials
Clinical trial data with NCT IDs, titles, and start dates.

In [None]:
df_c11 = preview("C11_ClinicalTrials.tsv.gz")

---
## C13 — Link Clinical Trials ↔ BioEntities
Links between clinical trials and biomedical entities.

In [None]:
df_c13 = preview("C13_Link_ClinicalTrials_BioEntities.tsv.gz")

---
## C15 — Patents
Patent data with grant dates, titles, and abstracts.

In [None]:
df_c15 = preview("C15_Patents.tsv.gz")

---
## C18 — Link Patents ↔ BioEntities
Links between patents and biomedical entities with mention positions.

In [None]:
df_c18 = preview("C18_Link_Patents_BioEntities.tsv.gz")

---
## C21 — Bioentity Relationships
Core edges of the knowledge graph — relationships between bioentities grounded in papers.

In [None]:
df_c21 = preview("C21_Bioentity_Relationships.tsv.gz")

---
## A01 — Articles
Article metadata with titles, PMIDs, and dates.

In [None]:
df_a01 = preview("A01_Articles.tsv.gz")

---
## A03 — Keyword List
Keywords extracted from articles.

In [None]:
df_a03 = preview("A03_KeywordList.tsv.gz")

---
## A04 — Abstracts
Article abstracts — context for relationships and evidence.

In [None]:
df_a04 = preview("A04_Abstract.tsv.gz")

---
## A06 — MeSH Heading List
MeSH headings from articles — disease and therapeutic area filtering.

In [None]:
df_a06 = preview("A06_MeshHeadingList.tsv.gz")