In [1]:
# 1. Basic setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 2. Download ClinVar variant_summary in Colab
!wget -O variant_summary.txt.gz https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz


--2025-11-20 20:02:22--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.31, 130.14.250.10, 130.14.250.11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.31|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 411788700 (393M) [application/x-gzip]
Saving to: ‘variant_summary.txt.gz’


2025-11-20 20:02:41 (21.4 MB/s) - ‘variant_summary.txt.gz’ saved [411788700/411788700]



In [3]:
df = pd.read_csv("variant_summary.txt.gz", sep="\t", compression="gzip", nrows=5)
df.columns


Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories', 'VariationID',
       'PositionVCF', 'ReferenceAlleleVCF', 'AlternateAlleleVCF',
       'SomaticClinicalImpact', 'SomaticClinicalImpactLastEvaluated',
       'ReviewStatusClinicalImpact', 'Oncogenicity',
       'OncogenicityLastEvaluated', 'ReviewStatusOncogenicity',
       'SCVsForAggregateGermlineClassification',
       'SCVsForAggregateSomaticClinicalImpact',
       'SCVsForAggregateOncogenicityClassification'],
      dtype='object')

In [4]:
use_cols = [
    "VariationID",
    "GeneSymbol",
    "Name",   # This contains HGVS notation
    "ClinicalSignificance",
    "ReviewStatus",
    "Chromosome",
    "Start",
    "ReferenceAllele",
    "AlternateAllele"
]

df = pd.read_csv(
    "variant_summary.txt.gz",
    sep="\t",
    compression="gzip",
    low_memory=False,
    usecols=use_cols
)

df.head()


Unnamed: 0,Name,GeneSymbol,ClinicalSignificance,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,VariationID
0,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,AP5Z1,Pathogenic/Likely pathogenic,7,4820844,na,na,"criteria provided, multiple submitters, no con...",2
1,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,AP5Z1,Pathogenic/Likely pathogenic,7,4781213,na,na,"criteria provided, multiple submitters, no con...",2
2,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),AP5Z1,Pathogenic,7,4827361,na,na,no assertion criteria provided,3
3,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),AP5Z1,Pathogenic,7,4787730,na,na,no assertion criteria provided,3
4,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),ZNF592,Uncertain significance,15,85342440,na,na,no assertion criteria provided,4


In [5]:
df = pd.read_csv(
    "variant_summary.txt.gz",
    sep="\t",
    compression="gzip",
    low_memory=False,
    usecols=[
        "Name", "GeneSymbol", "ClinicalSignificance", "Chromosome", "Start",
        "ReferenceAllele", "AlternateAllele", "ReviewStatus", "VariationID"
    ]
)

# Keep only Pathogenic or Benign related labels
keep_labels = ["Pathogenic", "Likely pathogenic",
               "Benign", "Likely benign"]

df = df[df["ClinicalSignificance"].str.contains("|".join(keep_labels), case=False, na=False)]

# Exclude Uncertain significance, conflicting, etc.
exclude_labels = ["Uncertain", "conflict", "not provided", "no assertion"]

df = df[~df["ClinicalSignificance"].str.contains("|".join(exclude_labels), case=False, na=False)]

# Keep only high-confidence review statuses
high_conf = ["multiple submitters, no conflicts", "reviewed by expert panel", "practice guideline"]

df = df[df["ReviewStatus"].str.contains("|".join(high_conf), case=False, na=False)]

df.shape, df.head()


((717180, 9),
                                                  Name GeneSymbol  \
 0   NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...      AP5Z1   
 1   NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...      AP5Z1   
 6         NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter)    FOXRED1   
 7         NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter)    FOXRED1   
 32            NM_000410.4(HFE):c.848A>C (p.Gln283Pro)        HFE   
 
             ClinicalSignificance Chromosome      Start ReferenceAllele  \
 0   Pathogenic/Likely pathogenic          7    4820844              na   
 1   Pathogenic/Likely pathogenic          7    4781213              na   
 6                     Pathogenic         11  126145284              na   
 7                     Pathogenic         11  126275389              na   
 32  Pathogenic/Likely pathogenic          6   26093144              na   
 
    AlternateAllele                                       ReviewStatus  \
 0               na  criteria provided, mult

In [6]:
df.shape


(717180, 9)

In [7]:
# Split into pathogenic-like and benign-like
pathogenic_mask = df["ClinicalSignificance"].str.contains("Pathogenic", case=False, na=False)
benign_mask = df["ClinicalSignificance"].str.contains("Benign", case=False, na=False)

df_pathogenic = df[pathogenic_mask]
df_benign = df[benign_mask]

print("Pathogenic-like count:", len(df_pathogenic))
print("Benign-like count:", len(df_benign))


Pathogenic-like count: 172224
Benign-like count: 544956


In [8]:
# Sample 100 from each group
path_sample = df_pathogenic.sample(n=100, random_state=42)
ben_sample = df_benign.sample(n=100, random_state=42)

# Combine
df_final = pd.concat([path_sample, ben_sample]).reset_index(drop=True)

# Optional: add a simple binary label column
def simplify_label(x):
    x = x.lower()
    if "pathogenic" in x:
        return "Pathogenic"
    elif "benign" in x:
        return "Benign"
    else:
        return "Other"

df_final["SimpleLabel"] = df_final["ClinicalSignificance"].apply(simplify_label)

# Save final CSV for the project
df_final.to_csv("clinvar_llm_sample.csv", index=False)

df_final.head(), df_final["SimpleLabel"].value_counts()


(                                                Name GeneSymbol  \
 0          NM_000251.3(MSH2):c.1285C>T (p.Gln429Ter)       MSH2   
 1       NM_001134831.2(AHI1):c.1997A>T (p.Asp666Val)       AHI1   
 2         NM_000059.4(BRCA2):c.9025dup (p.Tyr3009fs)      BRCA2   
 3          NM_000238.4(KCNH2):c.545C>A (p.Ser182Ter)      KCNH2   
 4  NM_000419.5(ITGA2B):c.1366_1371del (p.Val456_A...     ITGA2B   
 
            ClinicalSignificance Chromosome      Start ReferenceAllele  \
 0                    Pathogenic          2   47672695              na   
 1  Pathogenic/Likely pathogenic          6  135759552              na   
 2                    Pathogenic         13   32953955              na   
 3                    Pathogenic          7  150655518              na   
 4             Likely pathogenic         17   44380901              na   
 
   AlternateAllele                                       ReviewStatus  \
 0              na                           reviewed by expert panel  

In [9]:
df_final.head(), df_final["SimpleLabel"].value_counts()


(                                                Name GeneSymbol  \
 0          NM_000251.3(MSH2):c.1285C>T (p.Gln429Ter)       MSH2   
 1       NM_001134831.2(AHI1):c.1997A>T (p.Asp666Val)       AHI1   
 2         NM_000059.4(BRCA2):c.9025dup (p.Tyr3009fs)      BRCA2   
 3          NM_000238.4(KCNH2):c.545C>A (p.Ser182Ter)      KCNH2   
 4  NM_000419.5(ITGA2B):c.1366_1371del (p.Val456_A...     ITGA2B   
 
            ClinicalSignificance Chromosome      Start ReferenceAllele  \
 0                    Pathogenic          2   47672695              na   
 1  Pathogenic/Likely pathogenic          6  135759552              na   
 2                    Pathogenic         13   32953955              na   
 3                    Pathogenic          7  150655518              na   
 4             Likely pathogenic         17   44380901              na   
 
   AlternateAllele                                       ReviewStatus  \
 0              na                           reviewed by expert panel  