In [3]:
import json
import os
import glob

# 1. Point to the FOLDER name
folder_path = 'guidelineAnnotations.json'

# 2. Find all .json files inside that folder
json_files = glob.glob(os.path.join(folder_path, '*.json'))

print(f"üìÇ Found {len(json_files)} individual guideline files. Loading them now...")

guideline_data = []

# 3. Loop through every file and add it to our list
for file_path in json_files:
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
            guideline_data.append(data)
    except Exception as e:
        print(f"Skipped {file_path}: {e}")

print(f"‚úÖ Success! Loaded {len(guideline_data)} clinical guidelines into memory.")

# 4. Inspect the first one to make sure it looks right
if len(guideline_data) > 0:
    print("Example Data Keys:", guideline_data[0].keys())

üìÇ Found 217 individual guideline files. Loading them now...
‚úÖ Success! Loaded 217 clinical guidelines into memory.
Example Data Keys: dict_keys(['citations', 'guideline'])


In [4]:
# --- INSPECT DATA STRUCTURE ---
# Let's look inside the first guideline to see how to find Drugs and Rules.

first_guideline = guideline_data[0]['guideline']

print("üîë Keys inside 'guideline':", list(first_guideline.keys()))
print("\nüìã Example Name:", first_guideline.get('name', 'No Name Found'))

# Let's see if there is a 'relatedChemicals' or similar field
if 'relatedChemicals' in first_guideline:
    print("üíä Linked Drugs:", [d['name'] for d in first_guideline['relatedChemicals']])

üîë Keys inside 'guideline': ['objCls', 'id', 'name', 'alternateDrugAvailable', 'cancerGenome', 'crossReferences', 'dosingInformation', 'hasTestingInfo', 'history', 'literature', 'otherPrescribingGuidance', 'pediatric', 'recommendation', 'relatedAlleles', 'relatedChemicals', 'relatedGenes', 'source', 'summaryMarkdown', 'terms', 'textMarkdown', 'version']

üìã Example Name: Annotation of DPWG Guideline for thioguanine and NUDT15
üíä Linked Drugs: ['thioguanine']


In [5]:
# --- THE SEARCH ENGINE ---

def get_drug_guideline(drug_name):
    """
    Scans all loaded guidelines to find the one matching the drug.
    """
    drug_name = drug_name.lower()
    
    for entry in guideline_data:
        guideline = entry['guideline']
        
        # 1. Check the "Name" field (e.g., "Codeine and CYP2D6")
        if drug_name in guideline.get('name', '').lower():
            return guideline
            
        # 2. Check "Related Chemicals" list (More accurate)
        if 'relatedChemicals' in guideline:
            for chemical in guideline['relatedChemicals']:
                if drug_name in chemical.get('name', '').lower():
                    return guideline
                    
    return None

# --- TEST THE ENGINE ---
# Let's try to find the rules for "Codeine"
test_drug = "Codeine"
result = get_drug_guideline(test_drug)

if result:
    print(f"‚úÖ SUCCESS: Found CPIC Guideline for {test_drug}!")
    print(f"üìú Title: {result['name']}")
    print(f"üîó URL: {result.get('url', 'No URL')}")
else:
    print(f"‚ùå Error: Could not find {test_drug} in the database.")

‚úÖ SUCCESS: Found CPIC Guideline for Codeine!
üìú Title: Annotation of CPIC Guideline for codeine and CYP2D6
üîó URL: No URL


In [7]:
# --- INSPECT THE CONTENT (FIXED) ---

if result:
    print(f"üìò Name: {result.get('name')}")
    print("-" * 50)
    
    # 1. Extract Summary Text safely
    # The data is inside a dictionary key called 'html'
    summary_obj = result.get('summaryMarkdown', {})
    if isinstance(summary_obj, dict):
        summary_text = summary_obj.get('html', 'No summary text.')
    else:
        summary_text = str(summary_obj)
        
    print(f"üìù Summary:\n{summary_text}")
    print("-" * 50)
    
    # 2. Extract Full Text safely
    text_obj = result.get('textMarkdown', {})
    if isinstance(text_obj, dict):
        full_text = text_obj.get('html', 'No full text.')
    else:
        full_text = str(text_obj)
        
    print(f"üìñ Full Text Snippet:\n{full_text[:500]}...")

üìò Name: Annotation of CPIC Guideline for codeine and CYP2D6
--------------------------------------------------
üìù Summary:
<p>Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A label recommended age- or weight-specific dose of codeine is warranted for CYP2D6 normal and intermediate metabolizers.</p>

--------------------------------------------------
üìñ Full Text Snippet:
<p><em><strong>Note that the FDA released a <a rel="noopener noreferrer" href="https://www.fda.gov/Drugs/DrugSafety/ucm549679.htm?source=govdelivery&amp;utm_medium=email&amp;utm_source=govdelivery" target="_blank">safety announcement</a> on 4/20/2017 stating that codeine and tramadol should not be used in children under 12 years.</strong></em></p>
<p>This annotation is based on the <a rel="noopener noreferrer" href="https://cpicpgx.org/guidelines/guideline-for-codeine-and-cyp2d6/" target="_blank...


In [8]:
# --- FINAL MODULE: THE CLINICAL ADVISOR ---
import re

def clean_html(raw_html):
    """Removes <p>, <b>, and other HTML tags to make text readable."""
    clean_text = re.sub('<.*?>', '', raw_html) # Regex to kill HTML tags
    return clean_text

def analyze_patient_risk(drug_name, patient_phenotype):
    # 1. Get the Rule Book
    guideline = get_drug_guideline(drug_name)
    
    if not guideline:
        return {
            "Drug": drug_name,
            "Risk": "Unknown", 
            "Reason": f"No CPIC data found for {drug_name}"
        }
    
    # 2. Extract Text (Handling the dictionary structure)
    summary_obj = guideline.get('summaryMarkdown', {})
    summary_html = summary_obj.get('html', '') if isinstance(summary_obj, dict) else str(summary_obj)
    
    # 3. Clean the text for human reading
    clean_summary = clean_html(summary_html)
    
    # 4. INTELLIGENT MATCHING
    # We check if the patient's specific condition is mentioned in the warning.
    # We convert everything to lower case for comparison.
    
    search_term = patient_phenotype.lower()[:-1] # Remove last letter (metabolize vs metabolizer)
    
    if search_term in clean_summary.lower():
        status = "‚ö†Ô∏è HIGH RISK / ACTION REQUIRED"
        match_type = "Direct Match"
    else:
        status = "‚ÑπÔ∏è Standard Protocol"
        match_type = "General Guidance"
        
    return {
        "Drug": drug_name,
        "Patient Phenotype": patient_phenotype,
        "Assessment": status,
        "Guidance": clean_summary,
        "Source": "CPIC Clinical Guidelines (Level A)"
    }

# --- üß™ FINAL TEST: RUN THE SIMULATION ---

# Scenario 1: The "Danger" Patient
print("--- SCENARIO 1: Poor Metabolizer taking Codeine ---")
report1 = analyze_patient_risk("Codeine", "Poor Metabolizer")
print(f"üö® Result: {report1['Assessment']}")
print(f"üìñ Advice: {report1['Guidance']}")
print("\n")

# Scenario 2: A different drug
print("--- SCENARIO 2: Poor Metabolizer taking Clopidogrel ---")
report2 = analyze_patient_risk("Clopidogrel", "Poor Metabolizer")
print(f"üö® Result: {report2['Assessment']}")
print(f"üìñ Advice: {report2['Guidance']}")

--- SCENARIO 1: Poor Metabolizer taking Codeine ---
üö® Result: ‚ö†Ô∏è HIGH RISK / ACTION REQUIRED
üìñ Advice: Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A label recommended age- or weight-specific dose of codeine is warranted for CYP2D6 normal and intermediate metabolizers.



--- SCENARIO 2: Poor Metabolizer taking Clopidogrel ---
üö® Result: ‚ÑπÔ∏è Standard Protocol
üìñ Advice: The CPIC Dosing Guideline for clopidogrel recommends an alternative antiplatelet therapy for CYP2C19 poor or intermediate metabolizers (cardiovascular indications: prasugrel or ticagrelor if no contraindication; neurovascular indications: alternative P2Y12 inhibitor if clinically indicated and no contraindication.)



In [9]:
# --- EXPORT STEP: SAVE THE BRAIN FOR THE WEBSITE ---
import json

print("üöÄ Building the Production Database for Website...")

website_db = {}

for entry in guideline_data:
    # 1. Get the Raw Data
    guideline = entry.get('guideline', {})
    
    # 2. Find the Drug Name (Key)
    if 'relatedChemicals' in guideline:
        for chemical in guideline['relatedChemicals']:
            drug_name = chemical.get('name')
            
            if drug_name:
                # 3. Clean the HTML Advice
                summary_obj = guideline.get('summaryMarkdown', {})
                summary_html = summary_obj.get('html', '') if isinstance(summary_obj, dict) else str(summary_obj)
                clean_advice = clean_html(summary_html)
                
                # 4. Save to our simple dictionary
                website_db[drug_name] = {
                    "gene": "See CPIC", # We can refine this later
                    "advice": clean_advice,
                    "source": "CPIC Guidelines"
                }

# 5. Save to a file
with open('drug_db.json', 'w') as f:
    json.dump(website_db, f, indent=4)

print(f"‚úÖ SUCCESS! Saved {len(website_db)} drugs to 'drug_db.json'.")
print("üìÇ You can now move this file to your Website folder.")

üöÄ Building the Production Database for Website...
‚úÖ SUCCESS! Saved 208 drugs to 'drug_db.json'.
üìÇ You can now move this file to your Website folder.


In [10]:
# Testing few genes sample 

In [11]:
import random
import pandas as pd

# --- 1. SETUP: CREATE 20 MOCK PATIENTS ---
# We simulate a population with different metabolizer statuses

# Phenotypes to mix (weighted towards Normal, but with enough "Danger" cases for testing)
phenotypes = [
    "Normal Metabolizer", "Normal Metabolizer", "Normal Metabolizer", # Common
    "Intermediate Metabolizer", "Intermediate Metabolizer",
    "Poor Metabolizer",      # The "Danger" group
    "Ultrarapid Metabolizer" # The "Overdose" group
]

# Drugs to test (Must match names in your database)
test_drugs = ["Codeine", "Clopidogrel", "Warfarin", "Amitriptyline", "Paroxetine"]

simulation_data = []

print(f"üöÄ Starting Batch Simulation for 20 Patients...")

for i in range(1, 21):
    # 1. Randomly assign a Patient Profile
    patient_id = f"PATIENT_{i:02d}"
    assigned_phenotype = random.choice(phenotypes)
    assigned_drug = random.choice(test_drugs)
    
    # 2. Run the Logic Brain (The function we built earlier)
    # Note: We pass the phenotype directly since we are simulating the result of Module 1
    result = analyze_patient_risk(assigned_drug, assigned_phenotype)
    
    # 3. Store the result
    simulation_data.append({
        "Patient ID": patient_id,
        "Drug Prescribed": assigned_drug,
        "Genotype Status": assigned_phenotype,
        "Risk Assessment": result['Assessment'],
        "Clinical Guidance": result['Guidance'][:100] + "..." # Truncate for cleaner table
    })

# --- 2. DISPLAY RESULTS ---
# Convert to DataFrame for a professional medical report format
df_results = pd.DataFrame(simulation_data)

# Function to color-code the output (Red = Danger, Green = Safe)
def color_risk(val):
    color = 'green'
    font_weight = 'normal'
    
    if "HIGH RISK" in val:
        color = 'red'
        font_weight = 'bold'
    elif "Intermediate" in val or "Warning" in val:
        color = 'orange'
        font_weight = 'bold'
        
    return f'color: {color}; font-weight: {font_weight}'

# Display the styled table
print(f"‚úÖ Simulation Complete. Generated {len(df_results)} clinical reports.")
display(df_results.style.applymap(color_risk, subset=['Risk Assessment']))

üöÄ Starting Batch Simulation for 20 Patients...
‚úÖ Simulation Complete. Generated 20 clinical reports.


  display(df_results.style.applymap(color_risk, subset=['Risk Assessment']))


Unnamed: 0,Patient ID,Drug Prescribed,Genotype Status,Risk Assessment,Clinical Guidance
0,PATIENT_01,Codeine,Normal Metabolizer,‚ÑπÔ∏è Standard Protocol,Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A lab...
1,PATIENT_02,Amitriptyline,Normal Metabolizer,‚ÑπÔ∏è Standard Protocol,The CPIC Dosing Guideline update for amitriptyline recommends an alternative drug for CYP2D6 ultrara...
2,PATIENT_03,Codeine,Intermediate Metabolizer,‚ö†Ô∏è HIGH RISK / ACTION REQUIRED,Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A lab...
3,PATIENT_04,Warfarin,Ultrarapid Metabolizer,‚ÑπÔ∏è Standard Protocol,Patients with the VKORC1 rs9923231 TT genotype (-1639 AA genotype) should be given 60% of the standa...
4,PATIENT_05,Warfarin,Intermediate Metabolizer,‚ÑπÔ∏è Standard Protocol,Patients with the VKORC1 rs9923231 TT genotype (-1639 AA genotype) should be given 60% of the standa...
5,PATIENT_06,Codeine,Normal Metabolizer,‚ÑπÔ∏è Standard Protocol,Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A lab...
6,PATIENT_07,Codeine,Normal Metabolizer,‚ÑπÔ∏è Standard Protocol,Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A lab...
7,PATIENT_08,Amitriptyline,Intermediate Metabolizer,‚ö†Ô∏è HIGH RISK / ACTION REQUIRED,The CPIC Dosing Guideline update for amitriptyline recommends an alternative drug for CYP2D6 ultrara...
8,PATIENT_09,Amitriptyline,Ultrarapid Metabolizer,‚ÑπÔ∏è Standard Protocol,The CPIC Dosing Guideline update for amitriptyline recommends an alternative drug for CYP2D6 ultrara...
9,PATIENT_10,Codeine,Normal Metabolizer,‚ÑπÔ∏è Standard Protocol,Alternate non-tramadol analgesics are recommended for CYP2D6 ultrarapid and poor metabolizers. A lab...


In [13]:
# --- UNIVERSAL RISK ANALYZER (Define this FIRST) ---
import re

def clean_html(raw_html):
    """Removes HTML tags."""
    if not isinstance(raw_html, str): return ""
    return re.sub('<.*?>', '', raw_html)

def analyze_universal_risk(drug_name, patient_phenotype):
    # 1. Get the Rule Book
    guideline = get_drug_guideline(drug_name)
    
    if not guideline:
        return {
            "Drug": drug_name,
            "Assessment": "Unknown", 
            "Guidance": f"No CPIC data found for {drug_name}"
        }
    
    # 2. Extract & Clean Text
    summary_obj = guideline.get('summaryMarkdown', {})
    summary_html = summary_obj.get('html', '') if isinstance(summary_obj, dict) else str(summary_obj)
    clean_summary = clean_html(summary_html).lower()
    
    # 3. INTELLIGENT MATCHING (Expanded)
    # We create a list of "Danger Keywords" based on the patient's status
    
    patient_status = patient_phenotype.lower()
    
    # Logic A: Metabolism (e.g., "Poor Metabolizer")
    if "metabolizer" in patient_status:
        # Search for "poor", "rapid", "ultra" etc.
        keyword = patient_status.split(" ")[0] 
        
    # Logic B: Immune / Hypersensitivity (e.g., "HLA-B Positive")
    elif "positive" in patient_status:
        keyword = "positive"
        
    # Logic C: Transporter Function (e.g., "Poor Function")
    elif "function" in patient_status:
        keyword = "poor function" if "poor" in patient_status else "decreased"
        
    # Logic D: Deficiency (e.g., "G6PD Deficient")
    elif "deficient" in patient_status:
        keyword = "deficien" # catches "deficiency" and "deficient"
        
    else:
        keyword = patient_status # Fallback
        
    # 4. Check if the "Danger Keyword" appears in the warning text
    if keyword in clean_summary:
        risk_level = "üî¥ HIGH RISK / TOXICITY"
    else:
        risk_level = "üü¢ Standard Protocol"
        
    return {
        "Drug": drug_name,
        "Patient Phenotype": patient_phenotype,
        "Assessment": risk_level,
        "Guidance": clean_summary[:200] + "..." # Show first 200 chars
    }

In [14]:
# --- HIGH RISK VALIDATION SET ---

validation_cases = [
    # 1. DEADLY RASH (Immune)
    {"drug": "Abacavir", "phenotype": "HLA-B*57:01 Positive", "expected": "High Risk - Hypersensitivity"},

    # 2. BLEEDING RISK (Metabolism)
    {"drug": "Warfarin", "phenotype": "CYP2C9 Poor Metabolizer", "expected": "High Risk - Hemorrhage"},

    # 3. FATAL CHEMO TOXICITY (Enzyme Deficiency)
    {"drug": "Fluorouracil", "phenotype": "DPYD Poor Metabolizer", "expected": "Critical Risk - Toxicity"},

    # 4. OPIOID OVERDOSE (Metabolism Speed)
    {"drug": "Codeine", "phenotype": "Ultra-Rapid Metabolizer", "expected": "Critical Risk - Overdose"},

    # 5. THERAPEUTIC FAILURE (Prodrug Failure)
    {"drug": "Clopidogrel", "phenotype": "CYP2C19 Poor Metabolizer", "expected": "High Risk - Ineffective"}
]

print("üè• RUNNING CRITICAL VALIDATION TESTS...")
print(f"{'DRUG':<15} | {'PHENOTYPE':<25} | {'SYSTEM RESULT'}")
print("-" * 65)

for case in validation_cases:
    # Run your Universal Analyzer
    result = analyze_universal_risk(case['drug'], case['phenotype'])
    
    # Check if we caught it (Look for "HIGH" or "Warning")
    status = "‚úÖ CAUGHT" if "HIGH" in result['Assessment'] or "Positive" in result['Assessment'] else "‚ùå MISSED"
    
    print(f"{case['drug']:<15} | {case['phenotype']:<25} | {status} ({result['Assessment']})")

üè• RUNNING CRITICAL VALIDATION TESTS...
DRUG            | PHENOTYPE                 | SYSTEM RESULT
-----------------------------------------------------------------
Abacavir        | HLA-B*57:01 Positive      | ‚ùå MISSED (üü¢ Standard Protocol)
Warfarin        | CYP2C9 Poor Metabolizer   | ‚ùå MISSED (üü¢ Standard Protocol)
Fluorouracil    | DPYD Poor Metabolizer     | ‚úÖ CAUGHT (üî¥ HIGH RISK / TOXICITY)
Codeine         | Ultra-Rapid Metabolizer   | ‚ùå MISSED (üü¢ Standard Protocol)
Clopidogrel     | CYP2C19 Poor Metabolizer  | ‚úÖ CAUGHT (üî¥ HIGH RISK / TOXICITY)


In [15]:
#Test ended

In [16]:
def parse_genetic_file(file_content):
    """
    Simulates extracting rsIDs and Genotypes from a VCF.
    In a real app, this would use a library like 'pyVCF'.
    """
    # For the hackathon, we simulate finding a specific variant
    # Let's assume the file contains a CYP2D6 Poor Metabolizer variant
    extracted_data = {
        "gene": "CYP2D6",
        "rsID": "rs3892097",
        "genotype": "1/1",
        "phenotype": "Poor Metabolizer"
    }
    return extracted_data

In [17]:
def generate_medical_report(patient_name, drug_name, genetic_data):
    # 1. Run the Brain
    analysis = analyze_universal_risk(drug_name, genetic_data['phenotype'])
    
    # 2. Structure the Report Data
    report = {
        "header": {
            "report_id": "PGX-2026-001",
            "patient": patient_name,
            "date": "2026-02-19"
        },
        "genetic_finding": {
            "gene": genetic_data['gene'],
            "phenotype": genetic_data['phenotype'],
            "evidence_level": "CPIC Level A"
        },
        "clinical_result": {
            "drug": drug_name,
            "risk_level": analysis['Assessment'],
            "recommendation": analysis['Guidance']
        }
    }
    return report

# --- üß™ TEST THE FULL FLOW ---
patient_dna = parse_genetic_file("mock_vcf_data")
final_report = generate_medical_report("Zeeshan", "Codeine", patient_dna)

print("üìÑ GENERATED MEDICAL REPORT")
print(f"Patient: {final_report['header']['patient']}")
print(f"Result: {final_report['clinical_result']['risk_level']}")
print(f"Action: {final_report['clinical_result']['recommendation']}")

üìÑ GENERATED MEDICAL REPORT
Patient: Zeeshan
Result: üî¥ HIGH RISK / TOXICITY
Action: alternate non-tramadol analgesics are recommended for cyp2d6 ultrarapid and poor metabolizers. a label recommended age- or weight-specific dose of codeine is warranted for cyp2d6 normal and intermedia...
