In [1]:
# imports
import numpy as np
import pandas as pd

### Data Collection

In [2]:
import json
import os
import time
import sys
from Bio import Entrez

Entrez.email = ''
Entrez.api_key = ''

def fetch_pubmed_id(query, max_results=100):
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()
    return record["IdList"] 

def fetch_pubmed_abstracts(id_list, batch_size=20):
    abstracts = []
    for start in range(0, len(id_list), batch_size):
        batch_ids = id_list[start:start+batch_size]
        ids = ",".join(batch_ids)
        handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="xml")
        records = Entrez.read(handle)
        handle.close()

        for article in records['PubmedArticle']:
            article_data = article['MedlineCitation']['Article']
            title = article_data.get('ArticleTitle', 'No Title')
            abstract_data = article_data.get('Abstract', {}).get('AbstractText', '')
            if isinstance(abstract_data, list):
                abstract_text = ' '.join([str(a) for a in abstract_data])
            elif isinstance(abstract_data, str):
                abstract_text = abstract_data
            else:
                abstract_text = ''

            pmid = article['MedlineCitation']['PMID']
            mesh_terms = [mesh['DescriptorName'] for mesh in article['MedlineCitation'].get('MeshHeadingList', [])]

            abstracts.append({
                "pmid": str(pmid),
                "title": str(title),
                "abstract": str(abstract_text),
                "mesh_terms": mesh_terms,
                "source": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
            })

        time.sleep(0.3)  # NCBI rate limits

    return abstracts

def save_to_json(data, output_file):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

def fetch_and_save_pubmed_abstracts(query, max_results=100):
    for tag, query in SEARCH_QUERIES.items():
        ids = fetch_pubmed_id(query, max_results=100)
        print(f"Found {len(ids)} articles.")

        abstracts = fetch_pubmed_abstracts(ids)
        # Auto-tagging source type
        source_type = "Bangladesh-specific" if "Bangladesh" in query else "Global"
        for doc in abstracts:
            doc["source_type"] = source_type

        output_file = f"data/processed/{tag}.json"
        save_to_json(abstracts, output_file)
        
SEARCH_QUERIES = {
    # Infectious Diseases (High Priority for Bangladesh)
    "dengue_bangladesh": "Dengue AND Bangladesh",
    "dengue_global": "Dengue AND (Treatment OR Guidelines)",
    "typhoid_bangladesh": "Typhoid Fever AND Bangladesh",
    "typhoid_global": "Typhoid Fever AND (Treatment OR Management)",
    "malaria_bangladesh": "Malaria AND Bangladesh",
    "malaria_global": "Malaria AND (Treatment OR Prevention)",
    "hepatitis_bangladesh": "Hepatitis AND Bangladesh",
    "hepatitis_global": "Hepatitis AND (Treatment OR Management)",
    "diarrhea_bangladesh": "Diarrhea AND Bangladesh",
    "diarrhea_global": "Diarrhea AND (Treatment OR Guidelines)",
    "tuberculosis_bangladesh": "Tuberculosis AND Bangladesh",
    "tuberculosis_global": "Tuberculosis AND (Treatment OR WHO Guidelines)",
    "cholera_bangladesh": "Cholera AND Bangladesh",
    "cholera_global": "Cholera AND (Management OR Treatment)",
    "leptospirosis_bangladesh": "Leptospirosis AND Bangladesh",
    "leptospirosis_global": "Leptospirosis AND Treatment",
    "leishmaniasis_bangladesh": "Leishmaniasis AND Bangladesh",
    "leishmaniasis_global": "Leishmaniasis AND Treatment",
    "influenza_bangladesh": "Influenza AND Bangladesh",
    "influenza_global": "Influenza AND Treatment",

    # Non-Communicable Diseases (NCDs)
    "diabetes_bangladesh": "Diabetes AND Bangladesh",
    "diabetes_global": "Diabetes AND (Management OR Treatment)",
    "hypertension_bangladesh": "Hypertension AND Bangladesh",
    "hypertension_global": "Hypertension AND Guidelines",
    "cardiovascular_bangladesh": "Cardiovascular Diseases AND Bangladesh",
    "cardiovascular_global": "Cardiovascular Diseases AND Treatment",
    "ckd_bangladesh": "Chronic Kidney Disease AND Bangladesh",
    "ckd_global": "Chronic Kidney Disease AND Management",
    "cancer_bangladesh": "Cancer AND Bangladesh",
    "cancer_global": "Cancer AND (Treatment OR Management)",

    # Maternal & Child Healt
    "maternal_health_bangladesh": "Maternal Health AND Bangladesh",
    "maternal_health_global": "Maternal Health AND Guidelines",
    "neonatal_care_bangladesh": "Neonatal Care AND Bangladesh",
    "neonatal_care_global": "Neonatal Care AND WHO Guidelines",
    "malnutrition_bangladesh": "Malnutrition AND Bangladesh",
    "malnutrition_global": "Malnutrition AND Treatment",
    "immunization_bangladesh": "Vaccination AND Bangladesh",
    "immunization_global": "Immunization AND WHO Guidelines",

    # Public Health & Surveillance
    "surveillance_bangladesh": "Disease Surveillance AND Bangladesh",
    "surveillance_global": "Disease Surveillance AND WHO",
    "outbreak_management_bangladesh": "Outbreak Response AND Bangladesh",
    "outbreak_management_global": "Outbreak Response AND Guidelines",
    "health_policy_bangladesh": "Health Policy AND Bangladesh",
    "health_policy_global": "Health Policy AND Guidelines",

    # Drug & Treatment Protocols
    "amr_bangladesh": "Antibiotic Resistance AND Bangladesh",
    "amr_global": "Antimicrobial Resistance AND WHO Guidelines",
    "essential_medicines_bangladesh": "Essential Medicines AND Bangladesh",
    "essential_medicines_global": "Essential Medicines AND WHO Guidelines",
    "drug_pricing_bangladesh": "Drug Pricing AND Bangladesh",
    "drug_pricing_global": "Drug Pricing AND Policies",

    # General Bangladesh Healthcare Queries
    "healthcare_system_bangladesh": "Healthcare System AND Bangladesh",
    "primary_healthcare_bangladesh": "Primary Healthcare AND Bangladesh",
    "rural_health_services_bangladesh": "Rural Health Services AND Bangladesh",
    "community_health_workers_bangladesh": "Community Health Workers AND Bangladesh",

    # General Thematic Searches
    "thematic_infectious_diseases_bd": "Infectious Diseases AND Bangladesh",
    "thematic_ncd_bd": "Non-communicable Diseases AND Bangladesh",
    "thematic_public_health_guidelines_bd": "Public Health Guidelines AND Bangladesh",
    "thematic_disease_surveillance_reports_bd": "Bangladesh Disease Surveillance Reports",
}


In [3]:
import PyPDF2
import os

path = "/data/raw/9789240104907-eng.pdf" # temporary path for testing

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                text += page.extract_text() or ""
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text

def chunk_text(text, chunk_size=1000, overlap=100):
    chunks = []
    if not text:
        return chunks
    
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

def process_pdf_to_chunks(pdf_path, chunk_size=1000, overlap=100):
    full_text = extract_text_from_pdf(pdf_path)
    return chunk_text(full_text, chunk_size, overlap)


print(process_pdf_to_chunks(path))

Error extracting text from /data/raw/9789240104907-eng.pdf: [Errno 2] No such file or directory: '/data/raw/9789240104907-eng.pdf'
[]


In [4]:
import os
import json
import uuid

# Import the generic PDF processing function
from scripts.preprocessing.pdf_to_text import process_pdf_to_chunks

# Define directories
RAW_PDF_DIR = "data/raw/"
PROCESSED_JSON_DIR = "data/processed"

def main():

    os.makedirs(PROCESSED_JSON_DIR, exist_ok=True)
    
    all_docs = []
    for filename in os.listdir(RAW_PDF_DIR):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(RAW_PDF_DIR, filename)
            
            chunks = process_pdf_to_chunks(pdf_path)
            
            for i, chunk in enumerate(chunks):
                doc_id = str(uuid.uuid4())
                all_docs.append({
                    "id": doc_id,
                    "title": filename.replace(".pdf", ""),
                    "body": chunk,
                    "source": f"WHO Guidelines: {filename}",
                    "language": "en",
                    "source_type": "Global"
                })
    
    output_path = os.path.join(PROCESSED_JSON_DIR, "who_guidelines.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_docs, f, indent=4, ensure_ascii=False)
    
    print(f"Saved {len(all_docs)} chunks to {output_path}")



ModuleNotFoundError: No module named 'scripts'

In [18]:
import os
import json
import asyncio
import nest_asyncio
from scrapegraphai.graphs import SmartScraperGraph

# Fix for asyncio event loop issues (especially in Jupyter notebooks)
nest_asyncio.apply()

# It is a best practice to store API keys as environment variables
GEMINI_APIKEY = "AIzaSyA_5Z72dRKCOAmBfLnLHMNBB2P7X-uYH9w"

graph_config = {
    "llm": { 
        "model": "gemini-pro",
        "api_key": GEMINI_APIKEY,
        "temperature": 0,
        "disable_streaming": True  # Fix for streaming argument warning
    },
    "verbose": True,
    "headless": True
}

# Example of a specific medicine page URL (you need to use actual medicine page URLs)
# This is just an example - replace with actual medicine page URLs
medex_url = "https://medex.com.bd/brands/"

# Updated prompt to be more flexible
user_prompt = """
Extract all available information about medicines from this page. 
If this is a brand listing page, extract:
- List of all brand names
- Associated generic names (if available)
- Manufacturers (if available)

If this is a specific medicine page, extract:
- Brand name of the medicine
- Generic name
- Strength of the medicine
- Manufacturer
- Dosage form
- Indications for use
- Pharmacology
- Dosage and administration instructions
- Precautions or warnings
- Side effects
- Storage conditions

Return the output as a JSON object. If information is not available, use null values.
"""

def scrape_medex_data(url, prompt):
    """
    Uses Scrapegraph-ai to scrape a specific URL based on a prompt.
    """
    try:
        scraper = SmartScraperGraph(
            prompt=prompt,
            source=url,
            config=graph_config
        )
        
        print(f"Scraping data from {url}...")
        
        # Alternative method to avoid asyncio issues
        try:
            result = scraper.run()
        except RuntimeError as e:
            if "asyncio.run() cannot be called from a running event loop" in str(e):
                # Try using asyncio directly
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                try:
                    result = loop.run_until_complete(scraper.arun())
                finally:
                    loop.close()
            else:
                raise e
        
        return result
    
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        return None

def main():
    # Run the scraping function
    scraped_data = scrape_medex_data(medex_url, user_prompt)
    
    if scraped_data:
        print("\nScraping successful. Here is the output:")
        print(json.dumps(scraped_data, indent=4, ensure_ascii=False))
        
        # Optionally save to file
        try:
            with open('medex_data.json', 'w', encoding='utf-8') as f:
                json.dump(scraped_data, f, indent=4, ensure_ascii=False)
            print("\nData saved to 'medex_data.json'")
        except Exception as e:
            print(f"Error saving file: {str(e)}")
    else:
        print("\nScraping failed. Please check:")
        print("1. Your API key is valid")
        print("2. The URL is accessible")
        print("3. Your internet connection")
        print("4. Try using a specific medicine page URL instead of the general brands page")

if __name__ == "__main__":
    main()

Unexpected argument 'streaming' provided to ChatGoogleGenerativeAI. Did you mean: 'disable_streaming'?
--- Executing Fetch Node ---
--- (Fetching HTML from: https://medex.com.bd/brands/) ---


Found providers ['google_genai'] for model gemini-pro, using google_genai.
If it was not intended please specify the model provider in the graph configuration
Scraping data from https://medex.com.bd/brands/...


--- Executing ParseNode Node ---
--- Executing GenerateAnswer Node ---
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised NotFound: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods..
Error during chain execution: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Error during scraping: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.

Scraping failed. Please check:
1. Your API key is valid
2. The URL is accessible
3. Your internet connection
4. Try using a specific medicine page URL instead of the general brands page
