In [1]:
# imports
import numpy as np
import pandas as pd

### Data Collection

In [2]:
import json
import os
import time
import sys
from Bio import Entrez

# Credentials used by Bio.Entrez for every request made in this cell.
# Both are blank in this file and need to be filled in before running queries.
# NOTE(review): NCBI's usage policy presumably expects a contact email and
# grants a higher request rate with an API key — confirm against the Entrez docs.
Entrez.email = ''
Entrez.api_key = ''

def fetch_pubmed_id(query, max_results=100):
    """Search PubMed and return the IDs of the matching articles.

    Args:
        query: Search term passed to ``Entrez.esearch``.
        max_results: Upper bound on the number of IDs requested (``retmax``).

    Returns:
        The ``"IdList"`` entry of the parsed search response.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return record["IdList"]

def fetch_pubmed_abstracts(id_list, batch_size=20):
    """Download article records from PubMed in batches and flatten them to dicts.

    Args:
        id_list: PubMed IDs (strings) to fetch.
        batch_size: Number of IDs sent per ``Entrez.efetch`` request.

    Returns:
        A list with one dict per article found under ``'PubmedArticle'``,
        holding the keys ``pmid``, ``title``, ``abstract``, ``mesh_terms``
        and ``source`` (the article's PubMed URL).
    """
    abstracts = []
    for start in range(0, len(id_list), batch_size):
        batch_ids = id_list[start:start + batch_size]
        handle = Entrez.efetch(
            db="pubmed", id=",".join(batch_ids), rettype="abstract", retmode="xml"
        )
        try:
            records = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the response fails.
            handle.close()

        for article in records['PubmedArticle']:
            citation = article['MedlineCitation']
            article_data = citation['Article']
            title = article_data.get('ArticleTitle', 'No Title')

            # The abstract may be a list of sections or a single string;
            # anything else is treated as "no abstract".
            abstract_data = article_data.get('Abstract', {}).get('AbstractText', '')
            if isinstance(abstract_data, list):
                abstract_text = ' '.join(str(part) for part in abstract_data)
            elif isinstance(abstract_data, str):
                abstract_text = abstract_data
            else:
                abstract_text = ''

            pmid = citation['PMID']
            # Convert to plain str, like every other field, so the stored
            # values do not depend on the parser's element types.
            mesh_terms = [
                str(mesh['DescriptorName'])
                for mesh in citation.get('MeshHeadingList', [])
            ]

            abstracts.append({
                "pmid": str(pmid),
                "title": str(title),
                "abstract": str(abstract_text),
                "mesh_terms": mesh_terms,
                "source": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
            })

        time.sleep(0.3)  # NCBI rate limits

    return abstracts

def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON.

    Missing parent directories are created first.

    Args:
        data: Any JSON-serializable object.
        output_file: Destination path; may be a bare filename.
    """
    parent_dir = os.path.dirname(output_file)
    # dirname() is '' for a bare filename, and os.makedirs('') raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

def fetch_and_save_pubmed_abstracts(query, max_results=100):
    """Run every search in ``SEARCH_QUERIES`` and save each result set as JSON.

    For each ``tag -> search string`` entry the matching abstracts are fetched,
    labelled with a ``source_type`` and written to ``data/processed/{tag}.json``.

    Args:
        query: Unused. The searches always come from ``SEARCH_QUERIES``; the
            parameter is kept so existing calls keep working.
        max_results: Maximum number of articles requested per search.
    """
    # Use a distinct loop name so the ``query`` parameter is not shadowed.
    for tag, search_term in SEARCH_QUERIES.items():
        ids = fetch_pubmed_id(search_term, max_results=max_results)
        print(f"Found {len(ids)} articles.")

        abstracts = fetch_pubmed_abstracts(ids)
        # Auto-tagging source type
        source_type = "Bangladesh-specific" if "Bangladesh" in search_term else "Global"
        for doc in abstracts:
            doc["source_type"] = source_type

        output_file = f"data/processed/{tag}.json"
        save_to_json(abstracts, output_file)
        
# Maps an output tag to the PubMed search string run for it.
# fetch_and_save_pubmed_abstracts uses the tag as the JSON file stem under
# data/processed/ and labels results "Bangladesh-specific" when the search
# string contains "Bangladesh", otherwise "Global".
SEARCH_QUERIES = {
    # Infectious Diseases (High Priority for Bangladesh)
    "dengue_bangladesh": "Dengue AND Bangladesh",
    "dengue_global": "Dengue AND (Treatment OR Guidelines)",
    "typhoid_bangladesh": "Typhoid Fever AND Bangladesh",
    "typhoid_global": "Typhoid Fever AND (Treatment OR Management)",
    "malaria_bangladesh": "Malaria AND Bangladesh",
    "malaria_global": "Malaria AND (Treatment OR Prevention)",
    "hepatitis_bangladesh": "Hepatitis AND Bangladesh",
    "hepatitis_global": "Hepatitis AND (Treatment OR Management)",
    "diarrhea_bangladesh": "Diarrhea AND Bangladesh",
    "diarrhea_global": "Diarrhea AND (Treatment OR Guidelines)",
    "tuberculosis_bangladesh": "Tuberculosis AND Bangladesh",
    "tuberculosis_global": "Tuberculosis AND (Treatment OR WHO Guidelines)",
    "cholera_bangladesh": "Cholera AND Bangladesh",
    "cholera_global": "Cholera AND (Management OR Treatment)",
    "leptospirosis_bangladesh": "Leptospirosis AND Bangladesh",
    "leptospirosis_global": "Leptospirosis AND Treatment",
    "leishmaniasis_bangladesh": "Leishmaniasis AND Bangladesh",
    "leishmaniasis_global": "Leishmaniasis AND Treatment",
    "influenza_bangladesh": "Influenza AND Bangladesh",
    "influenza_global": "Influenza AND Treatment",

    # Non-Communicable Diseases (NCDs)
    "diabetes_bangladesh": "Diabetes AND Bangladesh",
    "diabetes_global": "Diabetes AND (Management OR Treatment)",
    "hypertension_bangladesh": "Hypertension AND Bangladesh",
    "hypertension_global": "Hypertension AND Guidelines",
    "cardiovascular_bangladesh": "Cardiovascular Diseases AND Bangladesh",
    "cardiovascular_global": "Cardiovascular Diseases AND Treatment",
    "ckd_bangladesh": "Chronic Kidney Disease AND Bangladesh",
    "ckd_global": "Chronic Kidney Disease AND Management",
    "cancer_bangladesh": "Cancer AND Bangladesh",
    "cancer_global": "Cancer AND (Treatment OR Management)",

    # Maternal & Child Health
    "maternal_health_bangladesh": "Maternal Health AND Bangladesh",
    "maternal_health_global": "Maternal Health AND Guidelines",
    "neonatal_care_bangladesh": "Neonatal Care AND Bangladesh",
    "neonatal_care_global": "Neonatal Care AND WHO Guidelines",
    "malnutrition_bangladesh": "Malnutrition AND Bangladesh",
    "malnutrition_global": "Malnutrition AND Treatment",
    "immunization_bangladesh": "Vaccination AND Bangladesh",
    "immunization_global": "Immunization AND WHO Guidelines",

    # Public Health & Surveillance
    "surveillance_bangladesh": "Disease Surveillance AND Bangladesh",
    "surveillance_global": "Disease Surveillance AND WHO",
    "outbreak_management_bangladesh": "Outbreak Response AND Bangladesh",
    "outbreak_management_global": "Outbreak Response AND Guidelines",
    "health_policy_bangladesh": "Health Policy AND Bangladesh",
    "health_policy_global": "Health Policy AND Guidelines",

    # Drug & Treatment Protocols
    "amr_bangladesh": "Antibiotic Resistance AND Bangladesh",
    "amr_global": "Antimicrobial Resistance AND WHO Guidelines",
    "essential_medicines_bangladesh": "Essential Medicines AND Bangladesh",
    "essential_medicines_global": "Essential Medicines AND WHO Guidelines",
    "drug_pricing_bangladesh": "Drug Pricing AND Bangladesh",
    "drug_pricing_global": "Drug Pricing AND Policies",

    # General Bangladesh Healthcare Queries
    "healthcare_system_bangladesh": "Healthcare System AND Bangladesh",
    "primary_healthcare_bangladesh": "Primary Healthcare AND Bangladesh",
    "rural_health_services_bangladesh": "Rural Health Services AND Bangladesh",
    "community_health_workers_bangladesh": "Community Health Workers AND Bangladesh",

    # General Thematic Searches
    "thematic_infectious_diseases_bd": "Infectious Diseases AND Bangladesh",
    "thematic_ncd_bd": "Non-communicable Diseases AND Bangladesh",
    "thematic_public_health_guidelines_bd": "Public Health Guidelines AND Bangladesh",
    "thematic_disease_surveillance_reports_bd": "Bangladesh Disease Surveillance Reports",
}


In [3]:
import PyPDF2
import os

path = "/data/raw/9789240104907-eng.pdf" # temporary path for testing

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Extraction is best-effort: any error is reported on stdout and whatever
    text was collected before the failure is returned (an empty string when
    nothing could be read).
    """
    pieces = []
    try:
        with open(pdf_path, 'rb') as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                page_text = page.extract_text()
                # Pages without extractable text contribute nothing.
                pieces.append(page_text if page_text else "")
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return "".join(pieces)

def chunk_text(text, chunk_size=1000, overlap=100):
    """Split ``text`` into fixed-size chunks that overlap their neighbours.

    Args:
        text: The string to split. Empty or ``None`` yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one; must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in order. The last chunk may be shorter than
        ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` would prevent the window
            from advancing (the loop would otherwise never terminate).
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further chunk would
        # only repeat the tail already contained in this one.
        if end >= len(text):
            break
    return chunks

def process_pdf_to_chunks(pdf_path, chunk_size=1000, overlap=100):
    """Extract the text of the PDF at ``pdf_path`` and split it into chunks.

    ``chunk_size`` and ``overlap`` are forwarded unchanged to ``chunk_text``.
    """
    return chunk_text(extract_text_from_pdf(pdf_path), chunk_size, overlap)


print(process_pdf_to_chunks(path))

Error extracting text from /data/raw/9789240104907-eng.pdf: [Errno 2] No such file or directory: '/data/raw/9789240104907-eng.pdf'
[]


In [4]:
import os
import json
import uuid

# Import the generic PDF processing function
from scripts.preprocessing.pdf_to_text import process_pdf_to_chunks

# Define directories
RAW_PDF_DIR = "data/raw/"
PROCESSED_JSON_DIR = "data/processed"

def main():
    """Chunk every PDF in ``RAW_PDF_DIR`` and save the chunks as one JSON file.

    Each chunk becomes a document with a random UUID ``id``, the PDF's file
    stem as ``title`` and the chunk text as ``body``. All documents are
    written to ``who_guidelines.json`` inside ``PROCESSED_JSON_DIR``.
    """
    os.makedirs(PROCESSED_JSON_DIR, exist_ok=True)

    all_docs = []
    # Sort so the output order does not depend on the directory listing order.
    for filename in sorted(os.listdir(RAW_PDF_DIR)):
        # splitext removes only the final extension; the comparison is
        # case-insensitive so files such as "REPORT.PDF" are included too.
        title, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            continue

        pdf_path = os.path.join(RAW_PDF_DIR, filename)
        for chunk in process_pdf_to_chunks(pdf_path):
            all_docs.append({
                "id": str(uuid.uuid4()),
                "title": title,
                "body": chunk,
                "source": f"WHO Guidelines: {filename}",
                "language": "en",
                "source_type": "Global"
            })

    output_path = os.path.join(PROCESSED_JSON_DIR, "who_guidelines.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_docs, f, indent=4, ensure_ascii=False)

    print(f"Saved {len(all_docs)} chunks to {output_path}")



ModuleNotFoundError: No module named 'scripts'

In [19]:


# Extraction prompt that covers both brand-listing pages and single-medicine pages
user_prompt = """
Extract all available information about medicines from this page. 
If this is a brand listing page, extract:
- List of all brand names
- Associated generic names (if available)
- Manufacturers (if available)

If this is a specific medicine page, extract:
- Brand name of the medicine
- Generic name
- Strength of the medicine
- Manufacturer
- Dosage form
- Indications for use
- Pharmacology
- Dosage and administration instructions
- Precautions or warnings
- Side effects
- Storage conditions

Return the output as a JSON object. If information is not available, use null values.
"""


In [43]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time

# Base URL for the Medex website
BASE_URL = 'https://medex.com.bd'
BRANDS_URL = f'{BASE_URL}/brands?page='

# List to hold all the scraped medicine data
all_medicines = []

def clean_text(text):
    """Collapse all runs of whitespace in ``text`` into single spaces.

    Leading and trailing whitespace is removed. Returns ``None`` when there
    is no content left: ``None``, an empty string, or whitespace only.
    """
    if not text:
        return None
    # split() with no arguments already drops leading/trailing whitespace.
    collapsed = ' '.join(text.split())
    # Whitespace-only input collapses to '', which is reported as missing too.
    return collapsed or None

def find_section_content(soup, heading_text):
    """Return the cleaned text of the section introduced by ``heading_text``.

    The heading is looked up among ``h3``/``h4``/``h5`` elements. Its parent's
    next sibling is used as the section body, but only when that sibling has
    the ``ac-body`` class.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        heading_text: Heading label, e.g. ``'Side Effects'``.

    Returns:
        The section text with whitespace normalized by ``clean_text``, or
        ``None`` when the heading or its body cannot be found.
    """
    heading_tags = ['h3', 'h4', 'h5']

    # Prefer an exact match so that a short label such as 'Indications' is not
    # satisfied by a longer heading that merely contains it
    # ('Contraindications').
    heading = soup.find(heading_tags, string=lambda t: t and t.strip() == heading_text)
    if heading is None:
        # Fall back to the looser substring match for headings with extra text.
        heading = soup.find(heading_tags, string=lambda t: t and heading_text in t)
    if heading is None:
        return None

    content_div = heading.parent.find_next_sibling()
    if content_div is not None and 'ac-body' in content_div.get('class', []):
        # Use get_text() to retrieve all text nodes, including nested ones
        return clean_text(content_div.get_text(separator=' ', strip=True))
    return None

def extract_medicine_details(page_url):
    """
    Download one medicine brand page and collect its fields into a dictionary.

    Args:
        page_url (str): The URL of the specific medicine page.

    Returns:
        dict: The medicine's details; a field that is not present on the page
        is stored as None. Returns None when the request fails or an
        AttributeError occurs while reading the page.
    """
    print(f"Scraping medicine details from: {page_url}")

    def text_of(tag):
        # Cleaned text of a matched element, or None when nothing matched.
        return clean_text(tag.text) if tag else None

    # (result key, heading shown on the page) for the long-form sections,
    # listed in the order they are added to the result.
    section_headings = (
        ('indications', 'Indications'),
        ('pharmacology', 'Pharmacology'),
        ('dosage_and_administration', 'Dosage & Administration'),
        ('contraindications', 'Contraindications'),
        ('side_effects', 'Side Effects'),
        ('pregnancy_and_lactation', 'Pregnancy & Lactation'),
        ('precautions_and_warnings', 'Precautions & Warnings'),
        ('overdose_effects', 'Overdose Effects'),
    )

    try:
        response = requests.get(page_url, timeout=10)
        response.raise_for_status()  # 4xx/5xx responses become HTTPError
        soup = BeautifulSoup(response.content, 'html.parser')

        # The manufacturer name is the link inside the "Manufactured by" div.
        manufacturer_block = soup.find('div', title='Manufactured by')
        manufacturer_link = manufacturer_block.find('a') if manufacturer_block else None

        # The unit price is read from the second <span> of the package container.
        package_block = soup.find('div', class_='package-container')
        package_spans = package_block.find_all('span') if package_block else []
        unit_price_tag = package_spans[1] if len(package_spans) > 1 else None

        medicine_item = {
            'brand_name': text_of(soup.select_one('h1.page-heading-1-l')),
            'generic_name': text_of(soup.select_one('div[title="Generic Name"] a')),
            'manufacturer_name': text_of(manufacturer_link),
            'dosage_form': text_of(
                soup.select_one('h1.page-heading-1-l small[title="Dosage Form"]')
            ),
            'strength': text_of(soup.select_one('div[title="Strength"]')),
            'unit_price': text_of(unit_price_tag),
        }

        for key, heading in section_headings:
            medicine_item[key] = find_section_content(soup, heading)

        return medicine_item

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {page_url}: {e}")
        return None
    except AttributeError:
        print(f"Could not find all data on page: {page_url}. Skipping.")
        return None

def scrape_brand_page(page_number):
    """
    Scrapes a single page of medicine brands and extracts the links to each medicine.

    Args:
        page_number (int): The page number to scrape.

    Returns:
        tuple: ``(links, has_next_page)`` where ``links`` is a list of URLs of
        the individual medicine brand pages and ``has_next_page`` is True when
        the page contains a "next" pagination link. ``([], False)`` is
        returned when the request fails.
    """
    url = f"{BRANDS_URL}{page_number}"
    print(f"Scraping brands page: {url}")

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all medicine links on the page
        medicine_links = soup.select('a.hoverable-block')

        # Keep only anchors that actually carry an href; a missing attribute
        # would otherwise put None into the list and fail later when fetched.
        hrefs = (link.get('href') for link in medicine_links)
        links = [href for href in hrefs if href]

        # Check for pagination to see if there's a next page
        next_page_link = soup.select_one('a.page-link[rel="next"]')
        has_next_page = next_page_link is not None

        return links, has_next_page

    except requests.exceptions.RequestException as e:
        print(f"Error fetching brands page {page_number}: {e}")
        return [], False

def main_crawler(start_page=1, max_pages=None):
    """
    The main function to orchestrate the crawling process.

    Walks the brand listing page by page, scrapes every medicine linked from
    each page, appends the results to the module-level ``all_medicines`` list
    and finally writes that list to ``medex_data.json``.

    Args:
        start_page (int): The page number to start crawling from.
        max_pages (int, optional): The maximum number of pages to crawl,
            counted from ``start_page``. Crawls all pages if None (or 0).
    """
    page_number = start_page
    # Count pages actually crawled so the limit also holds when start_page > 1.
    pages_crawled = 0
    while True:
        # Scrape the current brands page
        medicine_urls, has_next_page = scrape_brand_page(page_number)

        # If no links are found, stop crawling
        if not medicine_urls:
            print("No more pages to crawl or an error occurred.")
            break

        # Iterate through the extracted medicine URLs and get detailed info
        for url in medicine_urls:
            details = extract_medicine_details(url)
            if details:
                all_medicines.append(details)
            time.sleep(1)  # Be polite to the server with a short delay

        pages_crawled += 1

        # Check if we should stop
        if not has_next_page or (max_pages and pages_crawled >= max_pages):
            print("Finished crawling all available pages or reached the page limit.")
            break

        page_number += 1
        time.sleep(2)  # Longer delay between pages to be more respectful

    # Save the scraped data to a JSON file
    with open('medex_data.json', 'w', encoding='utf-8') as f:
        json.dump(all_medicines, f, ensure_ascii=False, indent=4)

    print(f"Successfully scraped {len(all_medicines)} medicine brands and saved to medex_data.json")

if __name__ == "__main__":
    # You can set a max_pages limit for testing purposes, e.g., max_pages=5
    main_crawler(max_pages=1)


Scraping brands page: https://medex.com.bd/brands?page=1
Scraping medicine details from: https://medex.com.bd/brands/13717/3-bion-100-mg-tablet
Scraping medicine details from: https://medex.com.bd/brands/7695/3-c-200-mg-capsule
Scraping medicine details from: https://medex.com.bd/brands/7696/3-c-100-mg-suspension
Scraping medicine details from: https://medex.com.bd/brands/18731/3-c-400-mg-capsule
Scraping medicine details from: https://medex.com.bd/brands/9538/3-f-500-mg-tablet
Scraping medicine details from: https://medex.com.bd/brands/7697/3-geocef-200-mg-capsule
Scraping medicine details from: https://medex.com.bd/brands/7698/3-geocef-100-mg-suspension
Scraping medicine details from: https://medex.com.bd/brands/31993/3d-20000-iu-capsule
Scraping medicine details from: https://medex.com.bd/brands/31994/3d-40000-iu-capsule
Scraping medicine details from: https://medex.com.bd/brands/33499/3d-2000-iu-tablet
Scraping medicine details from: https://medex.com.bd/brands/7699/3rd-cef-200-mg-