In [None]:
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
import fitz  # PyMuPDF

# Paginated project listing; the placeholder is filled with the page number.
BASE_URL = "https://www.ndb.int/projects/all-projects/page/{}/"

# Browser-like headers sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": " ".join([
        "Mozilla/5.0",
        "(Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/115.0.0.0",
        "Safari/537.36",
    ]),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": ",".join([
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/webp",
        "image/apng",
        "*/*;q=0.8",
    ]),
    "Referer": "https://www.ndb.int/",
}

def scrape_projects_from_page(page_num):
    """Fetch one listing page and return its project cards as dicts.

    Args:
        page_num: Page number substituted into BASE_URL.

    Returns:
        A list of dicts with the keys "Project Name", "Project URL",
        "Country", "Sector", "Project Type" and "Date". Any piece missing
        from a card is stored as an empty string. Returns an empty list when
        the page contains no project cards.

    Raises:
        Whatever ``requests`` raises for a failed request, including the
        error from ``raise_for_status()`` on a non-success HTTP status.
    """
    url = BASE_URL.format(page_num)
    print(f"Scraping page {page_num}: {url}")
    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): a multi-word class_ string is matched against the exact
    # class attribute value, so a change in class order on the site would
    # make this return nothing -- confirm against the live markup.
    project_cards = soup.find_all("div", class_="project-card card card-transition")
    if not project_cards:
        print("No projects found on this page.")
        return []

    def _div_text(card, css_class):
        # Stripped text of the first matching <div>, or "" when absent.
        node = card.find("div", class_=css_class)
        return node.text.strip() if node else ""

    projects = []
    for card in project_cards:
        title_div = card.find("div", class_="project-card-title")
        a_tag = title_div.find("a") if title_div else None

        projects.append({
            "Project Name": a_tag.text.strip() if a_tag else "",
            # .get() avoids a KeyError on an anchor that has no href.
            "Project URL": a_tag.get("href", "") if a_tag else "",
            "Country": _div_text(card, "project-card-country"),
            "Sector": _div_text(card, "project-card-cat"),
            "Project Type": _div_text(card, "project-card-type"),
            "Date": _div_text(card, "project-card-date"),
        })
    return projects

def scrape_all_projects(start_page=1, end_page=12, delay=1):
    """Collect projects from consecutive listing pages.

    Pages ``start_page`` through ``end_page`` (inclusive) are scraped in
    order. Scraping stops at the first page that yields no projects. The
    function sleeps ``delay`` seconds after every page that did yield
    projects.

    Returns:
        One flat list with the project dicts of every scraped page.
    """
    collected = []
    for current_page in range(start_page, end_page + 1):
        batch = scrape_projects_from_page(current_page)
        if batch:
            collected += batch
            time.sleep(delay)
            continue
        # An empty page is treated as the end of the listing.
        print(f"No projects on page {current_page}, stopping early.")
        break
    return collected

def get_pdf_link(project_url):
    """Return the PDF download link found on a project page, or None.

    Args:
        project_url: URL of the project detail page.

    Returns:
        The ``href`` of the first ``<a>`` whose class attribute is
        "btn primary card-link download-icon external", or None when the URL
        is empty or not a string, the page cannot be fetched, or no such
        link exists. Errors are printed, never raised.
    """
    # Skip values that cannot be a URL (e.g. NaN from a blank spreadsheet
    # cell) instead of issuing a request that is bound to fail.
    if not isinstance(project_url, str) or not project_url.strip():
        print(f"Error fetching PDF link from {project_url}: empty or invalid URL")
        return None

    try:
        # Same timeout as extract_pdf_text so a stalled server cannot hang the run.
        response = requests.get(project_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        link = soup.find("a", class_="btn primary card-link download-icon external")
        if link and 'href' in link.attrs:
            return link['href']
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to None.
        print(f"Error fetching PDF link from {project_url}: {e}")
    return None

def extract_pdf_text(pdf_url):
    """Download a PDF and return the concatenated text of all its pages.

    Args:
        pdf_url: URL of the PDF document.

    Returns:
        The text of every page joined in page order, or an empty string when
        the URL is empty or not a string, or when the download or parsing
        fails. Errors are printed, never raised.
    """
    # Nothing to download for a blank or non-string value (e.g. NaN).
    if not isinstance(pdf_url, str) or not pdf_url.strip():
        print(f"Error extracting text from PDF {pdf_url}: empty or invalid URL")
        return ""

    try:
        response = requests.get(pdf_url, headers=HEADERS, timeout=20)
        response.raise_for_status()
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error extracting text from PDF {pdf_url}: {e}")
        return ""

def parse_pdf_text_flexible(text):
    """Split project-summary text into labelled fields.

    Each field's value is the text that follows its label (matched
    case-insensitively) up to the label of any later field in the list, or
    to the end of the text. Newlines inside a value are replaced by spaces
    and the value is stripped.

    Args:
        text: Raw text extracted from the PDF. A non-string is treated as
            empty text.

    Returns:
        A dict with one key per known field, in list order. A field whose
        label does not occur in the text maps to None.
    """
    fields = [
        "Project Name",
        "Country",
        "Type",
        "Area of Operation",
        "Concept Approval Date",
        "Total Project Cost",
        "Proposed Limit of NDB Financing",
        "Borrower",
        "Project Entity",
        "Project Context",
        "Project Objective",
        "Project Description",
    ]

    if not isinstance(text, str):
        text = ""

    parsed = {}
    for i, field in enumerate(fields):
        # Stop at any later label, or at end of text. The terminator list is
        # never empty: an empty alternative inside the lookahead would match
        # immediately and leave the last field's value blank.
        terminators = [re.escape(later) for later in fields[i + 1:]]
        terminators.append("$")
        pattern = re.escape(field) + r"\s*(.*?)(?=" + "|".join(terminators) + ")"
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        parsed[field] = match.group(1).strip().replace('\n', ' ') if match else None
    return parsed

def step1_scrape_projects_to_excel(start_page=1, end_page=12, output_excel="ndb_all_projects.xlsx"):
    """Stage 1: scrape the listing pages and write the result to Excel.

    Scrapes pages ``start_page``..``end_page`` through scrape_all_projects,
    writes the rows to ``output_excel`` without the index column, and
    returns that same path.
    """
    scraped_rows = scrape_all_projects(start_page, end_page)
    pd.DataFrame(scraped_rows).to_excel(output_excel, index=False)
    print(f"Step 1 complete: Scraped {len(scraped_rows)} projects and saved to '{output_excel}'")
    return output_excel

def step2_add_pdf_links_and_text(input_excel, output_csv="ndb_projects_with_pdf_text.csv", delay=1):
    """Stage 2: attach each project's PDF link and a text preview.

    Reads the Excel file from stage 1 and visits every "Project URL". The
    PDF link found there goes into a new "PDF URL" column ("" when none),
    and the first 1000 characters of the PDF text go into
    "PDF Text Snippet". The function sleeps ``delay`` seconds after each
    project and writes the result to ``output_csv``.

    Returns:
        ``output_csv`` on success, or None when the input has no
        'Project URL' column.
    """
    frame = pd.read_excel(input_excel)
    if 'Project URL' not in frame.columns:
        print("Error: 'Project URL' column not found in Excel.")
        return None

    row_count = len(frame)
    links = []
    snippets = []

    for position, page_url in enumerate(frame['Project URL'], start=1):
        print(f"[{position}/{row_count}] Processing project URL: {page_url}")
        found_link = get_pdf_link(page_url)

        if found_link:
            links.append(found_link)
            body = extract_pdf_text(found_link)
        else:
            # No link on the page: keep the row, leave both new cells blank.
            links.append("")
            body = ""
        snippets.append(body[:1000] if body else "")

        time.sleep(delay)

    frame['PDF URL'] = links
    frame['PDF Text Snippet'] = snippets
    frame.to_csv(output_csv, index=False)
    print(f"Step 2 complete: Added PDF URLs and text snippets saved to '{output_csv}'")
    return output_csv

def step3_parse_pdf_text_to_excel(input_csv, output_excel="ndb_projects_parsed.xlsx", delay=1):
    """Stage 3: parse every linked PDF into fields and write the final Excel file.

    Reads the CSV from stage 2. For each non-empty "PDF URL" it downloads
    the PDF, parses its text with parse_pdf_text_flexible, and sleeps
    ``delay`` seconds. Rows without a usable URL or without extractable text
    get None for every parsed field. The parsed columns are appended to the
    right of the input columns.

    A parsed column whose name already exists in the input (for example
    "Project Name") is written as "<name> (PDF)" so the output has no
    duplicate column labels.

    Args:
        input_csv: Path of the CSV produced by stage 2.
        output_excel: Path of the Excel file to write.
        delay: Seconds to sleep after each downloaded PDF.

    Returns:
        ``output_excel`` on success, or None when the input file is missing
        or has no 'PDF URL' column.
    """
    if not os.path.isfile(input_csv):
        print(f"Input file not found: {input_csv}")
        return None

    df = pd.read_csv(input_csv)
    if 'PDF URL' not in df.columns:
        print("Error: 'PDF URL' column missing in input CSV.")
        return None

    # Field names produced by parse_pdf_text_flexible, used for placeholder rows.
    field_names = (
        "Project Name", "Country", "Type", "Area of Operation", "Concept Approval Date",
        "Total Project Cost", "Proposed Limit of NDB Financing", "Borrower",
        "Project Entity", "Project Context", "Project Objective", "Project Description",
    )

    def _blank_row():
        # Placeholder keeping row alignment when a PDF yields nothing.
        return dict.fromkeys(field_names)

    parsed_rows = []
    total = len(df)

    for i, pdf_url in enumerate(df['PDF URL'], 1):
        # The isinstance check covers NaN (a float) from empty CSV cells and
        # any other non-string value that has no .strip().
        if not isinstance(pdf_url, str) or not pdf_url.strip():
            print(f"[{i}/{total}] Skipping empty PDF URL")
            parsed_rows.append(_blank_row())
            continue

        print(f"[{i}/{total}] Parsing PDF: {pdf_url}")
        text = extract_pdf_text(pdf_url)
        parsed_rows.append(parse_pdf_text_flexible(text) if text else _blank_row())
        time.sleep(delay)

    parsed_df = pd.DataFrame(parsed_rows)
    # Rename parsed columns that would duplicate an existing column label.
    clashes = {name: f"{name} (PDF)" for name in parsed_df.columns if name in df.columns}
    parsed_df = parsed_df.rename(columns=clashes)

    final_df = pd.concat([df.reset_index(drop=True), parsed_df.reset_index(drop=True)], axis=1)
    final_df.to_excel(output_excel, index=False)
    print(f"Step 3 complete: Parsed PDF text saved to '{output_excel}'")
    return output_excel

def main():
    """Run the three pipeline stages in order.

    Stage 3 is skipped when stage 2 does not return a CSV path.
    """
    # Adjust page ranges or file paths if needed
    projects_xlsx = step1_scrape_projects_to_excel(
        start_page=1,
        end_page=12,
        output_excel="ndb_all_projects.xlsx",
    )
    enriched_csv = step2_add_pdf_links_and_text(
        input_excel=projects_xlsx,
        output_csv="ndb_projects_with_pdf_text.csv",
    )
    if not enriched_csv:
        return
    step3_parse_pdf_text_to_excel(
        input_csv=enriched_csv,
        output_excel="ndb_projects_parsed.xlsx",
    )


if __name__ == "__main__":
    main()


Scraping page 1: https://www.ndb.int/projects/all-projects/page/1/
Scraping page 2: https://www.ndb.int/projects/all-projects/page/2/
Scraping page 3: https://www.ndb.int/projects/all-projects/page/3/
