In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

def _cell_text(row, position):
    """Return the stripped textContent of the row's <li> at 1-based `position`."""
    cell = row.find_element(By.CSS_SELECTOR, f"li:nth-child({position})")
    return cell.get_attribute("textContent").strip()


# Chrome options: hide the usual automation markers and send a desktop
# user-agent string. Headless mode is left disabled on purpose.
options = Options()
# options.add_argument("--headless")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)
all_projects = []

# try/finally guarantees the browser is closed even when loading or parsing
# raises; previously an exception left the Chrome process running.
try:
    driver.get("https://www.aiib.org/en/projects/list/index.html")
    # NOTE(review): this override is injected after driver.get() has returned,
    # so it only affects scripts that read navigator.webdriver from here on.
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    time.sleep(5)  # fixed pause before querying the table rows

    rows = driver.find_elements(By.CSS_SELECTOR, "ul.table-row.row")
    print("Total projects found:", len(rows))

    for row in rows:
        try:
            approval_year = _cell_text(row, 1)
            economy = _cell_text(row, 2)
            sector = _cell_text(row, 3)
            financing_type = _cell_text(row, 4)
            project_name = _cell_text(row, 5)
            # The link is optional: a row without an anchor keeps an empty URL.
            # `except Exception` (not a bare except) lets Ctrl-C / kernel
            # interrupts propagate instead of being swallowed.
            try:
                details_url = row.find_element(By.CSS_SELECTOR, "li:nth-child(5) a").get_attribute("href")
            except Exception:
                details_url = ""
            financing_amount = _cell_text(row, 6)
            status = _cell_text(row, 7)
            all_projects.append({
                "Approval Year": approval_year,
                "Economy": economy,
                "Sector": sector,
                "Financing Type": financing_type,
                "Project Name": project_name,
                "Details URL": details_url,
                "Financing Amount": financing_amount,
                "Status": status
            })
        except Exception as e:
            # A row missing any of the seven cells is reported and skipped.
            print("Error parsing row:", e)
finally:
    driver.quit()

df = pd.DataFrame(all_projects)
df.to_excel("aiib_projects_all_fixed_f5.xlsx", index=False)
print(f"Scraped {len(df)} projects saved to aiib_projects_all_fixed_f5.xlsx")


Total projects found: 380
Scraped 380 projects saved to aiib_projects_all_fixed_f5.xlsx


In [8]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

# ---- Setup ----
# Input: the project listing workbook; output: the same rows plus detail columns.
input_file = "aiib_projects_all_fixed_f5.xlsx"
output_file = "aiib_projects_with_details.xlsx"

options = Options()

# Command-line switches passed to Chrome. Headless mode is intentionally off;
# add "--headless" to this tuple to run without a visible window.
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Experimental options that remove the automation switch and extension.
for option_name, option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ('useAutomationExtension', False),
):
    options.add_experimental_option(option_name, option_value)

driver = webdriver.Chrome(options=options)

def extract_project_details(driver, detail_url):
    """Open one project detail page and scrape its summary, objective and contacts.

    Parameters
    ----------
    driver
        Selenium WebDriver used to load the page and query elements.
    detail_url : str
        URL of the project detail page.

    Returns
    -------
    dict
        Sixteen string fields: the ``Detail_*`` summary values, the objective
        text, and ``Team_Leader_*`` / ``Borrower_*`` contact fields. Any field
        that cannot be located is returned as an empty string.

    Notes
    -----
    Lookup failures inside the helpers are absorbed and yield empty strings;
    an exception raised by ``driver.get`` itself propagates to the caller.
    """
    # Open project detail page
    driver.get(detail_url)
    time.sleep(2)  # fixed pause before querying the page

    def get_summary_value(label):
        """Return the value of the summary item whose name equals `label` (case-insensitive)."""
        try:
            for div in driver.find_elements(By.CSS_SELECTOR, ".project-summary .summary-item"):
                if div.find_element(By.CLASS_NAME, "summary-item-name").text.strip().upper() == label.upper():
                    return div.find_element(By.CLASS_NAME, "summary-item-value").text.strip()
        except Exception:
            # `Exception` rather than a bare except so that KeyboardInterrupt
            # still stops a long scraping run.
            return ""
        return ""

    def get_paragraph_text(headline):
        """Return the text of the first paragraph section whose <h2> contains `headline`."""
        try:
            sections = driver.find_elements(By.CSS_SELECTOR, ".col-md-12.paragraph, .col-md-4.paragraph")
            for sec in sections:
                h2s = sec.find_elements(By.TAG_NAME, "h2")
                for h2 in h2s:
                    if headline.lower() in h2.text.lower():
                        # Strip only the first occurrence (the heading itself) so
                        # the same words appearing later in the body are kept.
                        return sec.text.replace(h2.text, "", 1).strip()
        except Exception:
            return ""
        return ""

    def get_person(section_headline):
        """Return ``(org, name, title, email)`` from the contact column mentioning `section_headline`.

        Lines containing "@" are taken as the email (the last such line wins);
        the remaining non-empty lines fill org, name and title in that order.
        """
        try:
            # Find the paragraph col that matches the section headline
            paragraphs = driver.find_elements(By.CSS_SELECTOR, ".col-md-4.paragraph")
            for para in paragraphs:
                if section_headline.lower() in para.text.lower():
                    lines = para.text.strip().split("\n")
                    name = ""
                    org = ""
                    title = ""
                    email = ""
                    # NOTE(review): if the section heading is itself the first
                    # line of para.text it will be stored as `org` — confirm
                    # against the live page markup.
                    for line in lines:
                        if "@" in line:
                            email = line
                        elif not org:
                            org = line
                        elif not name:
                            name = line
                        elif not title:
                            title = line
                    return org, name, title, email
        except Exception:
            return ("", "", "", "")
        return ("", "", "", "")

    # Extract main fields
    status = get_summary_value("STATUS")
    member = get_summary_value("MEMBER")
    sector = get_summary_value("SECTOR")
    es_category = get_summary_value("E&S CATEGORY")
    project_number = get_summary_value("PROJECT NUMBER")
    funding = get_summary_value("PROPOSED FUNDING AMOUNT")
    financing_type = get_summary_value("FINANCING TYPE")
    objective = get_paragraph_text("OBJECTIVE")
    tl_org, tl_name, tl_title, tl_email = get_person("PROJECT TEAM LEADER")
    br_org, br_name, br_title, br_email = get_person("BORROWER")

    return {
        "Detail_Status": status,
        "Detail_Member": member,
        "Detail_Sector": sector,
        "Detail_ES_Category": es_category,
        "Detail_Project_Number": project_number,
        "Detail_Proposed_Funding": funding,
        "Detail_Financing_Type": financing_type,
        "Detail_Objective": objective,
        "Team_Leader_Org": tl_org,
        "Team_Leader_Name": tl_name,
        "Team_Leader_Title": tl_title,
        "Team_Leader_Email": tl_email,
        "Borrower_Org": br_org,
        "Borrower_Name": br_name,
        "Borrower_Title": br_title,
        "Borrower_Email": br_email,
    }

# ---- Main Processing ----
# Column names filled with "" when a row has no usable URL or its page fails.
detail_columns = [
    "Detail_Status", "Detail_Member", "Detail_Sector", "Detail_ES_Category", "Detail_Project_Number",
    "Detail_Proposed_Funding", "Detail_Financing_Type", "Detail_Objective",
    "Team_Leader_Org", "Team_Leader_Name", "Team_Leader_Title", "Team_Leader_Email",
    "Borrower_Org", "Borrower_Name", "Borrower_Title", "Borrower_Email"]

details_rows = []
# try/finally closes the browser even if reading the workbook or the loop
# raises; previously driver.quit() was only reached on a fully clean run.
try:
    df = pd.read_excel(input_file)
    total = len(df)
    for i, row in df.iterrows():
        # Empty cells come back as NaN; str() turns that into "nan", which
        # fails the "http" prefix check and falls through to the blank record.
        url = str(row.get("Details URL") or "")
        if url and url.startswith("http"):
            print(f"[{i+1}/{total}] Extracting: {url}")
            try:
                detail_data = extract_project_details(driver, url)
            except Exception as e:
                # One failing page must not discard the rows already scraped:
                # report it and keep a blank record so row alignment holds.
                print("Error extracting details:", e)
                detail_data = {k: "" for k in detail_columns}
        else:
            detail_data = {k: "" for k in detail_columns}
        details_rows.append(detail_data)
finally:
    driver.quit()

# One detail record per input row, so a positional column-wise concat lines up.
details_df = pd.DataFrame(details_rows)
full_df = pd.concat([df, details_df], axis=1)
full_df.to_excel(output_file, index=False)
print("Done! Saved:", output_file)


[1/380] Extracting: https://www.aiib.org/en/projects/details/2025/proposed/turkiye-vakifbank-climate-transition-reconstruction-facility.html
[2/380] Extracting: https://www.aiib.org/en/projects/details/2025/proposed/kazakhstan-inclusive-and-sustainable-economic-growth-development-policy-operation.html
[3/380] Extracting: https://www.aiib.org/en/projects/details/2025/proposed/uzbekistan-telecom.html
[4/380] Extracting: https://www.aiib.org/en/projects/details/2023/approved/India-Manipur-Urban-Road-Drainage-and-Asset-Management-Improvement-Project.html
[5/380] Extracting: https://www.aiib.org/en/projects/details/2024/approved/Georgia-Tbilisi-Metro-Modernization-Project.html
[6/380] Extracting: https://www.aiib.org/en/projects/details/2025/approved/brazil-sicredi-green-loan.html
[7/380] Extracting: https://www.aiib.org/en/projects/details/2024/approved/multicountry-green-energy-capacity-expansion.html
[8/380] Extracting: https://www.aiib.org/en/projects/details/2025/approved/bangladesh-no

In [10]:
import pandas as pd
import re

# Load your Excel file
# NOTE(review): the detail-scraping cell writes "aiib_projects_with_details.xlsx"
# (no trailing "1"); confirm this is the intended input before a clean re-run.
df = pd.read_excel("aiib_projects_with_details1.xlsx")  # Replace with your file name

# Extract Year from 'Approval Year'
# astype(str) keeps the .str accessor usable if the column is read back as
# numbers; expand=False returns a Series (not a one-column DataFrame).
# Cells without a four-digit run become NaN.
df['Approval_Year_Clean'] = df['Approval Year'].astype(str).str.extract(r'(\d{4})', expand=False)

# Improved function to extract and format Financing Amount
def extract_amount(text):
    """Normalise a financing-amount cell to ``"<CUR> <amount> million"``.

    The input is converted to a lowercase string and searched for ``usd`` or
    ``eur`` followed by a number (digits, dots, commas) and the word
    ``million``. Commas are removed from the number. Returns ``None`` when no
    such pattern is present.
    """
    found = re.search(r'(usd|eur)\s*([\d.,]+)\s*million', str(text).lower())
    if found is None:
        return None
    code, figure = found.groups()
    return "{} {} million".format(code.upper(), figure.replace(',', ''))

# Add the normalised amount column: one extract_amount call per cell.
financing_raw = df['Financing Amount']
df['Financing_Amount_Clean'] = financing_raw.apply(extract_amount)

# Save cleaned data
cleaned_path = "cleaned_output_aiib_banking.xlsx"
df.to_excel(cleaned_path, index=False)

print("✅ Fixed and saved to '" + cleaned_path + "'")


✅ Fixed and saved to 'cleaned_output_aiib_banking.xlsx'
