<a href="https://colab.research.google.com/github/kipsangmarion/webscraper/blob/main/WebScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [11]:
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
import numpy as np
import csv
import concurrent.futures
from re import sub
from decimal import Decimal

# Extracting the links to individual project details

In [12]:
# Replace with the URL of the website you want to scrape.
# The `{}` placeholder receives the page index produced by the loop below.
base_url = 'https://www.thegef.org/projects-operations/database?f%5B0%5D=focal_areas%3A2207&search=&page={}'

# Number of pages to scrape
num_pages = 77

# Initialize project_links list
project_links = []

# Loop through pages
for page in range(num_pages):
    url = base_url.format(page)

    # A timeout keeps one stalled connection from hanging the whole run, and
    # a failed page is skipped so the links gathered so far are still saved.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Skipping page {page}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table that contains the links (you may need to inspect the HTML)
    table = soup.find('table')
    if table is None:
        # find() returns None when the page has no <table>; skip instead of
        # crashing on table.find_all below.
        print(f"No table found on page {page}")
        continue

    # Loop through rows in the table
    for row in table.find_all('tr'):
        # Rows without any <td> cells (e.g. header rows) carry no link
        td_elements = row.find_all('td')
        if not td_elements:
            continue

        # Assuming the links are in the first column (index 0)
        link = td_elements[0].find('a')
        link_text = link.get('href') if link is not None else None
        if link_text:
            project_links.append("https://www.thegef.org" + link_text)

# Create a DataFrame from project_links and save it to a CSV file
link_dataframe = pd.DataFrame(project_links)
link_dataframe.to_csv('project_links.csv', index=False, header=None)

print(project_links)

['https://www.thegef.org/projects-operations/projects/11318', 'https://www.thegef.org/projects-operations/projects/11317', 'https://www.thegef.org/projects-operations/projects/11315', 'https://www.thegef.org/projects-operations/projects/11312', 'https://www.thegef.org/projects-operations/projects/11310', 'https://www.thegef.org/projects-operations/projects/11309', 'https://www.thegef.org/projects-operations/projects/11306', 'https://www.thegef.org/projects-operations/projects/11305', 'https://www.thegef.org/projects-operations/projects/11303', 'https://www.thegef.org/projects-operations/projects/11302', 'https://www.thegef.org/projects-operations/projects/11285', 'https://www.thegef.org/projects-operations/projects/11284', 'https://www.thegef.org/projects-operations/projects/11280', 'https://www.thegef.org/projects-operations/projects/11279', 'https://www.thegef.org/projects-operations/projects/11278', 'https://www.thegef.org/projects-operations/projects/11277', 'https://www.thegef.org

# Extract project general info

Each project page is scraped inside a try/except block, so a failure on one project is reported and does not stop the run.

In [13]:
def _field_items_text(soup, field_class):
    """Return the stripped text of each ``field__item`` div in a field.

    The field is the first ``div`` carrying *field_class*. An empty list is
    returned when the page has no such div.
    """
    container = soup.find('div', class_=field_class)
    if container is None:
        return []
    return [item.get_text(strip=True)
            for item in container.find_all('div', class_='field__item')]


def _nested_text(soup, outer_class, inner_class):
    """Return the stripped text of the first *inner_class* div found inside
    the first *outer_class* div, or None when either div is missing."""
    outer = soup.find('div', class_=outer_class)
    inner = outer.find('div', class_=inner_class) if outer is not None else None
    return inner.get_text(strip=True) if inner is not None else None


def _parse_amount(text):
    """Convert displayed amount text to a Decimal, or None if not parseable.

    Every character other than digits and dots is stripped before parsing.
    """
    if text is None:
        return None
    try:
        return Decimal(sub(r'[^\d.]', '', text))
    except ArithmeticError:
        # decimal.InvalidOperation (raised for '' or '1.2.3') is a subclass
        # of the built-in ArithmeticError.
        return None


# Function to scrape project information from a single link
def scrape_project_info(project_link, timeout=30):
    """Fetch one project page and extract its general information.

    Args:
        project_link: URL of the project detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys 'Title', 'Funding Institution',
        'Implementing Institution', 'Country', 'Region', 'Status',
        'Total Project Amount', 'Link to Project Documentation',
        'Start Date' and 'End Date'. A field that is absent from the page is
        None (single-valued fields) or an empty list (multi-valued fields),
        so one missing field no longer discards the whole record.
        Returns None when the page cannot be fetched or parsed; the error is
        printed rather than raised.
    """
    project_info = {}
    print(project_link)
    try:
        response = requests.get(project_link, timeout=timeout)
        # Without this check an HTTP error page would be parsed and yield a
        # record made up entirely of empty fields.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Project title
        title_element = soup.find('span', class_='field--name-title')
        project_info['Title'] = (
            title_element.get_text(strip=True) if title_element is not None else None
        )

        # Multi-valued fields
        project_info['Funding Institution'] = _field_items_text(
            soup, 'field--name-field-trust-fund-name')
        project_info['Implementing Institution'] = _field_items_text(
            soup, 'field--name-field-implementing-agencies')
        project_info['Country'] = _field_items_text(soup, 'field--name-field-country')
        project_info['Region'] = _field_items_text(soup, 'field--name-field-region')

        # Status
        project_info['Status'] = _nested_text(
            soup, 'field--name-field-latest-timeline-status', 'field__item')

        # Total amount = GEF project grant + co-financing. Left as None when
        # either component is missing or unparseable, so that a partial sum
        # is never reported as the total.
        gef_project_grant = _parse_amount(
            _nested_text(soup, 'views-field-field-gef-project-grant', 'field-content'))
        cofinancing = _parse_amount(
            _nested_text(soup, 'views-field-field-co-financing-total', 'field-content'))
        if gef_project_grant is None or cofinancing is None:
            project_info['Total Project Amount'] = None
        else:
            project_info['Total Project Amount'] = str(gef_project_grant + cofinancing)

        # Link to project documentation (items without an <a href> are skipped)
        documents_element = soup.find('div', class_='field--name-field-document-url')
        document_items = (
            documents_element.find_all('div', class_='field__item')
            if documents_element is not None else []
        )
        anchors = [item.find('a') for item in document_items]
        project_info['Link to Project Documentation'] = [
            anchor.get('href') for anchor in anchors
            if anchor is not None and anchor.get('href')
        ]

        # Start Date
        project_info['Start Date'] = _nested_text(
            soup, 'views-field-field-combined-project-appr-date', 'field-content')

        # End Date
        project_info['End Date'] = _nested_text(
            soup, 'views-field-field-combined-closing-date', 'field-content')

        return project_info
    except Exception as e:
        # Boundary handler: this function runs in worker threads, so failures
        # are reported and turned into None instead of propagating.
        print(f"Error scraping project: {e}")
        return None

# Size of the worker pool used to scrape project pages concurrently
num_threads = 8

# Collected results: one entry per project whose scrape returned a truthy value
project_info_list = []

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Queue one scraping task per project link, remembering which link each
    # future belongs to.
    future_to_link = {}
    for link in project_links:
        future_to_link[executor.submit(scrape_project_info, link)] = link

    # Handle results in completion order rather than submission order.
    for finished in concurrent.futures.as_completed(future_to_link):
        project_info = finished.result()
        if not project_info:
            # Failed scrapes come back as None and are dropped.
            continue
        project_info_list.append(project_info)
        print(project_info)

https://www.thegef.org/projects-operations/projects/11318
https://www.thegef.org/projects-operations/projects/11317
https://www.thegef.org/projects-operations/projects/11315
https://www.thegef.org/projects-operations/projects/11312https://www.thegef.org/projects-operations/projects/11310
https://www.thegef.org/projects-operations/projects/11309

https://www.thegef.org/projects-operations/projects/11306
https://www.thegef.org/projects-operations/projects/11305
https://www.thegef.org/projects-operations/projects/11303{'Title': 'Preparation of Belize’s First Biennial Transparency Report and a combined Second Biennial Transparency Report and Fifth National Communication (BTR1 and BTR2/NC5) to the United Nations Framework Convention on Climate Change (UNFCCC)', 'Funding Institution': ['GEF Trust Fund'], 'Implementing Institution': ['United Nations Environment Programme'], 'Country': ['Belize'], 'Region': ['Latin America and Caribbean'], 'Status': 'Project Approved', 'Total Project Amount': 

Hey

# Create dataframe and export to csv

Write the scraped records to a CSV file so that they can be downloaded.

In [14]:
# Column order of the exported file; these are the keys of each scraped record.
fieldnames = ["Title", "Funding Institution", "Implementing Institution", "Country", "Region", "Status", "Total Project Amount", "Link to Project Documentation", "Start Date", "End Date"]

# An explicit UTF-8 encoding keeps non-ASCII characters in the scraped text
# (e.g. typographic apostrophes in titles) from failing or being mis-encoded
# on platforms whose default encoding is not UTF-8. newline="" is what the
# csv module requires so it can control line endings itself.
with open("thegef_data.csv", "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csv_writer.writeheader()
    # Non-string values are stringified by the csv module with str().
    csv_writer.writerows(project_info_list)