In [76]:
import pandas as pd
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys

import os
import yaml
from types import SimpleNamespace

In [None]:
# Central settings shared by every step of the notebook.
config = {
    "general": {
        # Run mode flag.
        #   True  -> start a new scraping process from the input spreadsheet.
        #   False -> attempt to reprocess failed items from the JSON file.
        "new_scraping": True,
    },
    "files": {
        # Input spreadsheet (used if new_scraping is true).
        # Alternative input: "./data/ei_and_programs.csv"
        "input_csv": "./data/test.csv",

        # Final output: the data enriched with project descriptions.
        "final_enriched_json": "./data/sucupira_data_projects.json",

        # Where intermediate progress is written.
        "intermediate_json": "./data/intermediate_data.json",
    },
    "scrapers": {
        "selenium": {
            # Query page of the Sucupira platform.
            "page_url": "https://sucupira-legado.capes.gov.br/sucupira/public/consultas/coleta/envioColeta/dadosFotoEnvioColeta.xhtml",

            # "1000" is the form value that corresponds to "All years".
            "collection_calendar_value": "1000",
        },
        "beautifulsoup": {
            # Seconds to pause between description requests, so the
            # website's server is not overloaded.
            "request_pause_seconds": 1,
        },
    },
    "max_retries": 5,
}

# Quick sanity check showing how individual values are looked up.
print("Page URL: " + config["scrapers"]["selenium"]["page_url"])
print("Input file: " + config["files"]["input_csv"])
print("Maximum retries:", config["max_retries"])

Page URL: https://sucupira-legado.capes.gov.br/sucupira/public/consultas/coleta/envioColeta/dadosFotoEnvioColeta.xhtml
Input file: ./data/test.csv
Maximum retries: 5


In [None]:

class SucupiraScraper:
    """A web scraper for the Brazilian Sucupira academic platform.

    This class automates the process of navigating the Sucupira platform,
    searching for specific postgraduate programs, and extracting detailed
    information, including program details and a full list of associated
    research projects. It handles dynamic page elements, pagination, and
    error logging.

    Attributes:
        config (dict): Configuration dictionary with URLs and other settings.
        driver (webdriver): The Selenium Firefox WebDriver instance.
        wait (WebDriverWait): Selenium wait object (25 second timeout) used
            for handling dynamic elements.
        collected_data (list): A list of dictionaries, where each dictionary
            holds the scraped data for a successful program.
        data_with_errors (list): A list of dictionaries for HEI/Program pairs
            that failed during scraping.
    """

    # CSS selector matching every data row of the research projects table.
    _PROJECT_ROWS_CSS = "#collapseProjetos table.table-bordered tbody tr"
    # CSS selector of the page-number dropdown shown for multi-page tables.
    _PAGINATION_CSS = "select[id$=':cmbPagina']"

    def __init__(self, config):
        """Initializes the SucupiraScraper instance.

        Starts a headless Firefox. The geckodriver resolved by
        ``GeckoDriverManager`` is tried first; if that fails for any reason
        the driver is created without an explicit service.

        Args:
            config (dict): A configuration dictionary containing necessary
                parameters like URLs and file paths.
        """
        self.config = config
        print("Initializing Selenium with Firefox...")

        options = Options()
        options.add_argument("--width=1920")
        options.add_argument("--height=1080")
        options.add_argument("--headless")

        try:
            service = Service(GeckoDriverManager().install())
            self.driver = webdriver.Firefox(service=service, options=options)
            print("✅ Firefox WebDriver initialized successfully.   ")
        except Exception as e:
            print(f"❌ Error initializing Firefox: {e}")
            print("Try using system geckodriver...")
            self.driver = webdriver.Firefox(options=options)

        self.wait = WebDriverWait(self.driver, 25)
        self.collected_data = []
        self.data_with_errors = []

    def _extract_header_info(self, label_text):
        """Extracts text from a div that follows a div containing a specific label.

        A helper utility to scrape data from the header section of the program page,
        which follows a 'label: value' pattern in the HTML structure.

        Args:
            label_text (str): The exact text of the <label> tag to find.

        Returns:
            str: The extracted text content, or "Not found" if the element
                 did not become visible before the wait timed out.
        """
        try:
            xpath = f"//label[text()='{label_text}']/parent::div/following-sibling::div"
            return self.wait.until(EC.visibility_of_element_located((By.XPATH, xpath))).text.strip()
        except TimeoutException:
            return "Not found"

    def _submit_search(self, hei_code, program_code):
        """Opens the query page, fills in the search form and submits it.

        Args:
            hei_code (str): The code for the Higher Education Institution.
            program_code (str): The code for the postgraduate program.

        Raises:
            NoSuchElementException: If no option of the program dropdown
                contains ``program_code``.
            TimeoutException: If a form element does not become available.
        """
        selenium_cfg = self.config['scrapers']['selenium']
        self.driver.get(selenium_cfg['page_url'])

        # The cookie banner is optional, so only a very short wait is spent on it.
        try:
            cookie_button = WebDriverWait(self.driver, 0.5).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='ACEITO']")))
            cookie_button.click()
        except TimeoutException:
            pass  # Cookie button not found, proceed

        calendar_value = selenium_cfg['collection_calendar_value']
        Select(self.wait.until(EC.element_to_be_clickable((By.ID, "form:j_idt33:calendarioid")))).select_by_value(calendar_value)

        # Select the HEI using the autocomplete.
        input_hei = self.wait.until(EC.element_to_be_clickable((By.ID, "form:j_idt33:inst:input")))
        input_hei.clear()
        input_hei.send_keys(hei_code)
        option_selector_xpath = f"//select[@id='form:j_idt33:inst:listbox']/option[starts-with(text(), '{hei_code}')]"
        time.sleep(1)  # brief pause to allow options to load
        self.wait.until(EC.element_to_be_clickable((By.XPATH, option_selector_xpath))).click()

        # Select the program once its dropdown holds more than one option.
        program_selector_name = "form:j_idt33:j_idt406"
        self.wait.until(lambda d: len(Select(d.find_element(By.NAME, program_selector_name)).options) > 1)
        select_program = Select(self.driver.find_element(By.NAME, program_selector_name))

        value_to_select = next((opt.get_attribute("value") for opt in select_program.options if program_code in opt.text), None)
        if not value_to_select:
            raise NoSuchElementException(f"Program '{program_code}' not found for HEI '{hei_code}'.")
        select_program.select_by_value(value_to_select)

        self.wait.until(EC.element_to_be_clickable((By.ID, "form:consultar"))).click()

    def _collect_program_info(self, hei_code, program_code):
        """Reads the program header fields and the institution table of the result page.

        Args:
            hei_code (str): The searched HEI code, copied into the result.
            program_code (str): The searched program code, copied into the result.

        Returns:
            dict: Program attributes. Fields that could not be located hold
                the string "Not found".
        """
        # Click the "Programa" details button to ensure its content is loaded
        program_details_button = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[href="#collapsePrograma"]')))
        self.driver.execute_script("arguments[0].scrollIntoView(true); arguments[0].click();", program_details_button)

        program_info = {
            'SEARCHED_HEI_CODE': hei_code,
            'SEARCHED_PROGRAM_CODE': program_code,
            'Institution': self._extract_header_info("Instituição de Ensino:"),
            'Program': self._extract_header_info("Programa:"),
            'Coordinator': self._extract_header_info("Coordenador(a):"),
            'Status': self._extract_header_info("Situação:"),
            'English_Name': self._extract_header_info("Nome em Inglês:"),
            'Basic_Area': self._extract_header_info("Área Básica:"),
            'Evaluation_Area': self._extract_header_info("Área de Avaliação:"),
            'Academic_Term': self._extract_header_info("Regime Letivo:"),
            'Modality': self._extract_header_info("Modalidade:")
        }

        # Abbreviation, city and state are the 2nd, 3rd and 4th cells of the
        # table that follows the "Instituições de Ensino" heading.
        try:
            hei_abbreviation_xpath = "//h1[contains(text(), 'Instituições de Ensino')]/following-sibling::div//table/tbody/tr/td[2]"
            city_xpath = "//h1[contains(text(), 'Instituições de Ensino')]/following-sibling::div//table/tbody/tr/td[3]"
            state_xpath = "//h1[contains(text(), 'Instituições de Ensino')]/following-sibling::div//table/tbody/tr/td[4]"

            program_info['HEI_Abbreviation'] = self.wait.until(EC.visibility_of_element_located((By.XPATH, hei_abbreviation_xpath))).text.strip()
            program_info['City'] = self.wait.until(EC.visibility_of_element_located((By.XPATH, city_xpath))).text.strip()
            program_info['State_UF'] = self.wait.until(EC.visibility_of_element_located((By.XPATH, state_xpath))).text.strip()
        except TimeoutException:
            program_info['HEI_Abbreviation'] = "Not found"
            program_info['City'] = "Not found"
            program_info['State_UF'] = "Not found"

        return program_info

    def _extract_project_rows(self):
        """Extracts the project rows of the currently visible table page.

        Rows with fewer than six cells are skipped. A row whose details link
        cannot be found is reported with a warning and skipped.

        Returns:
            list[dict]: One dictionary per extracted project row.
        """
        page_rows = []
        table_rows = self.wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, self._PROJECT_ROWS_CSS)))
        for row in table_rows:
            try:
                columns = row.find_elements(By.TAG_NAME, "td")
                if len(columns) < 6:
                    continue

                details_link = columns[5].find_element(By.TAG_NAME, 'a').get_attribute('href')
                page_rows.append({
                    'Project_Name': columns[0].text.strip(),
                    'Research_Line': columns[1].text.strip(),
                    'Concentration_Area': columns[2].text.strip(),
                    'Project_Nature': columns[3].text.strip(),
                    'Project_Status': columns[4].text.strip(),
                    'Details_Link': details_link
                })
            except (NoSuchElementException, IndexError) as e:
                print(f"  - Warning: error processing a table row: {e}")
        return page_rows

    def _collect_projects(self):
        """Opens the projects section and extracts the rows of every page.

        Returns:
            list[dict]: All project rows, in page order.

        Raises:
            TimeoutException: If the projects table, or a page change, does
                not show up in time. The caller then records the item as
                failed instead of storing a partial or duplicated list.
        """
        projects_button = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[href="#collapseProjetos"]')))
        self.driver.execute_script("arguments[0].scrollIntoView(true); arguments[0].click();", projects_button)

        # Only the lookup of the pagination dropdown may time out silently: its
        # absence means a single page. Timeouts while paging must propagate.
        try:
            pagination_select_element = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, self._PAGINATION_CSS)))
        except TimeoutException:
            print("-> Only one page of projects found. Extracting data...")
            return self._extract_project_rows()

        num_pages = len(Select(pagination_select_element).options)
        print(f"-> Found {num_pages} pages of projects.")

        all_research_project_rows = []
        for i in range(num_pages):
            print(f"  - Processing page {i + 1}/{num_pages}...")

            if i > 0:
                # Remember a row of the old page so its staleness signals that
                # the table has been re-rendered with the new page.
                old_first_row = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, self._PROJECT_ROWS_CSS)))
                # Re-locate the dropdown on every iteration before selecting.
                pagination_select = Select(self.driver.find_element(By.CSS_SELECTOR, self._PAGINATION_CSS))
                pagination_select.select_by_index(i)
                self.wait.until(EC.staleness_of(old_first_row))

            all_research_project_rows.extend(self._extract_project_rows())

        return all_research_project_rows

    def _process_item(self, hei_code, program_code):
        """Processes a single HEI/Program pair to scrape its data.

        Navigates to the page, fills out the form, extracts header
        information, and then scrapes all associated research projects,
        handling pagination within the projects table. Successful results are
        appended to `self.collected_data`; any exception is caught and the
        pair is appended to `self.data_with_errors` instead.

        Args:
            hei_code (str): The code for the Higher Education Institution.
            program_code (str): The code for the postgraduate program.
        """
        try:
            self._submit_search(hei_code, program_code)
            program_info = self._collect_program_info(hei_code, program_code)
            all_research_project_rows = self._collect_projects()

            program_info['Projects'] = all_research_project_rows
            self.collected_data.append(program_info)
            print(f"-> SUCCESS: Extracted {len(all_research_project_rows)} research projects from all pages.")

        except Exception as e:
            # An exception with an empty message has no first line; fall back
            # to its type name so the handler itself can never raise.
            message_lines = str(e).splitlines()
            first_line = message_lines[0] if message_lines else type(e).__name__
            error_msg = f"ERROR: {first_line}"
            print(f"-> {error_msg}")
            self.data_with_errors.append({
                'CD_INSTITUICAO_ENSINO': hei_code,
                'CD_PROGRAMA': program_code
            })

    def close_browser(self):
        """Closes the Selenium WebDriver instance.

        This should be called at the end of the scraping process to free up
        system resources.
        """
        print("\nClosing the browser...")
        self.driver.quit()

    def run(self, uncollected_data=None):
        """Executes the main scraping loop.

        This is the primary entry point for the scraper. It reads a list of
        HEI/Program pairs from a CSV file (or a provided DataFrame) and iterates
        through them, calling `_process_item` for each. After every tenth
        item the data collected so far is written to the intermediate JSON
        file. The browser is closed when the loop ends, including when it
        ends because of an exception.

        Args:
            uncollected_data (pd.DataFrame, optional): A DataFrame containing
                items that failed in a previous run. If provided, the scraper
                will only process these items. Defaults to None, in which case
                it reads from the initial input CSV file.

        Returns:
            tuple[list, list]: A tuple containing two lists:
                - The first list contains all successfully collected data.
                - The second list contains all items that resulted in an error.
        """
        try:
            if uncollected_data is not None:
                print("Restarting scraping for uncollected data...")
                search_df = uncollected_data
            else:
                search_df = pd.read_csv(self.config['files']['input_csv'], encoding='utf-8')

            total = len(search_df)
            print(f"Starting the search for {total} unique HEI/Program pairs...")

            # Count by position rather than by index label, so the progress
            # numbers stay correct for frames without a default RangeIndex.
            for position, (_, row) in enumerate(search_df.iterrows(), start=1):
                hei_code = str(row['CD_INSTITUICAO_ENSINO'])
                program_code = str(row['CD_PROGRAMA'])
                print(f"\n({position}/{total}) Processing HEI: {hei_code} | Program: {program_code}")
                self._process_item(hei_code, program_code)

                if position % 10 == 0:
                    print(f"\nProgress: {position}/{total} items processed.")
                    print(f"Collected so far: {len(self.collected_data)} items successfully.")
                    print(f"Items with errors so far: {len(self.data_with_errors)}\n")
                    with open(self.config['files']['intermediate_json'], 'w', encoding='utf-8') as f:
                        json.dump(self.collected_data, f, ensure_ascii=False, indent=4)

            print(f"\nScraping finished. {len(self.collected_data)} items collected successfully.")
        finally:
            # Always release the Firefox process, even if the input could not
            # be read or a progress file could not be written.
            self.close_browser()

        return self.collected_data, self.data_with_errors

In [None]:

def main():
    """
    Main function to orchestrate the scraping process, including retries for failed items.

    Runs one full scraping pass, then re-runs the scraper on the items that
    failed, up to ``config['max_retries']`` times. Items that still fail are
    written to 'persistent_errors_step1.json'. All successfully collected
    data is written to the path given by ``config['files']['intermediate_json']``.
    """
    # Maximum number of retry attempts for failed items.
    MAX_RETRIES = config['max_retries']
    # Use the configured path so this step writes exactly the file that the
    # scraper's progress saves use and that the enrichment step reads.
    intermediate_path = config['files']['intermediate_json']
    persistent_errors_path = 'persistent_errors_step1.json'

    print("--- STARTING STEP 1: SCRAPING RESEARCH PROJECTS (SELENIUM) ---")

    # Initial scraping run.
    scraper = SucupiraScraper(config=config)
    # The run method is called without arguments for the first run.
    collected_data, failed_items = scraper.run()

    # Loop to reprocess failed items from the initial run.
    attempt_count = 0
    # The loop continues as long as there are failed items and we haven't exceeded the retry limit.
    while failed_items and attempt_count < MAX_RETRIES:
        attempt_count += 1
        print(f"\n--- RESTARTING SCRAPING (ATTEMPT {attempt_count}/{MAX_RETRIES}) FOR {len(failed_items)} FAILED ITEMS ---")

        time.sleep(5)  # Pause before retrying to avoid hammering the server.

        # A new scraper instance is created for the retry attempt, because
        # run() closes its browser when it finishes.
        retry_scraper = SucupiraScraper(config=config)
        retry_df = pd.DataFrame(failed_items)

        # The run method is now called with the DataFrame of failed items.
        newly_collected_data, failed_items = retry_scraper.run(uncollected_data=retry_df)

        if newly_collected_data:
            print(f"Successfully collected {len(newly_collected_data)} new items.")
            # Add the newly collected data to the main list.
            collected_data.extend(newly_collected_data)
        else:
            print("No new data was collected in this attempt.")

    # After all retries, check if there are still items that could not be processed.
    if failed_items:
        print(f"After {MAX_RETRIES} attempts, {len(failed_items)} items still have errors and will be discarded.")
        # Save these persistent errors to a file for later analysis.
        with open(persistent_errors_path, 'w', encoding='utf-8') as f:
            json.dump(failed_items, f, ensure_ascii=False, indent=4)
        print(f"Persistent errors saved to '{persistent_errors_path}'.")

    # Save all successfully collected data to the intermediate JSON file.
    print(f"\nSaving a total of {len(collected_data)} successfully collected items...")
    with open(intermediate_path, 'w', encoding='utf-8') as f:
        json.dump(collected_data, f, ensure_ascii=False, indent=4)

    print(f"\nProcess finished. Final data saved to '{intermediate_path}'.")



In [80]:

# Entry point for Step 1: run the full scraping pipeline (initial pass plus
# retries) when this code is executed as the main module.
if __name__ == "__main__":
    main()

--- STARTING STEP 1: SCRAPING RESEARCH PROJECTS (SELENIUM) ---
Initializing Selenium with Firefox...
❌ Erro ao inicializar Firefox: response body:
{"message":"API rate limit exceeded for 189.122.185.124. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)","documentation_url":"https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}

request url:
https://api.github.com/repos/mozilla/geckodriver/releases/latest
response headers:
{'Date': 'Wed, 01 Oct 2025 17:59:08 GMT', 'Server': 'Varnish', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'", 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remainin

# Enrich Project Data by Scraping Details

This notebook is designed to read a list of projects from a JSON file, scrape detailed information for each project from its specific URL, and save the enriched data back into a new JSON file.

**Process:**
1.  **Load Data**: Reads an input JSON file (`intermediate_data.json`) containing project information, including a link to a details page.
2.  **Scrape Details**: For each project, it visits the `Details_Link` and extracts:
    * The project's full description.
    * A list of team members.
    * A list of funders.
3.  **Save Enriched Data**: Saves the combined data into a new output file (`sucupira_data_projects.json`, the `final_enriched_json` path set in the configuration).
4.  **Logging**: Progress, successes, warnings, and errors are printed to the cell output.

In [86]:
# --- Library Imports ---
import json
import requests
from bs4 import BeautifulSoup
import time

In [87]:
def _parse_four_column_table(soup, heading, field_names):
    """Parses the first table that follows the <h1> whose text equals `heading`.

    Only rows with exactly four cells are kept. Whitespace runs in the first
    cell are collapsed to single spaces.

    Args:
        soup (BeautifulSoup): The parsed page.
        heading (str): Exact text of the <h1> that precedes the table.
        field_names (sequence of str): The four keys, in column order, used
            for each row dictionary.

    Returns:
        list[dict]: One dictionary per kept row; empty if the heading, the
            table or its <tbody> is missing.
    """
    parsed_rows = []
    h1_tag = soup.find('h1', string=heading)
    if not h1_tag:
        return parsed_rows

    table = h1_tag.find_next('table')
    if not (table and table.tbody):
        return parsed_rows

    for row in table.tbody.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) != 4:
            continue
        cell_texts = [
            ' '.join(columns[0].get_text(strip=True).split()),
            columns[1].get_text(strip=True),
            columns[2].get_text(strip=True),
            columns[3].get_text(strip=True),
        ]
        parsed_rows.append(dict(zip(field_names, cell_texts)))
    return parsed_rows


def extract_project_data(url, max_retries=10, pause_between_retries=10):
    """
    Accesses a project URL with multiple retry attempts in case of an error,
    and extracts the description, members, and funders.

    Args:
        url (str): Address of the project details page.
        max_retries (int): Maximum number of request attempts. Defaults to 10.
        pause_between_retries (int | float): Seconds to wait between failed
            attempts. Defaults to 10.

    Returns:
        dict: A dictionary with the keys "Description" (str or None),
            "Members" (list of dicts) and "Funders" (list of dicts). If every
            attempt fails with a request error, "Description" holds a failure
            marker text and both lists are empty.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            extracted_data = {
                "Description": None,
                "Members": [],
                "Funders": []
            }

            # 1. Extract the Description: the text of the <div> that follows
            # the <div> enclosing the "Descrição:" label.
            description_label = soup.find('label', class_='control-label', string='Descrição:')
            if description_label:
                label_div = description_label.find_parent('div')
                # A label without an enclosing <div> yields no description
                # instead of raising AttributeError on None.
                content_div = label_div.find_next_sibling('div') if label_div else None
                if content_div:
                    extracted_data["Description"] = content_div.get_text(strip=True)

            # 2. Extract Members
            extracted_data["Members"] = _parse_four_column_table(
                soup, 'Membros', ("Name", "Category", "StartDate", "EndDate")
            )

            # 3. Extract Funders
            extracted_data["Funders"] = _parse_four_column_table(
                soup, 'Financiadores', ("Name", "FundingNature", "Start", "End")
            )

            print(f"  [✅ SUCCESS] Successfully extracted data from {url}")
            return extracted_data

        except requests.exceptions.RequestException as e:
            print(f"  [⚠️ WARNING] Attempt {attempt + 1}/{max_retries} failed for {url}. Error: {e}")
            if attempt < max_retries - 1:
                print(f"  ...retrying in {pause_between_retries} seconds...")
                time.sleep(pause_between_retries)
            else:
                print(f"  [❌ ERROR] All {max_retries} attempts failed. Giving up on this URL.")

    # Reached only when no attempt succeeded.
    return {
        "Description": "DATA COLLECTION FAILED AFTER MULTIPLE ATTEMPTS",
        "Members": [],
        "Funders": []
    }

In [88]:
def process_json_file(input_file, output_file, pause_seconds=0.5):
    """
    Reads a JSON file, enriches the data using the scraper, and saves it to a new file.

    Every project that has a non-empty 'Details_Link' is updated in place
    with the result of `extract_project_data`. Projects without a link are
    reported and left unchanged. Entries whose "Projects" value is missing
    or null are treated as having no projects.

    Args:
        input_file (str): Path of the JSON file to read. It is expected to
            hold a list of dictionaries, each optionally carrying a
            "Projects" list.
        output_file (str): Path of the JSON file to write.
        pause_seconds (int | float): Seconds to sleep after each scraped
            project. Defaults to 0.5.

    Returns:
        None. If the input file is missing or is not valid JSON, a message is
        printed and nothing is written.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            all_data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Fatal Error: Input file '{input_file}' not found. The script will be terminated.")
        return
    except json.JSONDecodeError:
        print(f"❌ Fatal Error: The file '{input_file}' is not a valid JSON. The script will be terminated.")
        return

    # `or []` also covers an explicit null "Projects" value.
    total_projects = sum(len(inst.get("Projects") or []) for inst in all_data)
    current_project = 0
    print(f"Total of {total_projects} projects to be processed.")

    for institution in all_data:
        for project in institution.get("Projects") or []:
            current_project += 1
            print(f"Processing project {current_project}/{total_projects}: {project.get('Project_Name', 'No Name')}")

            link = project.get("Details_Link")

            if link:
                scraped_data = extract_project_data(link)
                project.update(scraped_data)
                # Pause between requests so the server is not hammered.
                time.sleep(pause_seconds)
            else:
                print("  [⚠️ WARNING] Project without 'Details_Link'. Skipping.")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, indent=4, ensure_ascii=False)

    print(f"\n✅ Process complete! The full data has been saved to '{output_file}'.")

In [None]:
# Input: the intermediate JSON file path taken from the shared configuration.
input_json_file = config['files']['intermediate_json']
# Output: the enriched JSON file path taken from the shared configuration.
output_json_file = config['files']['final_enriched_json']

# Run the enrichment step: read the input file, scrape each project's
# details page, and write the combined data to the output file.
process_json_file(input_json_file, output_json_file)

Total of 139 projects to be processed.
Processing project 1/139: A reafirmação dos Direitos Fundamentais Sociais sob o olhar ético do Estado Democrático de Direito Agroambiental brasileiro
  [✅ SUCCESS] Successfully extracted data from https://sucupira-legado.capes.gov.br/sucupira/public/consultas/coleta/envioColeta/detalhesDados/viewProjetoPesquisa.xhtml?popup=true&idProjeto=9570471
  [✅ SUCCESS] Successfully extracted data from https://sucupira-legado.capes.gov.br/sucupira/public/consultas/coleta/envioColeta/detalhesDados/viewProjetoPesquisa.xhtml?popup=true&idProjeto=9570471
Processing project 2/139: “A teoria finalista mitigada enquanto mecanismo judicial de inclusão e efetividade do direito fundamental da proteção às relações de consumo”
Processing project 2/139: “A teoria finalista mitigada enquanto mecanismo judicial de inclusão e efetividade do direito fundamental da proteção às relações de consumo”
  [✅ SUCCESS] Successfully extracted data from https://sucupira-legado.capes.go