In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import pandas as pd

# Set up the Chrome WebDriver with ChromeDriverManager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Open the GSoC 2023 organizations page
driver.get('https://summerofcode.withgoogle.com/archive/2023/organizations')

# Set up WebDriverWait
wait = WebDriverWait(driver, 10)

# Prepare CSV file
with open('gsoc_organizations_2023.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Year', 'Organization'])  # Write header

    consecutive_empty_pages = 0

    while True:
        try:
            # Wait for the organization names to be present
            orgs = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'name')))
            num_orgs = len(orgs)
            print(f"Found {num_orgs} organizations on this page.")

            # Write each organization's name to the CSV
            for org in orgs:
                writer.writerow(['2023', org.text])
            
            # Check if the number of organizations is less than 50
            if num_orgs < 50:
                consecutive_empty_pages += 1
                print(f"Number of organizations is less than 50. Consecutive empty pages: {consecutive_empty_pages}")
                # If this condition is met twice in a row, stop
                if consecutive_empty_pages >= 2:
                    print("Terminating script after two consecutive pages with fewer than 50 organizations.")
                    break
            else:
                consecutive_empty_pages = 0  # Reset counter if the condition is not met

            # Find the "Next" button using the provided XPath
            try:
                next_button = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/app-root/app-layout/mat-sidenav-container/mat-sidenav-content/div/div/main/app-organizations/app-orgs-grid/section[2]/div/mat-paginator/div/div/div[2]/button[2]/span[3]')))
                driver.execute_script("arguments[0].click();", next_button)
                print("Clicked 'Next' button.")
            
                # Wait for the next page to load
                time.sleep(5)  # Adjust time if necessary
            
            except Exception as e:
                print(f"Error finding or clicking 'Next' button: {e}")
                break
            
        except Exception as e:
            print(f"Error finding organizations: {e}")
            break

print("CSV file 'gsoc_organizations_2023.csv' created successfully!")

# Close the browser
driver.quit()

# Attempt to load the CSV file with different encodings
try:
    df = pd.read_csv('gsoc_organizations_2023.csv', encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('gsoc_organizations_2023.csv', encoding='ISO-8859-1')
    except UnicodeDecodeError:
        df = pd.read_csv('gsoc_organizations_2023.csv', encoding='windows-1252')

# Remove duplicates from a specific column, e.g., 'Organization'
df_cleaned = df.drop_duplicates(subset='Organization')

# Save the cleaned data to a new CSV file
df_cleaned.to_csv('cleaned_file_2023.csv', index=False)

print("Duplicates removed and saved to 'cleaned_file.csv'")

Found 50 organizations on this page.
Clicked 'Next' button.
Found 50 organizations on this page.
Clicked 'Next' button.
Found 50 organizations on this page.
Clicked 'Next' button.
Found 21 organizations on this page.
Number of organizations is less than 50. Consecutive empty pages: 1
Clicked 'Next' button.
Found 21 organizations on this page.
Number of organizations is less than 50. Consecutive empty pages: 2
Terminating script after two consecutive pages with fewer than 50 organizations.
CSV file 'gsoc_organizations_2023.csv' created successfully!
Duplicates removed and saved to 'cleaned_file.csv'
