In [8]:
# Import necessary libraries
import selenium               # Selenium for web automation
import boto3                 # Boto3 for AWS SDK
import pandas as pd         # Pandas for data manipulation
import time                # Time for time-related functions

# Specific imports from selenium
from selenium import webdriver          # WebDriver for browser automation
from selenium.webdriver.common.by import By         # By for locating elements
from selenium.webdriver.chrome.service import Service   # Service for Chrome driver service
from webdriver_manager.chrome import ChromeDriverManager   # ChromeDriverManager for managing Chrome driver
from selenium.webdriver.support.ui import WebDriverWait    # WebDriverWait for waiting for elements
from selenium.webdriver.support import expected_conditions as EC   # EC for expected conditions
from time import sleep

In [9]:
# Scrape the NY Charities Bureau registry search results into a DataFrame.
#
# Flow: start Chrome, open the registry search form, submit a search with "0"
# in the EIN field, then walk the paginated result table and collect the text
# of every <td> cell, row by row. The browser is always shut down afterwards,
# whether the scrape succeeds or fails.
#
# Produces:
#   scraped_data - list of per-row cell-text lists (one entry per <tr>)
#   df           - DataFrame built from scraped_data, incomplete rows dropped
from selenium.common.exceptions import WebDriverException  # used to narrow the pagination handler

# Initialize the Chrome WebDriver service
chrome_service = Service(ChromeDriverManager().install())
browser = webdriver.Chrome(service=chrome_service)

# URL of the website to scrape
website_url = 'https://www.charitiesnys.com/RegistrySearch/search_charities.jsp'

# Initialize an empty list to store scraped data
scraped_data = []

try:
    # Navigate to the website
    browser.get(website_url)

    # Find the input element for EIN and send keys
    ein_input = browser.find_element(By.XPATH, '//*[@id="header"]/div[2]/div/table/tbody/tr/td[2]/div/div/font/font/font/font/font/font/table/tbody/tr[4]/td/form/table/tbody/tr[2]/td[2]/input[1]')
    ein_input.send_keys('0')

    # Click the search button
    search_button = browser.find_element(By.XPATH, '//*[@id="header"]/div[2]/div/table/tbody/tr/td[2]/div/div/font/font/font/font/font/font/table/tbody/tr[4]/td/form/table/tbody/tr[10]/td/input[1]')
    search_button.click()

    # Wait for the table to load
    sleep(4)
    table = browser.find_element(By.CSS_SELECTOR, 'table.Bordered')
    sleep(1)

    # Loop through pages
    while True:
        # Extract data from the table: one list of cell texts per <tr>.
        # list.append returns None, so its result is deliberately not kept.
        for row in table.find_elements(By.CSS_SELECTOR, 'tr'):
            scraped_data.append([cell.text for cell in row.find_elements(By.CSS_SELECTOR, 'td')])

        # Check for the next page link
        try:
            next_page_link = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div/table/tbody/tr/td[3]/div/div/span[2]/a[9]')))
            next_page_link.click()
            # Wait for the next page to load
            sleep(4)
            table = browser.find_element(By.CSS_SELECTOR, 'table.Bordered')
        except WebDriverException:
            # Stop if next page is not available. Any Selenium failure here
            # (timeout, missing element, failed click) ends pagination, but
            # KeyboardInterrupt and unrelated bugs are no longer swallowed.
            break
finally:
    # Always release the Chrome / chromedriver processes.
    browser.quit()

# Create a DataFrame from the scraped data with appropriate column names.
# Rows that yielded fewer than six cells (e.g. rows with no <td> at all) are
# left with missing values by the constructor and removed by dropna().
df = pd.DataFrame(
    scraped_data,
    columns=["Organization Name", "NY Reg #", "EIN", "Registrant Type", "City", "State"],
).dropna()

# Display the DataFrame
df.head()  # Let's have a look at the data before creating the CSV file and loading it into S3


Unnamed: 0,Organization Name,NY Reg #,EIN,Registrant Type,City,State
1,"""Forever Captain Poodaman"" The Ahmad Butler Fo...",48-07-16,843800926,NFP,PHILADELPHIA,PA
2,"""Incredibly Blessed"" Inc",49-54-61,842071758,NFP,STATEN ISLAND,NY
3,"""R"" S.U.C.C.E.S.S. Foundation Inc.",49-06-59,874012670,NFP,ROCHESTER,NY
4,"""Studio 5404"" Inc.",44-39-58,463180470,NFP,MASSAPAQUA,NY
5,"""THEY ARE HAITIAN"" FUND, INC.",20-63-46,300170128,NFP,HUDSON,NY


In [10]:
import boto3
import pandas as pd
import time

# Export the scraped DataFrame (`df`) as a timestamped CSV object in S3.

# Destination bucket
bucket_name = 'database-update-bucket-m10-assn-harshkumar-vaghmaria'

# Object key = fixed prefix + timestamp (YYYYMMDDHHMMSS) + ".csv"
filename_prefix = 'charities_bureau_scrape_'
current_datetime = time.strftime("%Y%m%d%H%M%S")
s3_key = f"{filename_prefix}{current_datetime}.csv"

# Handle to the target object; nothing is sent until put() is called
s3 = boto3.resource('s3')
s3_object = s3.Object(bucket_name, s3_key)

# Serialize the frame without its index; to_csv returns the CSV text when
# no path is given, and that text is sent as the object body
csv_buffer = df.to_csv(index=False)
s3_object.put(Body=csv_buffer)

# Report where the file landed
print("File successfully uploaded to S3 location: s3://{}/{}".format(bucket_name, s3_key))


File successfully uploaded to S3 location: s3://database-update-bucket-m10-assn-harshkumar-vaghmaria/charities_bureau_scrape_20240414232219.csv
