<a href="https://colab.research.google.com/github/joshlevin91/Benefit-Corp-Web-Scraper/blob/master/BCorporationWebScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install Selenium and Chromedriver**

In [None]:
!pip install selenium
!apt-get update 
!apt install chromium-chromedriver

**Get benefit corporation information**

In [None]:
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Get company website
def getWebsite(url):
  """Return the first link found inside an 'opacity-60' div of the page at `url`.

  The page is downloaded with `requests` and parsed with BeautifulSoup
  (lxml parser). Each `div` with class `opacity-60` is inspected in
  document order; the `href` of the first anchor found is returned.
  Returns None when no such anchor exists.
  """
  page_html = requests.get(url).text
  parsed = BeautifulSoup(page_html, 'lxml')

  # First anchor (carrying an href) of each candidate div, in page order.
  anchors = (
    block.find('a', href=True)
    for block in parsed.find_all('div', {'class': 'opacity-60'})
  )
  first_link = next((anchor for anchor in anchors if anchor), None)
  if first_link is None:
    return None
  return first_link['href']

# Get region string
def getRegStr(region):
  """Return the human-readable name for a region given as a list of words.

  Args:
    region: Non-empty list of the words making up the region name as used
      in the search filter, e.g. ['United', 'States'] or ['Canada'].

  Returns:
    The words joined by single spaces. The region whose first word is
    "Netherlands" is special-cased to "The Netherlands" (it is listed as
    ['Netherlands', 'The'] for the search filter).

  Raises:
    IndexError: if `region` is empty.
  """
  if region[0] == "Netherlands":
    return "The Netherlands"
  # Join every word so names longer than two words are not truncated.
  return " ".join(region)

# Get bcorporation URL by region and page
def getURL(region, page):
  """Build the B Corp directory search URL for one region and result page.

  Args:
    region: List of the words making up the country name used by the
      search filter, e.g. ['United', 'States'].
    page: Result page number; converted with str().

  Returns:
    The search URL, sorted by latest certification and filtered to the
    given country. The country name is percent-encoded (spaces become
    %20), so names of any length and names containing reserved
    characters produce a valid query string.
  """
  base = 'https://www.bcorporation.net/en-us/find-a-b-corp/search?page='
  sort_and_filter = '&sortBy=companies-production-en-us-latest-certification-desc&refinement=countries%3D'
  # safe='' so characters such as '/' or '&' inside a name are encoded too.
  country = quote(' '.join(region), safe='')
  return base + str(page) + sort_and_filter + country

# Get company info at a URL
def getCompanyInfo(region, url):
  """Scrape one search-results page and return one row per listed company.

  Loads `url` in the module-level Selenium `driver`, parses the rendered
  HTML and, for every result item, collects its name, description and
  certification date. It then follows the item's link to look up the
  company's own website via `getWebsite`.

  Args:
    region: Label stored as the first column of every returned row.
    url: Search-results URL to load.

  Returns:
    A list of [region, name, website, description, cert] lists; empty
    when the page contains no result items.
  """
  cert_label = 'Certified since:'

  driver.get(url)
  time.sleep(0.3) # Make sure page loads
  html = driver.page_source
  soup = BeautifulSoup(html, "html.parser")

  # Iterate through each company on page
  info = []
  for company in soup.find_all("li", {"class":"ais-Hits-item"}):
    name = company.find("div", {"data-testid":"company-name"}).text
    description = company.find("p").text

    # Remove the label as an exact prefix. str.lstrip(chars) treats its
    # argument as a set of characters and would also eat leading
    # characters of the date itself if they happen to be in that set.
    cert = company.find("div", {"class":"flex-grow text-gray-dark"}).text.lstrip()
    if cert.startswith(cert_label):
      cert = cert[len(cert_label):].lstrip()

    # Go to company page to get its website
    a = company.find('a', href=True)
    bcorp_company_page = 'https://www.bcorporation.net' + a['href']
    website = getWebsite(bcorp_company_page)

    company_info = [region, name, website, description, cert]
    info.append(company_info)

  return info

# Get info on corporations located in specified regions (sorted by newest)
def getCorporationInfo(regions):
  """Collect company rows for every region, walking result pages in order.

  For each region, pages are requested starting at 1 and the rows
  returned by `getCompanyInfo` are accumulated until a page yields no
  rows. Progress is printed as it goes.

  Args:
    regions: Iterable of regions, each a list of name words as accepted
      by `getRegStr` and `getURL`.

  Returns:
    A single flat list with the rows of all regions, in request order.
  """
  collected = []

  for region in regions:
    label = getRegStr(region)
    print(f"Collecting data from {label}...")

    # Fetch pages until one comes back empty, which marks the end.
    page_number = 1
    rows = getCompanyInfo(label, getURL(region, page_number))
    while rows:
      print(f"Page {page_number}")
      collected.extend(rows)
      page_number += 1
      rows = getCompanyInfo(label, getURL(region, page_number))

    # Blank line separating the output of consecutive regions.
    print()

  return collected

# Initiate the webdriver (headless Chrome; the sandbox and /dev/shm flags
# are the usual settings for running Chrome inside a container).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Do not pass the driver path positionally: current Selenium 4 releases no
# longer accept it, and older releases already default to 'chromedriver'
# on PATH, so this call works with both.
driver = webdriver.Chrome(options=chrome_options)

# Get corporation info
regions = [['United', 'States'], ['United', 'Kingdom'], ['Canada'], ['Australia'], ['France'], ['Netherlands', 'The'], ['Italy'], ['Spain'], ['Germany']]
try:
  corporations = getCorporationInfo(regions)
finally:
  # quit() ends the whole browser session and the chromedriver process,
  # even when scraping fails part-way; close() only closes the window.
  driver.quit()

**Write benefit corporation information to a Google worksheet**

In [82]:
import time

from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()
gc = gspread.authorize(creds)

# Create the spreadsheet and keep the handle that create() returns.
# Re-opening it by title could select an older spreadsheet that happens
# to share the name 'B Corps' (for example from a previous run).
spreadsheet = gc.create('B Corps')
worksheet = spreadsheet.sheet1

# Write and format the header row
worksheet.insert_row(['Region', 'Name', 'Website', 'Description', 'Certification Date'])
worksheet.format('A1:E1', {'textFormat': {'bold': True}})

# Add corporation data to worksheet in one batched request instead of one
# request per row, which is slow and runs into the Sheets API rate limit.
if corporations:
  worksheet.append_rows(corporations)

**Go to https://sheets.google.com to find the new spreadsheet**