## Requests

In [15]:
# simple web crawling, use 'requests'
# first import it (it is installed, but not loaded by default)
import requests

# Always pass a timeout: without one, requests can wait indefinitely on an unresponsive server.
response = requests.get("http://www.ufl.edu", timeout=10)
# Fail loudly on an HTTP error status (4xx/5xx) instead of printing an error page as if it were the result.
response.raise_for_status()
# response.content holds the raw bytes of the page; show only the beginning
# so the notebook is not flooded with the entire HTML document.
print(response.content[:500])

b'<!DOCTYPE html>\r\n<html lang="en" class="no-js">\r\n<head>\r\n  <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({\'gtm.start\':new Date().getTime(),event:\'gtm.js\'});var f=d.getElementsByTagName(s)[0],j=d.createElement(s),dl=l!=\'dataLayer\'?\'&l=\'+l:\'\';j.async=true;j.src=\'https://www.googletagmanager.com/gtm.js?id=\'+i+dl;f.parentNode.insertBefore(j,f);})(window,document,\'script\',\'dataLayer\',\'GTM-NZ4MS8Q\');</script>\r\n\t\r\n\t<meta charset="utf-8">\r\n\t<meta name="viewport" content="width=device-width,initial-scale=1">\r\n\t<meta http-equiv="X-UA-Compatible" content="IE=edge" />\r\n\t<title>University of Florida</title>\r\n  \r\n\t<link rel="stylesheet" href="/media/templates/uf2015/css/style.css">\r\n    <link rel="stylesheet" href="/media/wwwufledu/UFL-CSS.css">\r\n\r\n    \r\n\t<link rel="icon" href="/media/templates/uf2015/img/favicon.ico" >\r\n\t<link rel="apple-touch-icon" href="/media/templates/uf2015/img/favicon-180.png">\r\n\t<meta name="msapplication-Til

## Selenium

In [None]:
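# Preparation: install Selenium, download Chrome driver
# ------------------------------------------------------
# Install selenium (run once, from a terminal):
#   pip install selenium
# Chrome driver download page (pick the driver version that matches your installed Chrome version):
#   https://sites.google.com/a/chromium.org/chromedriver/downloads
#   (older link; drivers for recent Chrome versions are listed at https://googlechromelabs.github.io/chrome-for-testing/)
# Unzip the download and place chromedriver.exe in the same folder as the .ipynb file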

In [1]:
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# chromedriver is not on the system path, so tell Selenium where the executable
# lives on disk (here: in the same folder as this notebook)
chromedriver_path = r'chromedriver.exe'
driver = webdriver.Chrome(chromedriver_path)

In [2]:
# Load a page in the browser window controlled by `driver`
driver.get(url="http://www.ufl.edu")

In [3]:
# Close the browser and shut down the driver once you are done with it;
# later sections start a new driver when they need a browser again
driver.quit()

## Google search

In [16]:
# start a new browser session and open the Google home page
driver = webdriver.Chrome(r'chromedriver.exe')
driver.get(url="https://www.google.com")

In [17]:
# The Google search box is the input element with name="q".
# find_element(By.NAME, ...) is used instead of find_element_by_name: the
# find_element_by_* helpers are deprecated and missing from newer Selenium
# releases, while this form works in both old and new versions.
elem = driver.find_element(By.NAME, "q")
elem

<selenium.webdriver.remote.webelement.WebElement (session="70556f1953fb0f0f0c5dea0ee702658f", element="bf92ee22-89df-4d8c-b7d9-fc03be2cf3da")>

In [19]:
from selenium.webdriver.common.keys import Keys

# empty the search box, type the query, then press Return to submit it
search_query = "go gators"
elem.clear()
elem.send_keys(search_query)
elem.send_keys(Keys.RETURN)

In [None]:
# overview of selector functions:
# https://selenium-python.readthedocs.io/locating-elements.html

## Startengine.com

In [24]:
# start a new browser session and open the StartEngine overview page:
# https://www.startengine.com/explore
driver = webdriver.Chrome(r'chromedriver.exe')
driver.get(url="https://www.startengine.com/explore")

In [25]:
# first hyperlink: find_element (singular) returns only the first matching <a> element.
# The By.TAG_NAME form replaces find_element_by_tag_name, which newer Selenium
# releases no longer provide; this form works in both old and new versions.
el = driver.find_element(By.TAG_NAME, "a")
el.get_attribute("href")

'https://www.startengine.com/explore?utm_source=startengine&utm_medium=main_website&utm_campaign=nav_bar'

In [27]:
# all hyperlinks -- notice find_elements (plural, not find_element): it returns a list of all matches.
# The By.TAG_NAME form replaces find_elements_by_tag_name, which newer Selenium
# releases no longer provide; this form works in both old and new versions.
elements = driver.find_elements(By.TAG_NAME, "a")
# print only a slice of the links to keep the output short
for el in elements[15:30]:
    print(el.get_attribute("href"))

https://www.startengine.com/explore?utm_source=startengine&utm_medium=main_website&utm_campaign=nav_bar
https://www.startengine.com/forgot-password
https://www.startengine.com/signup
mailto:contact@startengine.com
https://www.startengine.com/login?redirect=%2Fexplore
https://www.startengine.com/login
https://www.startengine.com/terracycle
https://www.startengine.com/knightscope
https://www.startengine.com/dbg
https://www.startengine.com/startengine-crowdfunding
https://www.startengine.com/golfboard-2
https://www.startengine.com/liquidpiston
https://www.startengine.com/hylete-2019
https://www.startengine.com/parallel
https://www.startengine.com/oneroqspirits


## Crawl (many) urls

In [None]:
# Example: scraping the web
# load external file that has an identifier (unique number) and a url
# Load each url and write it to disk (with identifier as the file name)

In [22]:
import csv

# import the file with ids and urls ('|'-separated; the first row holds the column names)
# newline='' is what the csv module asks for when it is handed a file object,
# so that line endings inside the file are interpreted by the csv reader itself
with open('list.txt', newline='') as f:
    reader = csv.DictReader(f, delimiter='|', skipinitialspace=True)
    # pages is a list of dictionaries, where each dictionary has an 'id' and a 'url' key (see text file)
    pages = [dict(row) for row in reader]
pages
    

[{'id': '3', 'url': 'http://www.ufl.edu'},
 {'id': '2',
  'url': 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=aapl&Find=Search&owner=exclude&action=getcompany'},
 {'id': '1',
  'url': 'https://www.sec.gov/Archives/edgar/data/880417/000090631803000077/0000906318-03-000077.txt'}]

In [23]:
# Seconds to pause after each page load: gives late-loading content time to appear
# and keeps the request rate polite towards the servers being crawled.
PAUSE_SECONDS = 2
# Folder that receives one .html file per page
OUTPUT_DIR = "output"

# create the output folder if it does not exist yet
os.makedirs(OUTPUT_DIR, exist_ok=True)

# open a browser
driver = webdriver.Chrome(r'chromedriver.exe')

try:
    # loop through pages
    for page in pages:
        # go to url
        driver.get(page["url"])
        # Really pause here. driver.implicitly_wait() does not sleep: it only sets how
        # long element lookups keep retrying, so it never gave the page extra time.
        time.sleep(PAUSE_SECONDS)
        # grab contents (encoding deals with special characters)
        contents = driver.page_source.encode("utf-8")
        # write to the output folder, with the identifier as the file name
        filename = os.path.join(OUTPUT_DIR, "{}.html".format(page["id"]))
        # the with-block closes the file on exit, so no explicit close() is needed
        with open(filename, 'wb') as fd_html:
            fd_html.write(contents)
finally:
    # done: always close the browser, even when one of the pages fails
    driver.quit()

## In-class assignment

In [None]:
# Crawl the crowdfunding pages at https://www.startengine.com/explore
# Collect the urls of the different projects, go to each project page, and save it to disk.
# Hint: see the Startengine code snippet above, which gets all hyperlinks; can you get the 'parent' of each link and check its class?
# Search Google for "how to find the parent of an element selenium python".
# Use the get_attribute("class") function to get the class of an element (for a project link, the parent's class should be 'tombstone').