In [3]:
import re
import time
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import pickle
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

### Problem 1
Modify `scrape_books()` so that it gathers the price for each fiction book and
returns the mean price, in £, of a fiction book.

In [42]:
def scrape_books(start_page = "index.html"):
    """ Crawl through http://books.toscrape.com and extract fiction data"""
    base_url="http://books.toscrape.com/catalogue/category/books/fiction_10/"
    titles = []
    fict_prices = []
    page = base_url + start_page                # Complete page URL.
    next_page_finder = re.compile(r"next")      # We need this button.
    
    current = None

    for _ in range(2):
        while current == None:                   # Try downloading until it works.
            # Download the page source and PAUSE before continuing.  
            page_source = requests.get(page).text
            time.sleep(1)           # PAUSE before continuing.
            soup = BeautifulSoup(page_source, "html.parser")
            current = soup.find_all(class_="product_pod")
            
        # Navigate to the correct tag and extract title.
        for book in current:
            price = book.find(class_="product_price").p.contents[0]
            price = re.sub("[^0-9\.]", r"", price)
            fict_prices.append(float(price))
            titles.append(book.h3.a["title"])
    
        # ind the URL for the page with the next data
        if "page-2" not in page:
            # Find the URL for the page with the next data.
            new_page = soup.find(string=next_page_finder).parent["href"]    
            page = base_url + new_page      # New complete page URL.
            current = None
    
    result = sum(fict_prices) / len(fict_prices)
    
    with open('ans1', 'wb') as fp:

               pickle.dump(result, fp)
            
    return result

In [43]:
scrape_books()

36.45550000000001

### Problem 2
Modify `bank_data()` so that it extracts the total consolidated assets ("Consol
Assets") for JPMorgan Chase, Bank of America, and Wells Fargo recorded each December from
2004 to the present. Return a list of lists where each list contains the assets of each bank.

In [6]:
def bank_data():
    """Crawl through the Federal Reserve site and extract bank data."""
    # Compile regular expressions for finding certain tags.
    link_finder = re.compile(r"December 31, (?!2003)")
    chase_bank_finder = re.compile(r"^JPMORGAN CHASE BK")
    BofA_finder = re.compile(r"^BANK OF AMER")
    wells_fargo_finder = re.compile(r"WELLS FARGO")

    # Get the base page and find the URLs to all other relevant pages.
    base_url="https://www.federalreserve.gov/releases/lbr/"
    base_page_source = requests.get(base_url).text
    base_soup = BeautifulSoup(base_page_source, "html.parser")
    link_tags = base_soup.find_all(name='a', href=True, string=link_finder)
    pages = [base_url + tag.attrs["href"] for tag in link_tags]

    # Crawl through the individual pages and record the data.
    chase_assets = []
    BofA_assets = []
    wf_assets = []
    for page in pages:
        time.sleep(1)               # PAUSE, then request the page.
        soup = BeautifulSoup(requests.get(page).text, "html.parser")

        # Find the tag corresponding to the banks' consolidated assets.
        chase_temp_tag = soup.find(name="td", string=chase_bank_finder)
        BofA_temp_tag = soup.find(name="td", string=BofA_finder)
        wf_temp_tag = soup.find(name="td", string=wells_fargo_finder)

        for _ in range(10):
            chase_temp_tag = chase_temp_tag.next_sibling
            BofA_temp_tag = BofA_temp_tag.next_sibling
            wf_temp_tag = wf_temp_tag.next_sibling
            
        # Extract the data, removing commas.
        chase_assets.append(int(chase_temp_tag.string.replace(',', '')))
        BofA_assets.append(int(BofA_temp_tag.string.replace(',', '')))
        wf_assets.append(int(wf_temp_tag.string.replace(',', '')))
        
    result = [chase_assets, BofA_assets, wf_assets]
        
    with open('ans2', 'wb') as fp:
               pickle.dump(result, fp)

    return result

In [7]:
bank_data()

[[2218960,
  2140778,
  2082803,
  1914658,
  2074952,
  1945467,
  1896773,
  1811678,
  1631621,
  1627684,
  1746242,
  1318888,
  1179390,
  1013985,
  967365],
 [1782639,
  1751524,
  1677490,
  1639305,
  1574093,
  1433716,
  1474077,
  1451969,
  1482278,
  1465221,
  1471631,
  1312794,
  1196124,
  1082243,
  771619],
 [1689351,
  1747354,
  1727235,
  1610580,
  1532784,
  1373600,
  1266125,
  1161490,
  1102278,
  608778,
  635476,
  467861,
  398671,
  403258,
  366256]]

### Problem 3
The Basketball Reference website at `https://www.basketball-reference.com`
contains data on NBA athletes, including which player led different categories for each season.
For the past ten seasons, identify which player had the most season points and find how many
points they scored during that season. Return a list of triples consisting of the season, the
player, and the points scored, ("season year", "player name", points scored).

In [8]:
def prob3():
    '''The Basketball Reference website at 
    https://www.basketball-reference.com} hosts data on NBA athletes, 
    including which player led different categories.
    For the past ten years, identify which player had the most season points.
    Return a list of triples, ("season year", "player name", points scored).
    '''
    
    TABLE_ELEM_ID = 'totals_stats'
    names  = []
    points = []
    years = range(2010, 2020)
    
    for year in years:
        url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html#totals_stats::pts'
        html = requests.get(url).text
        df = pd.read_html(html)[0]
        
        df = df[df.apply(lambda row: row['PTS'].isnumeric(), axis=1)]
        df['PTS'] = df['PTS'].astype(int)
        df = df.sort_values('PTS', ascending=False).reset_index(drop=True)
        names.append(df["Player"][0])
        points.append(df["PTS"][0])
        
    result = [(years[i], names[i], int(points[i])) for i in range(len(names))]
        
    with open('ans3', 'wb') as fp:
               pickle.dump(result, fp)
        
    return result

In [9]:
result = prob3()

In [10]:
print(type(result[0][2]))

<class 'int'>


### Problem 4
The website IMDB contains a variety of information on movies. Specifically,
information on the top 10 box offce movies of the week can be found at `https://www.imdb.
com/chart/boxoffice`. Using `BeautifulSoup`, `Selenium`, or both, return a list of the top 10
movies of the week and order the list according to the total grossing of the movies, from most
money to the least.

In [11]:
def prob4():
    """
    Sort the Top 10 movies of the week by Total Grossing, taken from 
    https://www.imdb.com/chart/boxoffice?ref_=nv_ch_cht.

    Returns:
        titles (list): Top 10 movies of the week sorted by total grossing
    """
    
    url = "https://www.imdb.com/chart/boxoffice"
    html = requests.get(url).text
    df = pd.read_html(html)[0]
    
    df["Gross"] = df["Gross"].replace('[\$M]', '', regex=True).astype(float)
    df = df.sort_values("Gross", ascending=False).reset_index(drop=True)
    
#     df = df[df.apply(lambda row: row['Gross'].isnumeric(), axis=1)]
#     df['Gross'] = df['Gross'].astype(int)
#     df = df.sort_values('Gross', ascending=False).reset_index(drop=True)

    result = list(df.Title)

    with open('ans4', 'wb') as fp:
               pickle.dump(result, fp)
    
    return result

In [12]:
prob4()

['The Lion King',
 'It Chapter Two',
 'Hustlers',
 'Good Boys',
 'Angel Has Fallen',
 'Downton Abbey',
 'Ad Astra',
 'Rambo: Last Blood',
 'Abominable',
 'Judy']

### Problem 5
The arXiv (pronounced "archive") is an online repository of scientific publications,
hosted by Cornell University. Write a function that accepts a string to serve as a search
query defaulting to linkedin. Use `Selenium` to enter the query into the search bar of `https:
//arxiv.org` and press Enter. The resulting page has up to 50 links to the PDFs of technical
papers that match the query. Gather these URLs, then continue to the next page (if there are
more results) and continue gathering links until obtaining at most 150 URLs. Return the list
of URLs.

In [13]:
def prob5(search_query):
    """Use Selenium to enter the given search query into the search bar of
    https://arxiv.org and press Enter. The resulting page has up to 25 links
    to the PDFs of technical papers that match the query. Gather these URLs,
    then continue to the next page (if there are more results) and continue
    gathering links until obtaining at most 100 URLs. Return the list of URLs.

    Returns:
        (list): Up to 100 URLs that lead directly to PDFs on arXiv.
    """
    # install chrome driver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    
    links = []
    # open chrome driver
#     driver = webdriver.Chrome()
    driver.get("https://arxiv.org")
    try:
        
        # Query search bar
        input_text = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "query"))
        )
        input_text.send_keys(search_query)
        
        # Enter Search Query
        search_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="header"]/div[2]/form/div/button'))
        )
        search_button.click()
        
        # get html
        html_text = driver.page_source
        
        while True:
            
            # only find up to 150 links
            if len(links) > 150:
                break
                
            # parse links
            soup = BeautifulSoup(html_text)
            anchors = soup.find_all('a', text=re.compile('pdf'))
            links += [ a['href'] for a in anchors ]
        
            # try to click the next button -- stop if fails
            try:
                button = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, '/html/body/main/div[2]/nav[1]/a[2]'))
                )
                button.click()
                time.sleep(5)
            except Exception:
                break
        
        
        time.sleep(5)
    finally:
        driver.quit()
        
    # return 150 links
    return links[:150]

In [14]:
result = prob5("linkedin")


Checking for win32 chromedriver:77.0.3865.40 in cache
Driver found in C:\Users\Kameron Lightheart\.wdm\chromedriver\77.0.3865.40\win32/chromedriver.exe


In [15]:
with open('ans5', 'wb') as fp:
               pickle.dump(result, fp)

In [16]:
import numpy as np
import pickle

def self_check():

    # Problem 1
    with open('ans1', 'rb') as fp:
        res = pickle.load(fp)
    assert type(res) == float

    # Problem2
    """
        extract the assets for 
        JPMORGAN CHASE BK NA
        BANK OF AMER NA
        WELLS FARGO BK NA
    """
    with open('ans2', 'rb') as fp:
        res = pickle.load(fp)
    assert type(res) == list
    assert len(res) == 3
    assert type(res[0]) == list
    assert len(res[0]) == 15
    assert type(res[0][0]) == int

    # Problem 3
    """ 
        make sure your first tuple in the list 
        aka res[0] corresponds to the season that ended in 2019, 
        I won't be checking how you stored the year I'll just be 
        expecting them to be in order with the last entry being 
        the season that ended in 2010 
    """
    with open('ans3', 'rb') as fp:
        res = pickle.load(fp)
    assert type(res) == list
    assert len(res) == 10
    assert type(res[0]) == tuple
    assert len(res[0]) == 3
    assert type(res[0][1]) == str    #player's name
    assert type(res[0][2]) == int    #points scored


    #Problem 4
    """ 
        the first movie in the list should be the movie with 
        the most money or that grossed the most money, 
        not sure how to say it 
    """
    with open('ans4', 'rb') as fp:
        res = pickle.load(fp)
    assert type(res) == list
    assert len(res) == 10
    assert (type(res[0]) == str or type(res[0]) == np.str_)


    #Problem 5
    """ 
        for this problem store the list that you get when 
        the default 'linkedin' search query is passed in. the first 
        element in your list should be 'https://arxiv.org/pdf/1907.12549' 
    """
    with open('ans5', 'rb') as fp:
        res = pickle.load(fp)
    assert type(res) == list
    assert len(res) > 100
    assert len(res) < 150
    assert type(res[0]) == str

    return True

In [17]:
self_check()

True