## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import os

## Useful functions

In [2]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response content with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of handing an error document to
    # the parser and failing later with a confusing AttributeError.
    page.raise_for_status()

    return BeautifulSoup(page.content, 'html.parser')

In [3]:
def get_all_rows(soup):
    """Return the ``<tr>`` elements inside the body of the results table.

    Args:
        soup: Parsed page (anything exposing BeautifulSoup's ``find``).

    Returns:
        The list produced by ``find_all('tr')`` on the ``<tbody>`` of the
        element whose id is ``results``. An empty list is returned when the
        page has no such element or that element has no ``<tbody>``, so the
        caller's ``for`` loop simply does nothing instead of raising
        ``AttributeError`` on ``None``.
    """
    table = soup.find(id='results')
    if table is None:
        return []

    tbody = table.find('tbody')
    if tbody is None:
        return []

    return tbody.find_all('tr')

In [4]:
def scrapy_row(tr):
    """Extract one drafted player's fields from a results-table row.

    Cells are located by their ``data-stat`` attribute.

    Args:
        tr: A row element exposing ``find(name, attrs)``.

    Returns:
        ``None`` when the row has no ``year_id`` cell (presumably repeated
        header/separator rows -- confirm against the page markup). Otherwise
        a dict of strings with the keys ``year``, ``round``, ``pick``,
        ``player_name``, ``position``, ``age``, ``first_team_ap``,
        ``pro_bowls``, ``team`` and ``av``. A missing cell yields ``''``;
        an empty or missing ``career_av`` cell yields ``'0'``.
    """
    td_year = tr.find('td', {'data-stat': 'year_id'})
    # Check first: there is no point looking up the other cells of a row
    # that is going to be skipped.
    if td_year is None:
        return None

    def cell_text(stat):
        # Text of the cell tagged with ``stat``; '' when the cell is absent.
        td = tr.find('td', {'data-stat': stat})
        return '' if td is None else td.text

    # The year is read from the link inside the cell; fall back to the
    # cell's own text when the year is not wrapped in a link.
    year_link = td_year.find('a')
    data = {'year': (td_year if year_link is None else year_link).text}

    columns = (
        ('round', 'draft_round'),
        ('pick', 'draft_pick'),
        ('player_name', 'player'),
        ('position', 'pos'),
        ('age', 'draft_age'),
        ('first_team_ap', 'all_pros_first_team'),
        ('pro_bowls', 'pro_bowls'),
        ('team', 'team'),
    )
    for key, stat in columns:
        data[key] = cell_text(stat)

    # An empty approximate-value cell is recorded as zero.
    data['av'] = cell_text('career_av') or '0'

    return data

In [5]:
def get_next_url(soup):
    """Return the ``href`` of the "next page" button, or ``None``.

    The button is the first ``<a>`` matching class ``button2 next`` that
    carries an ``href`` attribute; ``None`` is returned when the page has no
    such link.
    """
    link = soup.find('a', {'class': 'button2 next'}, href=True)
    return None if link is None else link['href']

In [6]:
async def save_player(player, writer):
    """Coroutine that writes one player record through ``writer``.

    Prints a trace line, then forwards ``player`` to ``writer.writerow``.
    The body contains no ``await``, so the write itself runs synchronously
    once the coroutine is driven.

    Args:
        player: Row to write (whatever ``writer.writerow`` accepts).
        writer: Object exposing ``writerow``.
    """
    print('async writting')
    writer.writerow(player)

## Scraping

In [7]:
def get(first_year=1970, last_year=2019, target='drafted_players', delay=0):
    """Scrape the draft finder page by page into ``<target>.csv``.

    Starting from the draft-finder query for the given seasons, each page is
    downloaded, every row that ``scrapy_row`` can parse is appended to the
    CSV, and the "next" link is followed until there is none. One progress
    line (``Page N took Xs``) is printed per page.

    Args:
        first_year: Value sent as ``year_min``.
        last_year: Value sent as ``year_max``.
        target: Output path without the ``.csv`` suffix. An existing file is
            overwritten.
        delay: Seconds to sleep between consecutive page requests. The
            default of 0 requests pages back to back.

    Returns:
        None. The result is the CSV file written to disk.
    """
    # Local import so this cell does not depend on edits to the imports cell.
    from urllib.parse import urlencode, urljoin

    BASE_URL = 'https://www.pro-football-reference.com'

    positions = ['qb', 'rb', 'wr', 'te', 'e', 't', 'g', 'c', 'ol', 'dt',
                 'de', 'dl', 'ilb', 'olb', 'lb', 'cb', 's', 'db', 'k', 'p']
    # A list of pairs (not a dict) keeps the parameter order and allows the
    # repeated ``pos[]`` key; urlencode escapes it to ``pos%5B%5D``.
    query = (
        [('request', 1),
         ('year_min', first_year),
         ('year_max', last_year),
         ('draft_slot_min', 1),
         ('draft_slot_max', 500),
         ('pick_type', 'overall')]
        + [('pos[]', position) for position in positions]
        + [('conference', 'any'),
           ('show', 'all'),
           ('order_by', 'default')]
    )
    url = BASE_URL + '/play-index/draft-finder.cgi?' + urlencode(query)

    fieldnames = ['year', 'round', 'pick', 'player_name', 'position', 'age',
                  'first_team_ap', 'pro_bowls', 'team', 'av']
    page = 0

    # Mode 'w' truncates an existing file, so no explicit removal is needed.
    # The encoding is pinned so names with non-ASCII characters are written
    # the same way regardless of the platform's default encoding.
    with open(target + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        while url is not None:
            start = time.time()

            soup = get_soup(url)

            for tr in get_all_rows(soup):
                player = scrapy_row(tr)
                if player is not None:
                    writer.writerow(player)

            end = time.time()

            page += 1
            print('Page ' + str(page) + ' took ' + str(round(end - start)) + 's')

            next_href = get_next_url(soup)
            if next_href is None:
                break

            # Resolve against the current page so absolute, root-relative and
            # page-relative hrefs all produce a valid URL.
            url = urljoin(url, next_href)

            if delay > 0:
                time.sleep(delay)

In [8]:
# Run the full scrape with the default arguments and report the wall-clock
# time of the whole run.
start = time.time()
get()
end = time.time()

print('Overall time: {}s'.format(round(end - start)))

Page 1 took 4s
Page 2 took 4s
Page 3 took 4s
Page 4 took 3s
Page 5 took 4s
Page 6 took 8s
Page 7 took 8s
Page 8 took 8s
Page 9 took 8s
Page 10 took 13s
Page 11 took 9s
Page 12 took 8s
Page 13 took 12s
Page 14 took 9s
Page 15 took 8s
Page 16 took 8s
Page 17 took 8s
Page 18 took 7s
Page 19 took 7s
Page 20 took 9s
Page 21 took 8s
Page 22 took 4s
Page 23 took 4s
Page 24 took 4s
Page 25 took 3s
Page 26 took 3s
Page 27 took 3s
Page 28 took 3s
Page 29 took 4s
Page 30 took 4s
Page 31 took 3s
Page 32 took 3s
Page 33 took 3s
Page 34 took 3s
Page 35 took 4s
Page 36 took 3s
Page 37 took 3s
Page 38 took 4s
Page 39 took 5s
Page 40 took 4s
Page 41 took 3s
Page 42 took 3s
Page 43 took 3s
Page 44 took 4s
Page 45 took 4s
Page 46 took 4s
Page 47 took 4s
Page 48 took 3s
Page 49 took 3s
Page 50 took 3s
Page 51 took 5s
Overall time: 271s
