## Web Scraping

In [1]:
# Import necessary libraries

import pandas as pd
from time import sleep
from random import randint

from urllib.request import urlopen
from bs4 import BeautifulSoup

### Define function to extract the player data

In [None]:
def extract_player_data(table_rows):
    """
    Extract the cell text and the player links from the draft table rows.

    Parameters
    ----------
    table_rows : iterable
        Row elements (BeautifulSoup ``tr`` tags, or any object exposing the
        same ``find_all`` / ``get_text`` interface).

    Returns
    -------
    list of list of str
        One list per data row holding, in order: the text of every ``th``
        cell, the text of every ``td`` cell, the url of the link whose text
        matches the player's name, and the url of the "College Stats" link.
        Either url is an empty string when the row has no such link.
    """
    hof_suffix = " HOF"

    def strip_hof(text):
        # Some names end with ' HOF'; drop that suffix so the cell text and
        # the link text can be compared on the bare player name
        return text[:-len(hof_suffix)] if text.endswith(hof_suffix) else text

    # create the empty list to store the player data
    player_data = []

    for row in table_rows:
        data_cells = row.find_all("td")

        # Skip rows without any td cell: these are the empty rows and the
        # repeated column-header rows (presumably made of th cells only).
        # Testing the combined th + td text, as before, could never skip a
        # header row because its th cells made the list non-empty.
        if not data_cells:
            continue

        # th cells first (the leading row header), then the td cells
        player_list = [th.get_text() for th in row.find_all("th")]
        player_list.extend(td.get_text() for td in data_cells)

        # Map link text -> url so a link can be looked up by the player's
        # name or by the literal "College Stats" label
        links_dict = {strip_hof(link.get_text()): link["href"]
                      for link in row.find_all("a", href=True)}

        # The fourth cell is taken to be the player's name; a shorter row
        # simply gets no player link instead of raising IndexError
        if len(player_list) > 3:
            player_link = links_dict.get(strip_hof(player_list[3]), "")
        else:
            player_link = ""

        # add the pro-football-reference link, then the college stats link
        player_list.append(player_link)
        player_list.append(links_dict.get("College Stats", ""))

        # Now append the data to list of data
        player_data.append(player_list)

    return player_data

In [None]:
# Accumulators filled while scraping the draft pages:
#   draft_dfs_list -- one DataFrame per draft year
#   errors_list    -- any errors that come up while scraping
draft_dfs_list, errors_list = [], []

In [None]:
# The url template that the draft year is substituted into
url_template = "http://www.pro-football-reference.com/years/{year}/draft.htm"

# for each draft year from 2013 to (and including) 2022
for year in range(2013, 2023):

    # Build the url before the try block so that it is always bound when
    # the except handler records it
    url = url_template.format(year=year)

    # Use try/except block to catch and inspect any urls that cause an error
    try:
        # Download and parse the page; the with-block closes the HTTP
        # response even when parsing fails
        with urlopen(url) as html:
            soup = BeautifulSoup(html, "lxml")

        # The column headers are the th cells of the second tr on the page
        column_headers = [th.get_text() for th in
                          soup.find_all('tr', limit=2)[1].find_all('th')]
        column_headers.extend(["Player_NFL_Link", "Player_NCAA_Link"])

        # select the rows of the table with the '#drafts tr' CSS selector,
        # leaving out the first two rows
        table_rows = soup.select("#drafts tr")[2:]

        # extract the player data from the table rows
        player_data = extract_player_data(table_rows)

        # create the dataframe for the current year's draft
        year_df = pd.DataFrame(player_data, columns=column_headers)

        # add the year of the draft as the first column
        year_df.insert(0, "Draft_Yr", year)

        # append the current dataframe to the list of dataframes
        draft_dfs_list.append(year_df)

    except Exception as e:
        # Store the url and the error it caused so that the failure can be
        # inspected after the loop instead of aborting the whole scrape
        errors_list.append([url, e])

In [None]:
# Stack the per-year tables into a single DataFrame with a fresh 0..n-1 index
draft_df = pd.concat(objs=draft_dfs_list, axis=0, ignore_index=True)

In [None]:
# Select only the columns I need from the all-drafts DataFrame.
# The explicit .copy() makes drafts_df an independent frame, so the later
# drafts_df.loc[row, key] = ... assignments write to it directly instead of
# to a selection of draft_df (which raises SettingWithCopyWarning).
drafts_df = draft_df[[
    'Draft_Yr',
    'Rnd',
    'Pick',
    'Tm',
    'Player',
    'Pos',
    'Age',
    'To',
    'College/Univ',
    'Player_NFL_Link',
    'Player_NCAA_Link',
]].copy()

### Scraping each players college stats page

I removed all of the offensive linemen, as they don't have any relevant stats on Sports Reference. In addition, I removed punters and kickers, since none of them were drafted in the first round in my time period. I also had to remove two players who had blank pages despite not playing OL or special teams. The sleep timer of 5.15 seconds is very important, as Sports Reference will block anyone who attempts more than 20 requests in 1 minute. Anything over 3 seconds should work, but I thought I had it set to 3.75 and still got a 429 error, so I went up just to be safe.

Each player's stats are written into the `drafts_df` DataFrame as the loop goes, and each primary stat category needed for the data is handled separately.

In [38]:
# Counter shows how many rows were run, which is good to know if there was
# an error and the scrape has to be picked up where it left off
counter = 0

# Positions that are not scraped: offensive linemen plus kickers and punters
OL_list = ['G','OT','T','C','OL','K','P']

# Players skipped by name (their pages were blank)
_SKIPPED_PLAYERS = ('Quinton Bell', 'Michael Woods II')

# Pause before every request; the site blocks anyone who makes more than
# 20 requests in 1 minute
_REQUEST_DELAY_SECONDS = 5.15

# CSS selectors for the footer cells: exact class="right" vs. any td that
# has "right" among its classes
_EXACT_RIGHT = 'td[class="right"]'
_ANY_RIGHT = 'td.right'

# data-stat of the first footer cell -> id of the div holding that table.
# NOTE: a punt-return branch ('punt_ret' -> div id 'punt_ret') was
# commented out in the original and is left disabled here.
_TABLE_BY_FIRST_STAT = {
    'pass_cmp': 'all_passing',
    'rush_att': 'all_rushing',
    'rec': 'all_receiving',
    'xpm': 'kicking',
}
_DEFENSE_TABLE_ID = 'all_defense'

# Urls of fetched pages on which no known stats table could be identified
unparsed_urls = []

# Column names seen so far per table id; used to zero-fill a later player
# whose page names the category but lacks the table itself
_known_columns = {}


def _pick_stats_table(soup):
    """Return (table_id, selector) for the page's stats table, or None."""
    first_div = soup.find('div')
    footer = first_div.find('tfoot') if first_div is not None else None
    if footer is None:
        return None

    # Offense and kicking pages are recognised by their first footer cell
    exact_cells = footer.select(_EXACT_RIGHT)
    if exact_cells:
        table_id = _TABLE_BY_FIRST_STAT.get(exact_cells[0].get('data-stat'))
        if table_id is not None:
            return table_id, _EXACT_RIGHT

    # Defense pages are recognised by their second footer cell
    loose_cells = footer.select(_ANY_RIGHT)
    if len(loose_cells) > 1 and loose_cells[1].get('data-stat') == 'tackles_solo':
        return _DEFENSE_TABLE_ID, _ANY_RIGHT

    return None


def _read_footer_totals(soup, table_id, selector):
    """Return {data-stat: cell text} for the footer row of one stats table.

    Raises AttributeError when the div or its tfoot is missing and KeyError
    when a cell has no data-stat attribute.
    """
    cells = soup.find("div", {'id': table_id}).find('tfoot').select(selector)
    return {cell['data-stat']: cell.getText() for cell in cells}


for row in drafts_df.index:
    position = drafts_df.at[row, 'Pos']
    player = drafts_df.at[row, 'Player']
    player_url = drafts_df.at[row, 'Player_NCAA_Link']

    # Skipped positions and players still count as a row that was run
    if position in OL_list or player in _SKIPPED_PLAYERS:
        counter += 1
        continue

    # Rows without a college stats link are skipped and, as before, are
    # not counted
    if not player_url:
        continue

    sleep(_REQUEST_DELAY_SECONDS)
    # The with-block closes the HTTP response once the page is parsed
    with urlopen(player_url) as html:
        soup = BeautifulSoup(html, 'html.parser')

    table = _pick_stats_table(soup)
    if table is None:
        # Nothing recognisable on the page: record it and move on
        unparsed_urls.append(player_url)
    else:
        table_id, selector = table
        try:
            totals = _read_footer_totals(soup, table_id, selector)
        except (AttributeError, KeyError):
            # Table missing or malformed: fill the columns already known
            # for this category with 0 (nothing to fill if none are known)
            totals = dict.fromkeys(_known_columns.get(table_id, ()), 0)
        else:
            _known_columns[table_id] = list(totals)

        # Write each stat into the player's row; new columns are created
        # on first use
        for stat, value in totals.items():
            drafts_df.loc[row, stat] = value

    counter += 1

In [25]:
# Check counter to see how many rows the scraping loop got through
# (rows without a college stats link are skipped without being counted)
counter

0

### Save the scraped data to a parquet file for later use in other notebooks

In [39]:
# Write the scraped table to a parquet file under ./Data
drafts_df.to_parquet(path='./Data/scrapeddraft1322')

In [40]:
# Remove the display limit on both columns and rows, then show the table
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)
drafts_df

Unnamed: 0,Draft_Yr,Rnd,Pick,Tm,Player,Pos,Age,To,College/Univ,Player_NFL_Link,Player_NCAA_Link,tackles_solo,tackles_assists,tackles_total,tackles_loss,sacks,def_int,def_int_yds,def_int_yds_per_int,def_int_td,pass_defended,fumbles_rec,fumbles_rec_yds,fumbles_rec_td,fumbles_forced,rec,rec_yds,rec_yds_per_rec,rec_td,rush_att,rush_yds,rush_yds_per_att,rush_td,scrim_att,scrim_yds,scrim_yds_per_att,scrim_td,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_yds_per_att,adj_pass_yds_per_att,pass_td,pass_int,pass_rating,g
0,2013,1,1,KAN,Eric Fisher,T,22.0,2021.0,Central Michigan,/players/F/FishEr00.htm,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2013,1,2,JAX,Luke Joeckel,T,21.0,2017.0,Texas A&M,/players/J/JoecLu00.htm,http://www.sports-reference.com/cfb/players/lu...,1.0,0.0,1.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,
2,2013,1,3,MIA,Dion Jordan,DE,23.0,2020.0,Oregon,/players/J/JordDi00.htm,http://www.sports-reference.com/cfb/players/di...,73.0,48.0,121.0,29.0,14.5,,,,,2.0,0.0,0.0,0.0,4.0,,,,,,,,,,,,,,,,,,,,,,
3,2013,1,4,PHI,Lane Johnson,T,23.0,2022.0,Oklahoma,/players/J/JohnLa01.htm,http://www.sports-reference.com/cfb/players/la...,1.0,0.0,1.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,
4,2013,1,5,DET,Ezekiel Ansah,DE,24.0,2020.0,BYU,/players/A/AnsaEz00.htm,http://www.sports-reference.com/cfb/players/ez...,39.0,33.0,72.0,13.0,4.5,1.0,-2.0,-2.0,0.0,1.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,
5,2013,1,6,CLE,Barkevious Mingo,LB,22.0,2020.0,LSU,/players/M/MingBa00.htm,http://www.sports-reference.com/cfb/players/ba...,60.0,59.0,119.0,29.0,15.0,0.0,0.0,,0.0,11.0,1.0,0.0,0.0,4.0,,,,,,,,,,,,,,,,,,,,,,
6,2013,1,7,ARI,Jonathan Cooper,G,23.0,2018.0,North Carolina,/players/C/CoopJo01.htm,http://www.sports-reference.com/cfb/players/jo...,2.0,0.0,2.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,
7,2013,1,8,STL,Tavon Austin,WR,23.0,2021.0,West Virginia,/players/A/AustTa00.htm,http://www.sports-reference.com/cfb/players/ta...,,,,,,,,,,,,,,,288.0,3413.0,11.9,29.0,110.0,1033.0,9.4,6.0,398.0,4446.0,11.2,35.0,,,,,,,,,,
8,2013,1,9,NYJ,Dee Milliner,DB,21.0,2015.0,Alabama,/players/M/MillDe00.htm,http://www.sports-reference.com/cfb/players/de...,55.0,26.0,81.0,5.0,0.0,6.0,107.0,17.8,1.0,17.0,0.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,
9,2013,1,10,TEN,Chance Warmack,G,21.0,2018.0,Alabama,/players/W/WarmCh00.htm,http://www.sports-reference.com/cfb/players/ch...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
