# Web Scraping

In [1]:
# Import necessary libraries

from time import sleep
from urllib.error import URLError
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Accumulator for the scraped drafts: each entry later becomes one dataframe
# (one dataframe for each draft)
draft_dfs_list = list()

# Collects any errors that may come up while scraping
errors_list = list()

### Loop through each year's combine page to collect combine stats and the link to each player's college stats

In [3]:
# Scrape the combine table for every draft year and append one list of
# per-player dicts (one dict per table row) to draft_dfs_list for each year.
url_temp = "https://www.pro-football-reference.com/draft/{year}-combine.htm"

for year in range(2005, 2023):
    url = url_temp.format(year=year)
    # The context manager closes the HTTP response as soon as the page is parsed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')

    player_dic = []
    # The first matched <tr> is dropped (presumably the column-header row).
    for row_tag in soup.select('#combine tr')[1:]:
        name_cell = row_tag.find('th', {'data-stat': 'player'})
        try:
            # Reverse the comma-separated parts of 'csk'
            # (presumably "Last,First" -> "First Last").
            name = " ".join(name_cell['csk'].split(',')[::-1])
        except (KeyError, TypeError):
            # KeyError: the cell has no 'csk' attribute.
            # TypeError: the row has no player cell at all (name_cell is None).
            # Store the cell's text rather than the bs4 Tag object itself, so
            # the 'player' column only ever holds strings (or None).
            name = name_cell.get_text() if name_cell is not None else None

        dic = {'player': name, 'year': year}
        # One entry per <td>, keyed by its 'data-stat' attribute.
        dic.update((cell['data-stat'], cell.get_text()) for cell in row_tag.find_all('td'))

        college_cell = row_tag.find('td', {'data-stat': 'college'})
        try:
            link = college_cell.find('a', href=True)['href']
        except (AttributeError, TypeError):
            # AttributeError: no 'college' cell; TypeError: the cell has no <a href>.
            link = ''
        dic['link'] = link
        player_dic.append(dic)
    draft_dfs_list.append(player_dic)

### Merge results into a single DataFrame

In [4]:
# Turn every scraped entry of draft_dfs_list into its own DataFrame
dflist = list(map(pd.DataFrame, draft_dfs_list))

In [5]:
# Stack the individual frames row-wise into one frame with a fresh integer index
drafts_df = pd.concat(objs=dflist, axis=0, ignore_index=True)

In [6]:
# Remove unnecessary column.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because 'college' has already been dropped.
drafts_df = drafts_df.drop(columns=['college'], errors='ignore')

### Loop through the newly created combine DataFrame to scrape each player's college stats page

The scraper goes through each player's page and finds the primary stats they have. Unfortunately, the secondary stats are not retrievable through web scraping as far as I can tell. It gets both a player's final season and entire college career statistics as dictionaries and then merges them together.

In [211]:
# Counter is initiated to see how many rows were run in case there was an error
# and it would be good to know where it left off
counter = 0
OL_list = ['G','OT','T','C','OL','K','P','OG','LS']

# Players that are skipped by name regardless of position.
SKIPPED_PLAYERS = (
    'Quinton Bell', 'Michael Woods II', 'Ellis Hobbs', 'Erasmus James',
    'Brian Calhoun', 'Walter Thurmond', 'Roy Helu', 'Louis Nix',
    'Stanley Williams', 'Dimitri Flowers', 'Keenen Brown', 'Emmanuel Butler',
)

# First row of drafts_df to process.
# NOTE(review): 6167 looks like a resume point left over from an interrupted
# run; use 0 to scrape every row on a fresh kernel -- confirm.
START_ROW = 6167

# Seconds to wait before each page request.
REQUEST_DELAY_SECONDS = 3.15

# How the primary stats table of a page is recognised, checked in this order.
# Each entry: (CSS selector applied to the first <tfoot> of the first <div>,
#              index of the probed cell, expected 'data-stat' of that cell,
#              id of the <div> the stats are then read from).
# NOTE(review): the kicking entry reads from id 'kicking' while every other
# entry uses an 'all_*' id -- confirm this is intended.
PRIMARY_TABLES = (
    ('td[class="right"]', 0, 'pass_cmp', 'all_passing'),
    ('td[class="right"]', 0, 'rush_att', 'all_rushing'),
    ('td[class="right"]', 0, 'rec', 'all_receiving'),
    ('td[class="right"]', 0, 'xpm', 'kicking'),
    ('td.right', 1, 'tackles_solo', 'all_defense'),
)

# Exceptions raised when an expected tag, row, cell or attribute is missing.
PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError)
FETCH_ERRORS = (URLError,) + PARSE_ERRORS


def _stats_with_suffix(cells, suffix):
    """Map each cell's 'data-stat' attribute plus ``suffix`` to the cell's text."""
    return {cell['data-stat'] + suffix: cell.get_text() for cell in cells}


def _find_primary_table(soup):
    """Return the <div> id of the first PRIMARY_TABLES entry matching the page, or None.

    Raises one of PARSE_ERRORS when the page has no <div>, no <tfoot>, or too
    few matching cells to probe.
    """
    footer = soup.find('div').find('tfoot')
    for selector, index, stat, div_id in PRIMARY_TABLES:
        if footer.select(selector)[index]['data-stat'] == stat:
            return div_id
    return None


def _parse_primary_stats(soup, div_id):
    """Return the '_season' and '_career' stats found inside the <div> with id ``div_id``.

    '_season' values come from the last <tr> of the <tbody>; '_career' values
    come from the <tfoot>. Career keys win if a key occurs in both.
    """
    table_div = soup.find('div', {'id': div_id})
    season_cells = table_div.find('tbody').find_all('tr')[-1].select('td[class="right"]')
    career_cells = table_div.find('tfoot').select('td[class="right"]')
    return _stats_with_suffix(season_cells, '_season') | _stats_with_suffix(career_cells, '_career')


# Zero-valued copy of the most recent successfully parsed stats, per table id.
# It is the fallback written for a player whose matched table cannot be parsed,
# so the loop no longer depends on temp_*_dict variables that are not defined
# anywhere in the notebook.
zero_templates = {}

# Punt-return tables are not scraped.
for row in range(START_ROW, len(drafts_df)):
    player_url = drafts_df.link[row]
    if drafts_df.pos[row] not in OL_list and drafts_df.player[row] not in SKIPPED_PLAYERS:
        if len(player_url) == 0:
            # No college link: skip the row without counting it.
            continue

        sleep(REQUEST_DELAY_SECONDS)
        try:
            # The context manager closes the HTTP response once the page is parsed.
            with urlopen(player_url) as html:
                soup = BeautifulSoup(html, 'html.parser')
            div_id = _find_primary_table(soup)
        except FETCH_ERRORS as err:
            # A dead link or an unexpected page layout is recorded instead of
            # aborting the whole run; the row is left without stats.
            errors_list.append({'row': row, 'link': player_url, 'error': repr(err)})
            div_id = None

        if div_id is not None:
            try:
                dic = _parse_primary_stats(soup, div_id)
                zero_templates[div_id] = dict.fromkeys(dic, 0)
            except PARSE_ERRORS as err:
                errors_list.append({'row': row, 'link': player_url, 'error': repr(err)})
                # Fall back to zeros; nothing is written if no page with this
                # table has been parsed successfully yet.
                dic = zero_templates.get(div_id, {})

            for key in dic:
                drafts_df.loc[row, key] = dic[key]
    counter += 1

### Save DataFrame to csv file

In [223]:
# Write the combined frame (index included) to the CSV file under ./Data
output_csv_path = './Data/scrapeddraft0522.csv'
drafts_df.to_csv(output_csv_path)

### Example of a player whose broken link could be fixed

In [165]:
# Replace the misspelled URL with the corrected one.
# The 'link' column is selected explicitly: assigning through the boolean mask
# alone would overwrite every column of the matching row with the URL string.
broken_link = 'https://www.sports-reference.com/cfb/players/jokobi-meyers-1.html'
fixed_link = 'https://www.sports-reference.com/cfb/players/jakobi-meyers-1.html'
drafts_df.loc[drafts_df.link == broken_link, 'link'] = fixed_link

In [210]:
# Display the rows whose college-stats link equals the URL below
lookup_link = 'https://www.sports-reference.com/cfb/players/michael-woods-5.html'
drafts_df.loc[drafts_df['link'] == lookup_link]

Unnamed: 0,player,year,pos,school_name,height,weight,forty_yd,vertical,bench_reps,broad_jump,cone,shuttle,draft_info,link,g_season,def_int_season,def_int_yds_season,def_int_yds_per_int_season,def_int_career,def_int_yds_career,def_int_yds_per_int_career,pass_cmp_season,pass_att_season,pass_cmp_pct_season,pass_yds_season,pass_yds_per_att_season,adj_pass_yds_per_att_season,pass_td_season,pass_int_season,pass_rating_season,pass_cmp_career,pass_att_career,pass_cmp_pct_career,pass_yds_career,pass_yds_per_att_career,adj_pass_yds_per_att_career,pass_td_career,pass_int_career,pass_rating_career,rush_att_season,rush_yds_season,rush_yds_per_att_season,rush_td_season,rec_season,rec_yds_season,rec_yds_per_rec_season,scrim_att_season,scrim_yds_season,scrim_yds_per_att_season,scrim_td_season,rush_att_career,rush_yds_career,rush_yds_per_att_career,rush_td_career,rec_career,rec_yds_career,rec_yds_per_rec_career,rec_td_career,scrim_att_career,scrim_yds_career,scrim_yds_per_att_career,scrim_td_career,fumbles_rec_yds_career,fumbles_rec_td_career,rec_td_season,def_int_td_season,def_int_td_career,fumbles_rec_career,fumbles_rec_yds_season,fumbles_rec_td_season,fumbles_rec_season,tackles_solo_season,tackles_assists_season,tackles_total_season,tackles_loss_season,sacks_season,pass_defended_season,tackles_solo_career,tackles_assists_career,tackles_total_career,tackles_loss_career,sacks_career,pass_defended_career,fumbles_forced_season,fumbles_forced_career
