# Scraping the main table of Running Backs who play(ed) in the NFL:

**Current Search:** NFL Combine Results, from 2000 to 2020, player played RB, athlete played in the NFL, ordered by Combine Year, descending.

In [1]:
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
url_1 = 'https://www.pro-football-reference.com/play-index/nfl-combine-results.cgi?request=1&year_min=2014&year_max=2020&height_min=65&height_max=82&weight_min=140&weight_max=400&pos%5B%5D=RB&show=p&order_by=year_id'
url_2 = 'https://www.pro-football-reference.com/play-index/nfl-combine-results.cgi?request=1&year_min=2007&year_max=2013&height_min=65&height_max=82&weight_min=140&weight_max=400&pos%5B%5D=RB&show=p&order_by=year_id'
url_3 = 'https://www.pro-football-reference.com/play-index/nfl-combine-results.cgi?request=1&year_min=2000&year_max=2006&height_min=65&height_max=82&weight_min=140&weight_max=400&pos%5B%5D=RB&show=p&order_by=year_id'

def get_soup_obj(url):
    '''
    This function takes in a url and transforms it into a BeautifulSoup object to facilitate scraping information from an html source.
    '''
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,'lxml')
    
    return soup

In [4]:
rb_soup_p1 = get_soup_obj(url_1)
rb_soup_p2 = get_soup_obj(url_2)
rb_soup_p3 = get_soup_obj(url_3)

In [5]:
def make_table(soup_object):
    '''
    This function takes in a soup object and finds the first table on that webpage. 
    
    '''
    table = soup_object.find('table')
    return table

In [6]:
table_1 = make_table(rb_soup_p1)
table_2 = make_table(rb_soup_p2)
table_3 = make_table(rb_soup_p3)

In [7]:
def get_list_of_rows(table):
    '''
    This function takes a table as an input and returns all rows within the table whose class is 'None' (excludes all header rows).
    '''
    rows = rows = table.tbody.findAll('tr')
    rows = table.findAll('tr',attrs={'class':None})
    return rows

In [8]:
rows_1 = get_list_of_rows(table_1)
rows_2 = get_list_of_rows(table_2)
rows_3 = get_list_of_rows(table_3)

In [9]:
def rb_dictionary(rows):
    '''
    This function takes in specified rows of a table and returns a dictionary containing:
    - Player Name
    - Combine Stats
    - Height
    - Weight
    - Link to College Stats
    - Link to NFL Stats
    '''
    # empty dictionary:
    rb_info = {}

    for row in rows[1:]:
        
        # items = all cells in a row
        items = row.find_all('td')
        
        # college link = 6th item in row
        link = items[6].find('a')
        # some players don't have a link for their college stats:
        if link:
            url = link['href']
        else:
            url = None
        
        # nfl link = 1st item in row
        nfl_stats = items[1].find('a')
        # some players don't have a link for NFL stats
        if link:
            nfl_url = nfl_stats['href']
        else:
            nfl_url = None
        
        # Player's name is 1st item in row
        player = items[1].text
        
        # Create a dictionary object for each player and their corresponding stats and links:
        rb_info[player] = [url] + [nfl_url] + [i.text for i in items]
        
    return rb_info

In [10]:
rb_dict_1 = rb_dictionary(rows_1)
rb_dict_2 = rb_dictionary(rows_2)
rb_dict_3 = rb_dictionary(rows_3)

In [11]:
def make_df(rb_dict):
    '''
    Function that transforms a dictionary into a Pandas DataFrame
    '''
    rb_df = pd.DataFrame(rb_dict).T
    rb_df.columns = ['link','nfl_stats','year_drafted','name','position','age','av','school','stats','height','weight','40yd','vert','bench_reps','broad_jump','3cone','shuttle','Drafted(tm/rnd/yr)']
    
    return rb_df

In [12]:
rb_df_1 = make_df(rb_dict_1)
rb_df_2 = make_df(rb_dict_2)
rb_df_3 = make_df(rb_dict_3)

In [13]:
# merge all dataframes into one dataframe:
rb_df = pd.concat([rb_df_1,rb_df_2,rb_df_3])
rb_df.head()

Unnamed: 0,link,nfl_stats,year_drafted,name,position,age,av,school,stats,height,weight,40yd,vert,bench_reps,broad_jump,3cone,shuttle,Drafted(tm/rnd/yr)
Trayveon Williams,https://www.sports-reference.com/cfb/players/t...,/players/W/WillTr06.htm,2019,Trayveon Williams,RB,21,0,Texas A&M,College Stats,5-8,206,4.51,33.0,19,121,7.44,4.44,Cincinnati Bengals / 6th / 182nd pick / 2019
Dexter Williams,https://www.sports-reference.com/cfb/players/d...,/players/W/WillDe07.htm,2019,Dexter Williams,RB,22,0,Notre Dame,College Stats,5-11,212,4.57,36.0,17,130,7.0,4.16,Green Bay Packers / 6th / 194th pick / 2019
Devin Singletary,https://www.sports-reference.com/cfb/players/d...,/players/S/SingDe00.htm,2019,Devin Singletary,RB,21,6,Florida Atlantic,College Stats,5-7,203,4.66,35.0,15,117,7.32,4.4,Buffalo Bills / 3rd / 74th pick / 2019
Jordan Scarlett,https://www.sports-reference.com/cfb/players/j...,/players/S/ScarJo00.htm,2019,Jordan Scarlett,RB,23,0,Florida,College Stats,5-11,208,4.47,30.0,21,116,7.37,4.63,Carolina Panthers / 5th / 154th pick / 2019
Miles Sanders,https://www.sports-reference.com/cfb/players/m...,/players/S/SandMi01.htm,2019,Miles Sanders,RB,21,9,Penn State,College Stats,5-11,211,4.49,36.0,20,124,6.89,4.19,Philadelphia Eagles / 2nd / 53rd pick / 2019


In [14]:
rb_df.shape

(423, 18)

In [15]:
rb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 423 entries, Trayveon Williams to Shaun Alexander
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   link                395 non-null    object
 1   nfl_stats           395 non-null    object
 2   year_drafted        423 non-null    object
 3   name                423 non-null    object
 4   position            423 non-null    object
 5   age                 423 non-null    object
 6   av                  423 non-null    object
 7   school              423 non-null    object
 8   stats               423 non-null    object
 9   height              423 non-null    object
 10  weight              423 non-null    object
 11  40yd                423 non-null    object
 12  vert                423 non-null    object
 13  bench_reps          423 non-null    object
 14  broad_jump          423 non-null    object
 15  3cone               423 non-null    object
 16  shu

In [16]:
rb_df.describe()

Unnamed: 0,link,nfl_stats,year_drafted,name,position,age,av,school,stats,height,weight,40yd,vert,bench_reps,broad_jump,3cone,shuttle,Drafted(tm/rnd/yr)
count,395,395,423,423,423,423,423,423,423,423,423,423.0,423.0,423.0,423.0,423.0,423.0,423.0
unique,395,395,20,422,1,6,74,113,2,11,69,52.0,33.0,25.0,33.0,82.0,61.0,320.0
top,https://www.sports-reference.com/cfb/players/d...,/players/B/BellTa00.htm,2011,Adrian Peterson,RB,22,0,LSU,College Stats,5-11,220,4.56,,,,,,
freq,1,1,30,2,423,170,87,14,395,95,19,22.0,76.0,90.0,79.0,181.0,176.0,104.0


The name 'Adrian Peterson' appears twice:

In [17]:
ap = rb_df[rb_df['name']=='Adrian Peterson']
ap

Unnamed: 0,link,nfl_stats,year_drafted,name,position,age,av,school,stats,height,weight,40yd,vert,bench_reps,broad_jump,3cone,shuttle,Drafted(tm/rnd/yr)
Adrian Peterson,https://www.sports-reference.com/cfb/players/a...,/players/P/PeteAd01.htm,2007,Adrian Peterson,RB,21,121,Oklahoma,College Stats,6-1,217,4.4,38.5,,127,7.09,4.4,Minnesota Vikings / 1st / 7th pick / 2007
Adrian Peterson,,,2002,Adrian Peterson,RB,22,12,Georgia Southern,,5-10,214,4.68,34.0,14.0,119,7.59,4.6,Chicago Bears / 6th / 199th pick / 2002


## Drop all rows that do not have a link to their college stats ('link')
By dropping all rows that do not have a link to their nfl stats (which I will be using for the target), this will take care of many of the NaN values as well as the duplicate of the name 'Adrian Peterson'.

In [18]:
#drops all rows that do not have a link for nfl_stats:
rb_df = rb_df[~rb_df['nfl_stats'].isin([None])]
rb_df.shape

(395, 18)

## Add number of yards run in rookie season:
Use the nfl_stats link to extract this value

In [19]:
def get_nfl_stats(nfl_stats):
    '''
    This function takes the nfl_stats (path part of the url), 
    adds it to the domain portion of the url (base), 
    creates a BeautifulSoup object from each url, 
    and returns the player's name and total number of scrimmage yards their rookie season.
    '''
    # add the path portion (nfl_stats) of the url (player specific) to the base url
    base_url = 'https://www.pro-football-reference.com'
    url = nfl_stats
    full_url = base_url + url

    # create a beautifulsoup object for each url:
    response = requests.get(full_url)
    page = response.text
    soup = BeautifulSoup(page,'lxml')
    
    #specify headers to be used in dataframe:
    headers = ['name','rookie_scrim_yds']
    
    # get player's name:
    name = soup.find('title').text
    name = name.split(' Stats')[0]
    name = name.strip(' ')
    
    # use beautifulsoup to find Rushing and Receiving table
    table = soup.find(lambda tag:
                      tag.name=='table'
                      and tag.has_attr('id')
                      and tag['id']=="rushing_and_receiving")
    # not every player has this table:
    if not table:
        name = name
        # since the player doesn't have this table, they did not play running back in the NFL:
        rookie_total = None
        rook = {'name':name, 'rookie_scrim_yds':rookie_total}
        return rook

    # get the row which represents the player's rookie year (rows0&1=headers)
    rookie_row = table.find_all('tr')[2]
    # from rookie_row, find the cell representing the total yards from scrimmage:
    rookie_total_yards = rookie_row.find(lambda tag:
                                         tag.name=='td'
                                         and tag.has_attr('data-stat')
                                         and tag['data-stat']=='yds_from_scrimmage')
    # retrieve the text from the yds_from_scrimmage cell
    rookie_total = rookie_total_yards.text
    
    # return a dictionary with the player's name and total yards of scrimmage rookie year:
    rook = [name,rookie_total]
    nfl_stats_dict = dict(zip(headers, [name, rookie_total]))
    
    return nfl_stats_dict

In [20]:
# create an empty list and append the name and rookie_scrim_yds for each player using get_nfl_stats function:
nfl_stats_list = []
for link in rb_df.nfl_stats:
    nfl_stats_list.append(get_nfl_stats(link))

In [21]:
# transform nfl_stats_list into a Pandas DataFrame:
nfl_stats_info = pd.DataFrame(nfl_stats_list)
nfl_stats_info.set_index('name', inplace=True)
nfl_stats_info

Unnamed: 0_level_0,rookie_scrim_yds
name,Unnamed: 1_level_1
Trayveon Williams,
Dexter Williams,11
Devin Singletary,969
Jordan Scarlett,9
Miles Sanders,1327
...,...
Reuben Droughns,93
Ron Dayne,781
Doug Chapman,330
Trung Canidate,10


In [22]:
# merge this DataFrame with the main dataframe:
rb_df = rb_df.merge(nfl_stats_info, left_index=True, right_index=True)

In [23]:
rb_df.head()

Unnamed: 0,link,nfl_stats,year_drafted,name,position,age,av,school,stats,height,weight,40yd,vert,bench_reps,broad_jump,3cone,shuttle,Drafted(tm/rnd/yr),rookie_scrim_yds
Trayveon Williams,https://www.sports-reference.com/cfb/players/t...,/players/W/WillTr06.htm,2019,Trayveon Williams,RB,21,0,Texas A&M,College Stats,5-8,206,4.51,33.0,19,121,7.44,4.44,Cincinnati Bengals / 6th / 182nd pick / 2019,
Dexter Williams,https://www.sports-reference.com/cfb/players/d...,/players/W/WillDe07.htm,2019,Dexter Williams,RB,22,0,Notre Dame,College Stats,5-11,212,4.57,36.0,17,130,7.0,4.16,Green Bay Packers / 6th / 194th pick / 2019,11.0
Devin Singletary,https://www.sports-reference.com/cfb/players/d...,/players/S/SingDe00.htm,2019,Devin Singletary,RB,21,6,Florida Atlantic,College Stats,5-7,203,4.66,35.0,15,117,7.32,4.4,Buffalo Bills / 3rd / 74th pick / 2019,969.0
Jordan Scarlett,https://www.sports-reference.com/cfb/players/j...,/players/S/ScarJo00.htm,2019,Jordan Scarlett,RB,23,0,Florida,College Stats,5-11,208,4.47,30.0,21,116,7.37,4.63,Carolina Panthers / 5th / 154th pick / 2019,9.0
Miles Sanders,https://www.sports-reference.com/cfb/players/m...,/players/S/SandMi01.htm,2019,Miles Sanders,RB,21,9,Penn State,College Stats,5-11,211,4.49,36.0,20,124,6.89,4.19,Philadelphia Eagles / 2nd / 53rd pick / 2019,1327.0


## Data Cleaning:
- 

In [24]:
rb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 390 entries, Trayveon Williams to Shaun Alexander
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   link                390 non-null    object
 1   nfl_stats           390 non-null    object
 2   year_drafted        390 non-null    object
 3   name                390 non-null    object
 4   position            390 non-null    object
 5   age                 390 non-null    object
 6   av                  390 non-null    object
 7   school              390 non-null    object
 8   stats               390 non-null    object
 9   height              390 non-null    object
 10  weight              390 non-null    object
 11  40yd                390 non-null    object
 12  vert                390 non-null    object
 13  bench_reps          390 non-null    object
 14  broad_jump          390 non-null    object
 15  3cone               390 non-null    object
 16  shu

In [25]:
def parse_height(height):
    '''
    This function takes in a height (in the format Feet-Inches) and returns the height as just inches.
    '''
    ht = height.split('-')
    feet = float(ht[0])
    inches = float(ht[1])
    return ((12*feet) + inches)

In [26]:
# apply the parse_height function to the height of each player:
rb_df['height'] = rb_df['height'].apply(lambda x:parse_height(x))

In [27]:
rb_df['height'] = rb_df['height'].astype(float)

In [28]:
# some players in the NFL were undrafted. For these players, change the Drafted(tm/rnd/yr) to represent not drafted:
rb_df['Drafted(tm/rnd/yr)'] = rb_df['Drafted(tm/rnd/yr)'].replace('','Team / 0th Round / 0th pick / xxxx')

In [29]:
def parse_draft(drafted):
    '''
    Takes the Drafted(tm/rnd/yr) value for each player and returns just the pick number.
    '''
    splitted = drafted.split('/')[2]
    pick = splitted.strip()
    pick = re.sub("\D","",pick)
    return pick
def parse_round(drafted):
    '''
    Takes the Drafted(tm/rnd/yr) value for each player and returns just the round number.
    '''
    splitted = drafted.split('/')[1]
    rnd = splitted.strip()
    rnd = re.sub("\D","",rnd)
    return rnd

In [30]:
# apply the functions above to the Draftedd(tm/rnd/yr) value for each player:
rb_df['pick_number'] = rb_df['Drafted(tm/rnd/yr)'].apply(lambda x:parse_draft(x))
rb_df['round_number'] = rb_df['Drafted(tm/rnd/yr)'].apply(lambda x:parse_round(x))
rb_df.head()

Unnamed: 0,link,nfl_stats,year_drafted,name,position,age,av,school,stats,height,...,40yd,vert,bench_reps,broad_jump,3cone,shuttle,Drafted(tm/rnd/yr),rookie_scrim_yds,pick_number,round_number
Trayveon Williams,https://www.sports-reference.com/cfb/players/t...,/players/W/WillTr06.htm,2019,Trayveon Williams,RB,21,0,Texas A&M,College Stats,68.0,...,4.51,33.0,19,121,7.44,4.44,Cincinnati Bengals / 6th / 182nd pick / 2019,,182,6
Dexter Williams,https://www.sports-reference.com/cfb/players/d...,/players/W/WillDe07.htm,2019,Dexter Williams,RB,22,0,Notre Dame,College Stats,71.0,...,4.57,36.0,17,130,7.0,4.16,Green Bay Packers / 6th / 194th pick / 2019,11.0,194,6
Devin Singletary,https://www.sports-reference.com/cfb/players/d...,/players/S/SingDe00.htm,2019,Devin Singletary,RB,21,6,Florida Atlantic,College Stats,67.0,...,4.66,35.0,15,117,7.32,4.4,Buffalo Bills / 3rd / 74th pick / 2019,969.0,74,3
Jordan Scarlett,https://www.sports-reference.com/cfb/players/j...,/players/S/ScarJo00.htm,2019,Jordan Scarlett,RB,23,0,Florida,College Stats,71.0,...,4.47,30.0,21,116,7.37,4.63,Carolina Panthers / 5th / 154th pick / 2019,9.0,154,5
Miles Sanders,https://www.sports-reference.com/cfb/players/m...,/players/S/SandMi01.htm,2019,Miles Sanders,RB,21,9,Penn State,College Stats,71.0,...,4.49,36.0,20,124,6.89,4.19,Philadelphia Eagles / 2nd / 53rd pick / 2019,1327.0,53,2


In [31]:
rb_df['pick_number'] = rb_df['pick_number'].astype(int)

## Export this DataFrame as .csv to be used on other jupyter notebooks:

In [32]:
rb_df.to_csv(r'main_and_nfl.csv',index=True, header=True, index_label=False)