In [1]:
import requests
import bs4
import numpy as np
import pandas as pd
import datetime

# Putting '..' on sys.path because Player import was causing an error when scraper.py is imported from
# another module (such as 5_seasons_50_carries.py).
import sys
import os

# os.path.split() splits the head and tail of the path for the file.
# This line of code grabs the head, joins it with '..', and inserts the path into the first element of sys.path.
sys.path.insert(0, '..')

import pro_football_ref_scraper as pfbr
from player import Player

In [2]:
pfbr_fantasy_table = {
    'name': str,
    'fantasy_points': int
}


def get_fantasy_table(year):
    str_year = str(year)
    url = 'https://www.pro-football-reference.com/years/' + str_year + '/fantasy.htm'
    table_id = 'fantasy'
    player_list = scrape_table(url, table_id)
    list_of_player_dicts = create_player_objects(player_list, pfbr_fantasy_table)
    fantasy_df = make_data_frame(list_of_player_dicts, year)

    return fantasy_df


def scrape_table(url, table_id):
    """
    Scrape a table from pro-football-reference.com based on provided table ID.

    :param url: Websites URL.
    :param table_id: Identifier for the table. Found when used "inspect element" on web page.

    :return: List of BeautifulSoup4 element ResultSets. Each item in list is a row in the table.
    """
    # Send a GET request to Pro Football Reference's Rushing & Receiving page to gather the data.
    r = requests.get(url)
    r.raise_for_status()

    # Create a BeautifulSoup object.
    soup = bs4.BeautifulSoup(r.text, 'lxml')

    # Find the first table with tag 'table' and id 'rushing_and_receiving.
    table = soup.find('table', id=table_id)

    # tbody is the table's body
    # Get the body of the table
    body = table.find('tbody')

    # tr refers to a table row
    # Each element in player_list has data for a single player.
    # This will also collect descriptions of each column found in the web page's table, which
    # is filtered out in create_player_objects().
    player_list = body.find_all('tr')

    return player_list

# name = 0
# fp = 20

def create_player_objects(player_list, header):
    """
    Create an object for each player using the player_list created by scrape_data().

    :param player_list: List of BeautifulSoup4 element ResultSets. Each item in list is a row in the table.
    :param header: Dictionary where keys are the name of the stat and values are the data type.

    :return: List of dictionary representations of Player objects (object.__dict__).
    """
    # This list holds a dictionary of each object's attributes.
    # Each dictionary is made from the object's __dict__ attribute.
    list_of_player_dicts = []

    # Get each player's stats, create a Player object, and append the object
    # and the instance's __dict__ to their own list.
    for player in player_list:
        # The <td> tag defines a standard cell in an HTML table.
        # Get a list of cells. This raw web page data represents one player.
        raw_stat_list = player.find_all('td')

        # If info_list has data, then we will extract the desired information from the elements.
        # info_list will be empty if the current 'player' in the player_list is actually other
        # irrelevant information we're not interested in (such as a column description).
        if raw_stat_list:
            player_stats = get_player_stats(raw_stat_list)
            # Create a Player object and append the __dict__ attribute to a list.
            # This list is used for the data in our data frame.
            obj = Player(player_stats, header)
            list_of_player_dicts.append(obj.__dict__)

    return list_of_player_dicts


def get_player_stats(raw_stat_list):
    """
    Get text data from from a BeautifulSoup4 element tag. Also gets a URL to the player's personal career stat
    page. Used in create_player_objects().

    :param raw_stat_list: List of BeautifulSoup4 element ResultSets. Inside of each ResultSet is a stat.

    :return: List of the player's stats in text form.
    """
    name = raw_stat_list[0].text
    fantasy_points = raw_stat_list[20].text

    return [name, fantasy_points]


def make_data_frame(player_dict_list, year):
    """
    Create a new data frame and return it.

    :param player_dict_list: List of unique Player.__dict__'s.
    :param year: NFL season's year.
    :param fantasy: When true, add a column for player's total fantasy points for the season.

    :return: Data frame of stats.
    """
    df_columns = list(pfbr_fantasy_table.keys())  # Get header dict's keys for df's column names.
    df = pd.DataFrame(data=player_dict_list, columns=df_columns)  # Create the data frame.
    df['year'] = year  # Add a 'year' column.
    df.set_index('name', inplace=True)  # Make 'name' the data frame's index

    for stat in df_columns[5:]:
        df[stat].fillna(0, inplace=True)  # Fill missing stats with 0.

    return df


In [3]:
fb_ref = pfbr.ProFbRefScraper()
rush_rec_df = fb_ref.get_rushing_receiving_data(2017, fantasy=True)
fantasy_df = get_fantasy_table(2017)

In [4]:
rush_rec_df.head()

Unnamed: 0_level_0,url,team,age,position,games_played,games_started,rush_attempts,rush_yards,rush_td,longest_run,...,catch_percentage,scrimmage_yards,rush_rec_td,fumbles,year,fumbles_lost,two_pt_conversions,return_yards,return_td,fantasy_points
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Jones,/players/J/JoneAa00.htm,GNB,23,rb,12,4,81,448,4,46,...,50.0,470,4,0,2017,0.0,0.0,0.0,0.0,71.0
Aaron Ripkowski,/players/R/RipkAa00.htm,GNB,25,fb,16,2,5,13,0,4,...,70.0,52,0,0,2017,0.0,0.0,0.0,0.0,5.2
Aaron Rodgers,/players/R/RodgAa00.htm,GNB,34,qb,7,7,24,126,0,18,...,0.0,126,0,1,2017,1.0,0.0,0.0,0.0,10.6
Adam Humphries,/players/H/HumpAd00.htm,TAM,24,wr,16,3,1,6,0,6,...,73.5,637,1,1,2017,1.0,0.0,49.0,0.0,69.66
Adam Thielen,/players/T/ThieAd00.htm,MIN,27,WR,16,16,1,11,0,11,...,64.1,1287,4,3,2017,2.0,0.0,0.0,0.0,148.7


In [5]:
fantasy_df.head()

Unnamed: 0_level_0,fantasy_points,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Todd Gurley,319.0,2017
Le'Veon Bell,259.0,2017
Kareem Hunt,242.0,2017
Alvin Kamara,233.0,2017
DeAndre Hopkins,216.0,2017


In [6]:
rush_rec_names = list(rush_rec_df.index)

In [7]:
rush_rec_names

['Aaron Jones',
 'Aaron Ripkowski',
 'Aaron Rodgers',
 'Adam Humphries',
 'Adam Thielen',
 "Adoree' Jackson",
 'Adrian Peterson',
 'Akeem Hunt',
 'Albert Wilson',
 'Alex Collins',
 'Alex Erickson',
 'Alex Smith',
 'Alfred Blue',
 'Alfred Morris',
 'Alvin Kamara',
 'Amari Cooper',
 'Ameer Abdullah',
 'Andre Ellington',
 'Andre Williams',
 'Andy Dalton',
 'Andy Janovich',
 'Anthony Sherman',
 'ArDarius Stewart',
 'Austin Davis',
 'Austin Ekeler',
 'Ben Roethlisberger',
 'Benny Cunningham',
 'Bernard Reedy',
 'Bilal Powell',
 'Blaine Gabbert',
 'Blake Bortles',
 'Bobby Rainey',
 'Branden Oliver',
 'Brandin Cooks',
 'Brandon Bolden',
 'Braxton Miller',
 'Brett Hundley',
 'Brian Hill',
 'Brian Hoyer',
 'Brock Osweiler',
 'Bronson Hill',
 'Bruce Ellington',
 'Bryce Petty',
 'Bryce Treggs',
 'Byron Marshall',
 'C.J. Anderson',
 'C.J. Beathard',
 'C.J. Ham',
 'C.J. Prosise',
 'C.J. Spiller',
 'Cam Newton',
 'Cameron Artis-Payne',
 'Carlos Hyde',
 'Carson Palmer',
 'Carson Wentz',
 'Case Keenum

In [8]:
fantasy_table_names = list(fantasy_df.index)

In [9]:
fantasy_df.loc['Todd Gurley']

fantasy_points     319.0
year              2017.0
Name: Todd Gurley, dtype: float64

In [10]:
fantasy_df.loc['Todd Gurley']['fantasy_points']

319.0

In [11]:
for name in rush_rec_names:
    if name in fantasy_table_names:
        if name == 'Chris Thompson':
            continue
#         if isinstance(rush_rec_df.loc[name]['position'], float):
#             print(name)
        print(type(rush_rec_df.loc[name]['position']))
#         if rush_rec_df.loc[name]['position'].lower() == 'qb':
#             continue
#         rush_rec_points = np.float64(round(rush_rec_df.loc[name]['fantasy_points']))
#         fbref_points = np.float64(fantasy_df.loc[name]['fantasy_points'])
#         if rush_rec_points != fbref_points:
#             print(name)
        
        
#         if not isinstance(fantasy_df.loc[name]['fantasy_points'], np.float64):
#             print(name, type(fantasy_df.loc[name]))
#         if round(rush_rec_df.loc[name]['fantasy_points']) != fantasy_df.loc[name]['fantasy_points']:
#             print(name)

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'

In [12]:
round(rush_rec_df.loc['Todd Gurley']['fantasy_points']) == fantasy_df.loc['Todd Gurley']['fantasy_points']

True

In [13]:
np.float64(1)

1.0

In [14]:
np.float64

numpy.float64

In [15]:
num = np.float64(1.2)

In [16]:
isinstance(type(num), np.float64)

False

In [17]:
fantasy_df.loc['Chris Thompson']

Unnamed: 0_level_0,fantasy_points,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chris Thompson,112.0,2017
Chris Thompson,8.0,2017


In [18]:
rush_rec_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 317 entries, Aaron Jones to Zach Zenner
Data columns (total 31 columns):
url                   317 non-null object
team                  317 non-null object
age                   317 non-null int64
position              220 non-null object
games_played          317 non-null int64
games_started         317 non-null int64
rush_attempts         317 non-null int64
rush_yards            317 non-null int64
rush_td               317 non-null int64
longest_run           317 non-null int64
yards_per_rush        317 non-null float64
yards_per_game        317 non-null float64
attempts_per_game     317 non-null float64
targets               317 non-null float64
receptions            317 non-null float64
rec_yards             317 non-null float64
yards_per_rec         317 non-null float64
rec_td                317 non-null float64
longest_rec           317 non-null float64
rec_per_game          317 non-null float64
rec_yards_per_game    317 non-null floa

In [19]:
for i, name in enumerate(rush_rec_names):
    if i > 0 and rush_rec_names[i-1] == name:
        print(name)
#     if i > 0 and i < len(rush_rec_names):
#         if rush_rec_names[i-1] == name or rush_rec_names[i+1] == name:

Chris Thompson


In [20]:
for i in range(len(rush_rec_df)):
    print(type(rush_rec_df.iloc[0]['url']))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

In [21]:
for i in range(len(rush_rec_df)):
    print(type(rush_rec_df.iloc[0]['position']))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

In [22]:
for i in range(len(rush_rec_df)):
    print(type(rush_rec_df.iloc[0]['position']))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

In [23]:
rush_rec_df.iloc[0]

url                   /players/J/JoneAa00.htm
team                                      GNB
age                                        23
position                                   rb
games_played                               12
games_started                               4
rush_attempts                              81
rush_yards                                448
rush_td                                     4
longest_run                                46
yards_per_rush                            5.5
yards_per_game                           37.3
attempts_per_game                         6.8
targets                                    18
receptions                                  9
rec_yards                                  22
yards_per_rec                             2.4
rec_td                                      0
longest_rec                                 9
rec_per_game                              0.8
rec_yards_per_game                        1.8
catch_percentage                  

In [24]:
rush_rec_df['position'].fillna('NEW')

name
Aaron Jones              rb
Aaron Ripkowski          fb
Aaron Rodgers            qb
Adam Humphries           wr
Adam Thielen             WR
Adoree' Jackson          CB
Adrian Peterson         NEW
Akeem Hunt              NEW
Albert Wilson            wr
Alex Collins             RB
Alex Erickson           NEW
Alex Smith               QB
Alfred Blue             NEW
Alfred Morris            rb
Alvin Kamara          fb/rb
Amari Cooper             WR
Ameer Abdullah           RB
Andre Ellington         NEW
Andre Williams          NEW
Andy Dalton              QB
Andy Janovich            fb
Anthony Sherman          fb
ArDarius Stewart         wr
Austin Davis            NEW
Austin Ekeler           NEW
Ben Roethlisberger       QB
Benny Cunningham        NEW
Bernard Reedy           NEW
Bilal Powell             RB
Blaine Gabbert           qb
                      ...  
Terron Ward             NEW
Tevin Coleman         fb/rb
Theo Riddick             rb
Thomas Rawls             rb
Tion Green     

In [25]:
rush_rec_df['position']

name
Aaron Jones              rb
Aaron Ripkowski          fb
Aaron Rodgers            qb
Adam Humphries           wr
Adam Thielen             WR
Adoree' Jackson          CB
Adrian Peterson         NaN
Akeem Hunt              NaN
Albert Wilson            wr
Alex Collins             RB
Alex Erickson           NaN
Alex Smith               QB
Alfred Blue             NaN
Alfred Morris            rb
Alvin Kamara          fb/rb
Amari Cooper             WR
Ameer Abdullah           RB
Andre Ellington         NaN
Andre Williams          NaN
Andy Dalton              QB
Andy Janovich            fb
Anthony Sherman          fb
ArDarius Stewart         wr
Austin Davis            NaN
Austin Ekeler           NaN
Ben Roethlisberger       QB
Benny Cunningham        NaN
Bernard Reedy           NaN
Bilal Powell             RB
Blaine Gabbert           qb
                      ...  
Terron Ward             NaN
Tevin Coleman         fb/rb
Theo Riddick             rb
Thomas Rawls             rb
Tion Green     