In [1]:
import requests
import time
import glob

import pandas as pd
pd.set_option('display.max_columns', 100)

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

In [3]:
# Convert Pro-Football-Reference (PFR) abbreviations to preferred abbreviations.
#
# Only PFR codes that differ from the preferred code are listed. Keys and
# values are lower-case; callers are expected to lower-case the key before
# the lookup and fall back to the input code when there is no entry.
pfr_standard: dict[str, str] = {
    'ram': 'lar',  # Rams
    'nor': 'no',   # Saints
    'sfo': 'sf',   # 49ers
    'clt': 'ind',  # Colts
    'htx': 'hou',  # Texans
    'nwe': 'ne',   # Patriots
    'rav': 'bal',  # Ravens
    'kan': 'kc',   # Chiefs
    'gnb': 'gb',   # Packers
    'crd': 'ari',  # Cardinals
    'oti': 'ten',  # Titans
    'rai': 'lv',   # Raiders
    'lvr': 'lv',   # Raiders (second PFR code mapping to the same team)
    'sdg': 'lac',  # Chargers
    'tam': 'tb',   # Buccaneers
}

def standardize_initials(team):
    """
    Return the preferred upper-case abbreviation for a team code.

    The code is lower-cased for the ``pfr_standard`` lookup; a code with no
    entry there is returned unchanged apart from being upper-cased.
    """
    lowered = team.lower()
    if lowered in pfr_standard:
        return pfr_standard[lowered].upper()
    return team.upper()


# Full team name ("City Nickname") -> preferred upper-case abbreviation.
# NOTE(review): the `# **` markers come from the original author and their
# meaning is not documented here — TODO confirm.
teamname_initials: dict[str, str] = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF',

    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',

    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',

    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',  # **
    'Kansas City Chiefs': 'KC',

    'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LAR',
    'Las Vegas Raiders': 'LV',  # **
    'Miami Dolphins': 'MIA',

    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',

    'New York Jets': 'NYJ',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF',

    'Seattle Seahawks': 'SEA',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN',
    'Washington Commanders': 'WAS',  # **
}


def convert_teamname(team_str):
    """
    Translate a full team name into its abbreviation.

    Raises KeyError when ``team_str`` is not a key of ``teamname_initials``.
    """
    initials = teamname_initials[team_str]
    return initials

# Abbreviation -> nickname only (the last space-separated word of the full name)
initials_teamname = {
    initials: full_name.split(' ')[-1]
    for full_name, initials in teamname_initials.items()
}
# Reverse lookup: nickname -> abbreviation
# TODO: these two names could be clearer
teamname_nocity_initials = dict(
    (nickname, initials) for initials, nickname in initials_teamname.items()
)

def convert_initials(team_initials):
    """
    Translate a team abbreviation into the team nickname (no city).

    Raises KeyError when the abbreviation is not in ``initials_teamname``.
    """
    nickname = initials_teamname[team_initials]
    return nickname

def convert_teamname_nocity(teamname_nocity):
    """
    Translate a team nickname (no city) into its abbreviation.

    Raises KeyError when the nickname is not in ``teamname_nocity_initials``.
    """
    initials = teamname_nocity_initials[teamname_nocity]
    return initials

In [4]:
def clean_name(name: str) -> str:
    """
    Standardizes name across PFR, FD, DK

    Keeps only the first two space-separated tokens and strips every '.'
    from them, which drops trailing tokens such as suffixes.

    NOTE: anything after the second token is lost, so a multi-word surname
    is truncated (e.g. 'Amon-Ra St. Brown' becomes 'Amon-Ra St').
    """
    leading_tokens = name.split(' ')[:2]
    return ' '.join(token.replace('.', '') for token in leading_tokens)
    
# `data-stat` attribute values read from the offensive stats table.
# The first entry ('player') is skipped when the <td> cells are scraped
# (see the `data_columns[1:]` slice where the table is parsed).
data_columns = [
    # identity
    'player',  # within href
    'team',
    # passing
    'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int',
    'pass_sacked', 'pass_sacked_yds', 'pass_long', 'pass_rating',
    # rushing
    'rush_att', 'rush_yds', 'rush_td', 'rush_long',
    # receiving
    'targets', 'rec', 'rec_yds', 'rec_td', 'rec_long',
    # ball security
    'fumbles', 'fumbles_lost',
]

# `data-stat` values read from the defensive table: the team code plus the
# two defensive-touchdown columns.
def_td_categories = ['team', 'def_int_td', 'fumbles_rec_td']
# Every category except 'team' holds an integer count.
int_cats = def_td_categories[1:]


def parse_stat(stat_, stat_val):
    """
    Parse one cell of the defensive table.

    Integer categories (``int_cats``) are converted with ``int``; a blank
    cell counts as 0, matching how the rating and kicking parsers in this
    notebook treat empty cells. Any other category is treated as a team
    code and passed through ``standardize_initials``.
    """
    if stat_ in int_cats:
        return int(stat_val) if len(stat_val) else 0
    return standardize_initials(stat_val)

In [5]:
# TODO: derive the URL from the schedule page instead of hard-coding it, e.g.
# game_url: str = f"{root_url}{game.find_all('td', class_='right gamelink')[0].find('a')['href']}"
game_url = 'https://www.pro-football-reference.com/boxscores/202309070kan.htm'

# Run Firefox without a visible window
ff_options = Options()
ff_options.add_argument('--headless')

driver = webdriver.Firefox(options=ff_options)
try:
    driver.get(game_url)
    # Parse the page source as delivered by the browser session
    game_soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down; otherwise every run of this cell leaves
    # a headless Firefox process behind.
    driver.quit()

In [9]:
week = 1

# First 'scorebox' div on the page (IndexError if the page has none)
scorebox = game_soup.find_all('div', class_='scorebox')[0]

# Team names are read from the <strong> tags at positions 0 and 2.
# NOTE(review): presumably position 1 holds something other than a team name — confirm against the page.
strong_tags = scorebox.find_all('strong')
away_team = convert_teamname(strong_tags[0].get_text().replace('\n', ''))
home_team = convert_teamname(strong_tags[2].get_text().replace('\n', ''))

# Exactly two 'scores' divs are expected: away first, then home
score_divs = scorebox.find_all('div', class_='scores')
away_score, home_score = [int(div.get_text().replace('\n', '')) for div in score_divs]

In [25]:
# Now for advanced stats and snap-counts

# Snap Counts: one HTML table per team (the away table's id uses 'vis', not 'away')
snapcounts_tables = {
    away_team: game_soup.find_all('table', id='vis_snap_counts')[0],
    home_team: game_soup.find_all('table', id='home_snap_counts')[0],
}

# Two separate tables instead of one combined table --> 2d dict
# (team -> column name -> list of values)
snapcounts_data = {team_key: {} for team_key in (away_team, home_team)}

# Dont want lineman info (for now)
target_pos = ('QB', 'WR', 'RB', 'TE')
# Only want offensive data (for now) --> data-stat values in HTML
snapcount_data_stats = ('player', 'pos', 'offense', 'off_pct')

# Going to get everyone at first (easier) --> then will filter dict based on position / index
for team, snapcount_html_table in snapcounts_tables.items():
    team_snaps = snapcounts_data[team]

    # Player names sit in <th> cells; the header cell reads 'Player' and is skipped
    player_cells = snapcount_html_table.find_all('th', attrs={'data-stat': 'player'})
    team_snaps['name'] = [
        clean_name(cell.get_text()) for cell in player_cells
        if cell.get_text() != 'Player'
    ]

    # Remaining stats sit in <td> cells and are kept as raw text for now
    for stat in snapcount_data_stats[1:]:
        stat_cells = snapcount_html_table.find_all('td', attrs={'data-stat': stat})
        team_snaps[stat] = [cell.get_text() for cell in stat_cells]


# Now cleaning: keep only rows whose position is in target_pos and turn the
# snap columns into numbers

for team, snap_info in snapcounts_data.items():
    # Indexes of positions in target_pos (computed once, before 'pos' itself is filtered below)
    num_entries = len(snap_info['name'])
    pos_column = snap_info['pos']
    pos_indexes = []
    for i in range(num_entries):
        if pos_column[i] in target_pos:
            pos_indexes.append(i)

    for stat in snap_info:
        column = snap_info[stat]
        target_pos_values = [column[i] for i in pos_indexes]
        if stat == 'offense':
            # snap totals -> int
            target_pos_values = list(map(int, target_pos_values))
        elif stat == 'off_pct':
            # drop the trailing character (the percent sign) and scale to a 0-1 fraction
            target_pos_values = [float(pct[:-1]) / 100 for pct in target_pos_values]
        snap_info[stat] = target_pos_values


# Flattening, adding game info for subsequent individual dataframes
awayteam_df_data = snapcounts_data[away_team]
hometeam_df_data = snapcounts_data[home_team]

awayteam_num_rows = len(awayteam_df_data['name'])
hometeam_num_rows = len(hometeam_df_data['name'])

# Every row carries its own team and the opposing team
awayteam_df_data.update(
    team=[away_team] * awayteam_num_rows,
    opp=[home_team] * awayteam_num_rows,
)
hometeam_df_data.update(
    team=[home_team] * hometeam_num_rows,
    opp=[away_team] * hometeam_num_rows,
)

# HTML data-stat names -> column names used in the DataFrames
rename_columns = dict(offense='snap_total', off_pct='snap_percent')

# Make DataFrames
away_df = pd.DataFrame(awayteam_df_data).rename(columns=rename_columns)
home_df = pd.DataFrame(hometeam_df_data).rename(columns=rename_columns)

In [26]:
away_df

Unnamed: 0,name,pos,snap_total,snap_percent,team,opp
0,Jared Goff,QB,70,1.0,DET,KC
1,Amon-Ra St,WR,66,0.94,DET,KC
2,Sam LaPorta,TE,58,0.83,DET,KC
3,David Montgomery,RB,55,0.79,DET,KC
4,Josh Reynolds,WR,49,0.7,DET,KC
5,Marvin Jones,WR,39,0.56,DET,KC
6,Brock Wright,TE,27,0.39,DET,KC
7,Kalif Raymond,WR,19,0.27,DET,KC
8,Jahmyr Gibbs,RB,19,0.27,DET,KC
9,James Mitchell,TE,6,0.09,DET,KC


In [None]:
# Offensive player stats table
stat_table = game_soup.find_all('table', id='player_offense')[0]
# Player names sit in <th> cells; the header cell reads 'Player' and is skipped
names = [
    clean_name(tag.get_text()) for tag in stat_table.find_all('th', attrs={'data-stat': 'player'})
    if tag.get_text() != 'Player'
]


def convert_stat_str(stat, stat_val):
    """Leave 'player', 'team' and 'pass_rating' cells as text; parse every other cell as an int."""
    return stat_val if stat in ['player', 'team', 'pass_rating'] else int(stat_val)


# 'player' (data_columns[0]) is skipped here: names were collected from the <th> cells above
table_data = {
    stat: [convert_stat_str(stat, td.get_text()) for td in stat_table.find_all('td', attrs={'data-stat': stat})]
    for stat in data_columns[1:]
}

# Will do rest of cleaning later on, just wanted to not have any NA values in saved files and have standardized names, teams, and positions


def fix_rating(rating_str):
    """Parse a passer rating; a blank cell becomes 0.0."""
    return float(rating_str) if len(rating_str) else 0.0


# Unique team codes in order of first appearance. A plain set() was used here
# before, whose iteration order for strings can differ between kernel runs,
# making the order of `teams` (and every per-team row built from it) unstable.
teams = list(dict.fromkeys(standardize_initials(team) for team in table_data['team']))
assert len(teams) == 2, f"expected exactly two teams in the offense table, got {teams}"


def get_opp(team_):
    """Return the other team of the game."""
    return teams[1] if team_ == teams[0] else teams[0]


def get_score(team_):
    """Return the points scored by ``team_``."""
    return home_score if team_ == home_team else away_score


def get_opp_score(team_):
    """Return the points scored by the opponent of ``team_``."""
    return away_score if team_ == home_team else home_score


# A tie has no winner; previously the away team was credited with the win
# whenever the home team did not score strictly more.
if home_score > away_score:
    winning_team = home_team
elif away_score > home_score:
    winning_team = away_team
else:
    winning_team = None


def is_winner(team_):
    """Return 1 if ``team_`` won the game, else 0 (both teams get 0 on a tie)."""
    return int(team_ == winning_team)


total_score = away_score + home_score

table_data['pass_rating'] = [ fix_rating(rating) for rating in table_data['pass_rating'] ]
table_data['team'] = [ standardize_initials(team) for team in table_data['team'] ]
table_data['opp'] = [ get_opp(team) for team in table_data['team'] ]
table_data['home'] = [ int(team == home_team) for team in table_data['team'] ]
table_data['score'] = [ get_score(team) for team in table_data['team'] ]
table_data['opp_score'] = [ get_opp_score(team) for team in table_data['team'] ]
table_data['winner'] = [ is_winner(team) for team in table_data['team'] ]
table_data['total'] = [total_score] * len(names)
table_data['week'] = [week] * len(names)

df = (pd
      .DataFrame(data={**{'name': names}, **table_data})
      # Need to cast columns
      # Fantasy points: weighted sum of passing, rushing and receiving stats, minus turnovers
      .assign(fpts=lambda d: 0.04*d.pass_yds + 4.0*d.pass_td - 1.0*d.pass_int + 0.1*d.rush_yds + 6.0*d.rush_td + 1.0*d.rec + 0.1*d.rec_yds + 6.0*d.rec_td - 1.0*d.fumbles_lost)
     )

# Defensive handling

# This info comes from the offensive tables
# Team : # of sacks they received
# Example: BUF: # times Josh Allen was sacked
# i.e. each team's entry holds what its OFFENSE gave up; the opposing defense
# earns the fantasy points for it further below.
team_defense_stats = {
    team: {
        stat: df.loc[df['team'] == team, stat].astype('int').sum()
        for stat in ('pass_sacked', 'pass_int', 'fumbles_lost')
    }
    for team in teams
}

# Points allowed by a team's defense start out as the opponent's full score
for team, def_stats in team_defense_stats.items():
    def_stats['pts_allowed'] = get_opp_score(team)

def_table = game_soup.find_all('table', id='player_defense')[0]

# ('team', 'def_int_td', 'fumbles_rec_td')
def_table_data = {
    stat: [parse_stat(stat, td.get_text()) for td in def_table.find_all('td', attrs={'data-stat': stat})]
    for stat in def_td_categories
}

# Need to count touchdowns for defense, most important after points allowed
# Rows of the three columns are aligned by index, so row i's team owns row i's TDs
team_def_tds = {team: 0 for team in teams}
for cat in int_cats:
    for i, td in enumerate(def_table_data[cat]):
        team_def_tds[def_table_data['team'][i]] += td


# TODO: these two rule tables should be moved to _info.py
# Fantasy points per event forced by the defense
def_fpts_rules = {
    'pass_sacked': 1.0,
    'pass_int': 2.0,
    'fumbles_lost': 2.0,
}

# Fantasy points by bracket of points allowed; a value outside every bracket
# (100 or more) earns nothing from this table
pts_allowed_rules = {
    range(0,1): 10.0,
    range(1,7): 7.0,
    range(7,14): 4.0,
    range(14,21): 1.0,
    range(21, 28): 0.0,
    range(28, 35): -1.0,
    range(35, 100): -4.0
}

# Initialize dictionary for fpts for defenses
def_fpts = {team: 0 for team in teams}

# Careful having all in one loop
for team in teams:
    opp = get_opp(team)

    # Sacks / interceptions / fumbles lost by the OPPONENT's offense
    for cat, multi in def_fpts_rules.items():
        def_fpts[team] += team_defense_stats[opp][cat]*multi

    # Touchdowns scored by this team's own defense
    def_fpts[team] += 6.0*team_def_tds[team]

    # Defense not responsible for opposing defense getting TD:
    # those points were scored against this team's offense, so remove the
    # OPPONENT's defensive touchdowns from this team's points allowed.
    # (Subtracting the team's own defensive TDs, as was done before, lowers
    # the wrong total and can even push it below zero.)
    team_defense_stats[team]['pts_allowed'] -= 6*team_def_tds[opp]

    for pts_range, fpts_ in pts_allowed_rules.items():
        if team_defense_stats[team]['pts_allowed'] in pts_range:
            def_fpts[team] += fpts_
            break  # brackets do not overlap


# One row per team defense, named by the team nickname
def_data = {
    'name': [convert_initials(team_) for team_ in teams],
    'team': teams,
    'opp': [get_opp(team_) for team_ in teams],
    'home': [ int(team_ == home_team) for team_ in teams ],
    'week': [week] * 2,
    'score': [ get_score(team_) for team_ in teams ],
    'opp_score':[ get_opp_score(team_) for team_ in teams ],
    'winner': [ is_winner(team_) for team_ in teams ],
    'total': [total_score] * 2,
    'fpts': [def_fpts[team_] for team_ in teams]
}

# Kicking data

kicking_table = game_soup.find_all('table', id='kicking')[0]

# Kicker names sit in <th> cells; the header cell reads 'Player' and is skipped
kickers = [
    clean_name(tag.get_text()) for tag in kicking_table.find_all('th', attrs={'data-stat': 'player'})
    if tag.get_text() != 'Player'
]


def convert_kicking_val(kick_val):
    """Parse a kicking count; a blank cell counts as 0."""
    return int(kick_val) if len(kick_val) else 0


def parse_kicking_stat(kick_stat, kick_val):
    """Standardize the 'team' cell; parse every other kicking cell as a count."""
    return convert_kicking_val(kick_val) if kick_stat != 'team' else standardize_initials(kick_val)


kicking_data = {
    stat: [parse_kicking_stat(stat, td.get_text()) for td in kicking_table.find_all('td', attrs={'data-stat': stat})]
    for stat in ['team', 'xpm', 'fgm']
}

# Fantasy points per made kick ('xpm' / 'fgm' columns)
kicking_fpts_rules = {
    'xpm': 1.0,
    'fgm': 3.0
}

kicking_fpts = {kicker: 0.0 for kicker in kickers}

# Row i of every kicking column belongs to kickers[i]
for stat, multi in kicking_fpts_rules.items():
    for i, kicking_val in enumerate(kicking_data[stat]):
        kicking_fpts[kickers[i]] += kicking_val*multi

# Team codes were already standardized by parse_kicking_stat
kicker_teams = kicking_data['team']

kicking_df_data = {
    'name': kickers,
    'team': list(kicker_teams),
    'opp': [get_opp(team_) for team_ in kicker_teams],
    'home': [ int(team_ == home_team) for team_ in kicker_teams],
    'week': [week] * len(kickers),
    'score': [ get_score(team_) for team_ in kicker_teams ],
    'opp_score':[ get_opp_score(team_) for team_ in kicker_teams ],
    # Use each kicker's own team; this previously read a stale `team` variable
    # left over from an earlier loop, giving every kicker the same flag.
    'winner': [ is_winner(team_) for team_ in kicker_teams ],
    'total': [total_score] * len(kickers),
    'fpts': [kicking_fpts[kicker] for kicker in kickers]
}



# Good DataFrame
# Position super easy for K and DST

# Stack offensive players, team defenses and kickers into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source frame keeps its own 0-based labels and the combined index contains
# duplicates. Stat columns that the defense / kicker rows lack are filled with 0.
final_df = (pd
            .concat(
                [
                    df,
                    pd.DataFrame(def_data),
                    pd.DataFrame(kicking_df_data)
                ],
                ignore_index=True
            )
            .fillna(0)
           )

In [None]:
final_df