In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from tqdm.notebook import tqdm
from pathlib import Path
import warnings

In [2]:
resp = requests.get('https://www.pro-football-reference.com/boxscores/199509030atl.htm')

In [34]:
def extract_comments(response, css_selectors):
    if isinstance(response, requests.models.Response):
        soup = BeautifulSoup(response.text, 'lxml')
    if isinstance(response, str):
        soup = BeautifulSoup(response, 'lxml')
    if isinstance(response, BeautifulSoup):
        warnings.warn('Calling this function modifies the BeautifulSoup object by extracting the comment from the DOM')
        soup = response
    for element in soup(text=lambda text: isinstance(text, Comment)):
        comment = element.extract()
        ret = BeautifulSoup(comment).find('div')
        if ret:
            ret = ret.find('table', css_selectors)
            if ret:
                return ret
    return comments

In [121]:
def get_pdr_id(url):
    return url.split('/')[-1].split('.htm')[0]

In [35]:
table = extract_comments(resp, {'id':'pbp'})

In [39]:
# Get the fancy table headers hidden in the mouseover 
header_row = table.find('tr')
headers = [header['aria-label'] for header in header_row.find_all('th')]

In [177]:
table.find_all('tr')[2].find_all('td')[3].text.split(' ')[0]

'CAR'

In [187]:
plays = list()
# STarting at drive 1
drive = 1
score = False
play_number = 1  # Techinically don't need this cuz the index for the DF will have it; but we don't have a DF yet =]
# This is the hackiest of all hacks; basically figure out who kicked out based on where the ball is marked
pos_team = f"Not {table.find_all('tr')[2].find_all('td')[3].text.split(' ')[0]}"
other_team = table.find_all('tr')[2].find_all('td')[3].text.split(' ')[0]

for row in table.find_all('tr')[1:]:
    header_class = row.get('class')
    if header_class:
        # These are useless rows
        if 'thead' in header_class:
            continue
        # They bold between drives, so increment if we see one (this may break for kickoffs after touchdowns?)
        if 'divider' in header_class:
            drive = drive + 1
            
            # We can use these dividers to deal with posession.  This will likely break if possession changes multiple times in one play
            
        # If we score, yay!
        if 'score' in header_class:
            score = True
        else:
            score = False
    # Add all the "stock" information from the table to a dict, using the headers we defined before
    play.update(dict(zip(headers, [col.text for col in row.find_all(['th', 'td'])])))
    # Add some of our own inferred information
    play['drive'] = drive
    play['score'] = score
    play['play_number'] = play_number
    # Now for the hard part; let's start parsing the description field
    desc = row.find_all('td')[4]
    
    # Let's extract the action players
    action_players = [entity for entity in desc.find_all('a') if entity.text]
    ret_action_players = list()
    
    # We're gonna add a row that's a list of dicts for the action player (which is dumb to do in a dataframe, but whatever)
    # Each element in the list is a dictionary of: 'name': 'A', 'pfr_id': 'B'
    for player in action_players:
        ret_action_player = dict()
        ret_action_player['name'] = player.text
        ret_action_player['pfr_id'] = get_pdr_id(player['href'])
        ret_action_players.append(ret_action_player)
        
    play['action_players'] = ret_action_players
    
    # Passes
    print(f"Play {play['play_number']} on drive {play['drive']}, ", end='')
    if 'pass complete' in desc.text:
        print(f"Complete pass by {play['action_players'][0]}")
        # We can't guarantee there'll be an action reciever
        try:
            # There may be cases where the second action player is a tackler, etc.
            print(f"\t to {play['action_players'][1]}")
        except IndexError:
              pass
    elif 'pass incomplete' in desc.text:
        print(f"Incomplete pass by {play['action_players'][0]}")
    # Rushes
    elif 'middle' in desc.text or 'right' in desc.text or 'left' in desc.text:
        print(f"Rush by {play['action_players'][0]}")
    # Penalty
    elif 'Penalty' in desc.text:
        print(f"Penalty by {play['action_players'][0]}")
    # Sacked
    elif 'Sacked' in desc.text:
        print(f"Penalty by {play['action_players'][0]} sacked by {play['action_players'][1]} ")
    # Kickoff
    elif 'kicks off' in desc.text:
        print(f"Kickoff by {play['action_players'][0]}")
        if not 'touchback' in desc.text:
            print(f"\treturn by {play['action_players'][1]}")
        else:
            print(f"\ttouchback")
        
    
    # Etc. etc.

    
    plays.append(play)
    play_number = play_number + 1

Play 1 on drive 1, Kickoff by {'name': 'John Kasay', 'pfr_id': 'kasayjoh01'}
	return by {'name': 'Roell Preston', 'pfr_id': 'PresRo00'}
Play 2 on drive 1, Rush by {'name': 'Craig Heyward', 'pfr_id': 'HeywCr00'}
Play 3 on drive 1, Complete pass by {'name': 'Jeff George', 'pfr_id': 'GeorJe00'} on
	 to {'name': 'Steve Lofton', 'pfr_id': 'LoftSt20'}
Play 4 on drive 1, Incomplete pass by {'name': 'Eric Metcalf', 'pfr_id': 'MetcEr00'}
Play 5 on drive 1, Play 6 on drive 2, Complete pass by {'name': 'Frank Reich', 'pfr_id': 'ReicFr00'} on
	 to {'name': 'Bob Christian', 'pfr_id': 'ChriBo00'}
Play 7 on drive 2, Complete pass by {'name': 'Frank Reich', 'pfr_id': 'ReicFr00'} on
	 to {'name': 'Willie Green', 'pfr_id': 'GreeWi00'}
Play 8 on drive 2, Penalty by {'name': 'Derrick Graham', 'pfr_id': 'GrahDe21'}
Play 9 on drive 2, Complete pass by {'name': 'Frank Reich', 'pfr_id': 'ReicFr00'} on
	 to {'name': 'Mark Carrier', 'pfr_id': 'CarrMa00'}
Play 10 on drive 2, Play 11 on drive 2, Complete pass by 

In [144]:
df = pd.DataFrame(plays

In [145]:
df[df['drive']==1]

Unnamed: 0,Quarter,Time remaining in this quarter,Down,ToGo,Play location,Play description,Away Points,Home Points,"Expected points before this play, based on down, distance, and field position",Expected points after this play (or actual scoring results of the play if it was a score),drive,score,action_players
0,1,15:00,,,CAR 30,"John Kasay kicks off 66 yards, returned by Roe...",0,0,0.0,0.48,1,False,"[{'name': 'John Kasay', 'pfr_id': 'kasayjoh01'..."
1,1,,1.0,10.0,ATL 23,Craig Heyward middle for 3 yards (tackle by Ro...,0,0,0.48,0.34,1,False,"[{'name': 'Craig Heyward', 'pfr_id': 'HeywCr00..."
2,1,,2.0,7.0,ATL 26,Jeff George pass complete left for 6 yards (ta...,0,0,0.34,0.43,1,False,"[{'name': 'Jeff George', 'pfr_id': 'GeorJe00'}..."
3,1,,3.0,1.0,ATL 32,Eric Metcalf pass incomplete right intended fo...,0,0,0.43,-1.24,1,False,"[{'name': 'Eric Metcalf', 'pfr_id': 'MetcEr00'..."
4,1,,4.0,1.0,ATL 32,"Dan Stryzinski punts 43 yards, returned by Eri...",0,0,-1.24,-1.27,1,False,"[{'name': 'Dan Stryzinski', 'pfr_id': 'StryDa2..."


In [95]:
df

Unnamed: 0,quarter,Quarter,Time remaining in this quarter,Down,ToGo,Play location,Play description,Away Points,Home Points,"Expected points before this play, based on down, distance, and field position",drive,score
0,1,15:00,,,CAR 30,"John Kasay kicks off 66 yards, returned by Roe...",0,0,0.000,0.480,1,False
1,1,,1,10,ATL 23,Craig Heyward middle for 3 yards (tackle by Ro...,0,0,0.480,0.340,1,False
2,1,,2,7,ATL 26,Jeff George pass complete left for 6 yards (ta...,0,0,0.340,0.430,1,False
3,1,,3,1,ATL 32,Eric Metcalf pass incomplete right intended fo...,0,0,0.430,-1.240,1,False
4,1,,4,1,ATL 32,"Dan Stryzinski punts 43 yards, returned by Eri...",0,0,-1.240,-1.270,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
199,OT,,2,8,CAR 29,Jeff George pass complete right to Eric Metcal...,20,20,3.240,3.070,33,False
200,OT,,3,4,CAR 25,Jeff George pass complete right to Bert Emanue...,20,20,3.070,4.240,33,False
201,OT,,1,10,CAR 20,Craig Heyward middle for 2 yards (tackle by La...,20,20,4.240,3.960,33,False
202,OT,,2,8,CAR 18,Eric Metcalf left for 2 yards (tackle by Geral...,20,20,3.960,3.520,33,False
