In [14]:
import requests
from bs4 import BeautifulSoup

year = 1978
url_year = f"https://www.footballdb.com/games/index.html?lg=NFL&yr={year}"
url_box = "https://www.footballdb.com//games/boxscore/new-york-jets-vs-cleveland-browns-1978121006"


In [39]:
import random
import time

def get_headers():
    """Returns a dictionary of headers with a randomly selected User-Agent"""
    # List of common User-Agent strings
    user_agents = [
        # Chrome on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        
        # Firefox on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        
        # Safari on macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        
        # Chrome on macOS
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        
        # Edge on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        
        # Chrome on Android
        'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 13; SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36',
        
        # Safari on iOS
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (iPad; CPU OS 17_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1'
    ]
    
    # Common headers that most browsers send
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',  # Do Not Track
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    }
    
    return headers

# Example usage in your scraping script:
def make_request(url, delay=True):
    """
    Makes a request with random headers and optional delay
    
    Args:
        url (str): URL to request
        delay (bool): Whether to add a random delay between requests
    
    Returns:
        requests.Response object
    """
    if delay:
        time.sleep(random.uniform(.5, 1.5))
    
    headers = get_headers()
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        print(f"Error making request to {url}: {e}")
        return None

In [21]:
import requests
from bs4 import BeautifulSoup
from typing import Dict, List
import re

def extract_boxscore_links(soup) -> Dict[int, List[str]]:
    """
    Extract boxscore links from the NFL schedule HTML content.
    
    Args:
        html_content (str): HTML content of the schedule page
        
    Returns:
        Dict[int, List[str]]: Dictionary mapping week numbers to lists of boxscore links
    """
    schedule_data = {}
    
    # Find all week dividers
    week_divs = soup.find_all('div', class_='ltbluediv')
    
    for week_div in week_divs:
        # Extract week number from the div header
        week_header = week_div.find(text=re.compile(r'Week \d+'))
        if not week_header:
            continue
            
        week_num = int(re.search(r'Week (\d+)', week_header).group(1))
        
        # Find the next table after the week header
        table = week_div.find_next('table')
        if not table:
            continue
            
        # Extract all boxscore links from the table
        boxscore_links = []
        for row in table.find_all('tr'):
            box_cell = row.find_all('td')[-1] if row.find_all('td') else None
            if box_cell and box_cell.find('a'):
                link = box_cell.find('a').get('href')
                if link and 'boxscore' in link:
                    # Add domain if the link is relative
                    if link.startswith('/'):
                        
                        link = f"https://www.footballdb.com/{link}"
                    boxscore_links.append(link)
        
        if boxscore_links:
            schedule_data[week_num] = boxscore_links
    
    return schedule_data

In [20]:
from bs4 import BeautifulSoup
import re
from typing import Dict, Union, List, Tuple

def extract_score_row_data(row) -> Tuple[str, List[int]]:
    """
    Extract team name and scores from a table row.
    
    Args:
        row: BeautifulSoup table row element
        
    Returns:
        Tuple containing team name and list of quarter scores
    """
    cells = row.find_all('td')
    
    # Extract team name from first cell, removing any extra whitespace and asterisks
    team_name = cells[0].text.strip().replace('*', '').strip()
    
    # Extract scores from remaining cells
    scores = []
    for cell in cells[1:]:  # Skip first cell (team name)
        # Extract number, handling both regular and bold scores
        score_text = cell.text.strip().replace('*', '')
        if score_text:
            try:
                scores.append(int(score_text))
            except ValueError:
                continue
                
    return team_name, scores

def get_boxscore(soup) -> Dict[str, Union[str, int]]:
    """
    Parse NFL boxscore HTML and return structured dictionary of game data.
    
    Args:
        html_content (str): Raw HTML content of boxscore page
        
    Returns:
        Dictionary containing structured game data
    """
    score_rows = soup.find_all('tr', class_='row0 center')
    
    away_team, away_scores = extract_score_row_data(score_rows[0])
    home_team, home_scores = extract_score_row_data(score_rows[1])
    
    # Create result dictionary
    result = {
        "home_team": home_team,
        "home_q1": home_scores[0],
        "home_q2": home_scores[1],
        "home_q3": home_scores[2],
        "home_q4": home_scores[3],
        "home_q5": home_scores[4] if len(home_scores) > 4 else 0,  # OT quarter if exists
        "home_final": home_scores[-1],  # Last score is final
        
        "away_team": away_team,
        "away_q1": away_scores[0],
        "away_q2": away_scores[1],
        "away_q3": away_scores[2],
        "away_q4": away_scores[3],
        "away_q5": away_scores[4] if len(away_scores) > 4 else 0,  # OT quarter if exists
        "away_final": away_scores[-1]  # Last score is final
    }
    
    return result

get_boxscore(soup)

{'home_team': 'Cleveland BrownsCleveland (8-7)',
 'home_q1': 7,
 'home_q2': 10,
 'home_q3': 10,
 'home_q4': 7,
 'home_q5': 3,
 'home_final': 37,
 'away_team': 'New York JetsNY Jets (8-7)',
 'away_q1': 0,
 'away_q2': 10,
 'away_q3': 0,
 'away_q4': 24,
 'away_q5': 0,
 'away_final': 34}

In [97]:
start_time = time.time()
years = range(1990, 1995)
boxscores_dict = {}
#boxscores_df = pd.DataFrame()

for year in years:
    boxscores_dict[year] = {}
    
    url_year = f"https://www.footballdb.com/games/index.html?lg=NFL&yr={year}"
    response_year = make_request(url_year)
    soup_year = BeautifulSoup(response_year.content, 'html.parser')

    links = extract_boxscore_links(soup_year)
    weeks = range(1, 18)
    for week in weeks:
        boxscores_dict[year][week] = {}
        if week not in links.keys():
            break
        
        for game_ind, url_game in enumerate(links[week]):
            #print(game_ind, url_game)
            response_game = make_request(url_game)
            soup_game = BeautifulSoup(response_game.content, 'html.parser')
            boxscore = get_boxscore(soup_game)
            game_df = pd.DataFrame([boxscore])
            game_df["season"] = year
            game_df["week"] = week
        
            boxscores_df = pd.concat([boxscores_df, game_df], ignore_index=True)

    end_time = time.time()
    print("Season", year)
    print(f"elapsed time: {end_time - start_time}s")

boxscores_df.head()

  week_header = week_div.find(text=re.compile(r'Week \d+'))


Season 1990
elapsed time: 334.54921793937683s


  week_header = week_div.find(text=re.compile(r'Week \d+'))


Season 1991
elapsed time: 669.450915813446s


  week_header = week_div.find(text=re.compile(r'Week \d+'))


Season 1992
elapsed time: 1001.6928899288177s


  week_header = week_div.find(text=re.compile(r'Week \d+'))


Season 1993
elapsed time: 1313.5077607631683s


  week_header = week_div.find(text=re.compile(r'Week \d+'))


Season 1994
elapsed time: 1651.2089519500732s


Unnamed: 0,home_team,home_q1,home_q2,home_q3,home_q4,home_q5,home_final,away_team,away_q1,away_q2,away_q3,away_q4,away_q5,away_final,season,week
0,Tampa Bay BuccaneersTampa Bay (0-1),7,3,0,3,13,13,New York GiantsNY Giants (1-0),7,3,6,3,19,19,1978,1
1,Detroit LionsDetroit (0-1),0,0,7,0,7,7,Green Bay PackersGreen Bay (1-0),6,0,7,0,13,13,1978,1
2,Atlanta FalconsAtlanta (1-0),0,14,3,3,20,20,Houston OilersHouston (0-1),7,0,7,0,14,14,1978,1
3,Cincinnati BengalsCincinnati (0-1),0,9,0,14,23,23,Kansas City ChiefsKansas City (1-0),7,10,7,0,24,24,1978,1
4,Philadelphia EaglesPhiladelphia (0-1),0,0,0,14,14,14,Los Angeles RamsLos Angeles (1-0),3,3,0,10,16,16,1978,1


In [98]:
nulls = boxscores_df.isnull().sum().sum()
rows = boxscores_df.shape[0]
print(f"Rows: {rows}, errors: {nulls}")
boxscores_df.tail()

Rows: 3626, errors: 0


Unnamed: 0,home_team,home_q1,home_q2,home_q3,home_q4,home_q5,home_final,away_team,away_q1,away_q2,away_q3,away_q4,away_q5,away_final,season,week
3621,Los Angeles RaidersLA Raiders (9-7),0,3,0,6,9,9,Kansas City ChiefsKansas City (9-7),7,7,3,2,19,19,1994,17
3622,Houston OilersHouston (2-14),7,6,8,3,24,24,New York JetsNY Jets (6-10),0,7,3,0,10,10,1994,17
3623,San Diego ChargersSan Diego (11-5),3,14,7,13,37,37,Pittsburgh SteelersPittsburgh (12-4),0,13,6,15,34,34,1994,17
3624,Miami DolphinsMiami (10-6),7,20,0,0,27,27,Detroit LionsDetroit (9-7),3,7,3,7,20,20,1994,17
3625,Minnesota VikingsMinnesota (10-6),7,3,11,0,21,21,San Francisco 49ersSan Francisco (13-3),0,7,0,7,14,14,1994,17


In [99]:
boxscores_df.to_csv("data/nfl box scores 2.csv", index=False)