In [ ]:
# Core imports for NFL schedule parsing
import re
import pandas as pd
import json
import pdfplumber
from datetime import datetime

In [ ]:
# DEPRECATED: Old PyPDF2 parser - replaced by pdfplumber version
# This function had issues with mangled text extraction and incorrect dates
# Left here for reference but use parse_nfl_with_pdfplumber_full instead

In [ ]:
import pdfplumber
import pandas as pd
import re
import json
from datetime import datetime

def parse_nfl_with_pdfplumber_full(pdf_path, season_year):
    """Parse an NFL schedule PDF into a DataFrame with one row per game.

    The PDF text is scanned line by line. "WEEK <n>" headers set the current
    week, full date lines ("Sunday, September 8, 2024") set the current date,
    and every later line containing " at " (regular game) or " vs "
    (flagged as international) is recorded as a game on that date.

    Args:
        pdf_path: Path of the schedule PDF, passed to ``pdfplumber.open``.
        season_year: Season label; stored in the ``season`` column and used
            as the prefix of ``game_id`` (e.g. ``2024_001``).

    Returns:
        pandas.DataFrame with the columns game_id, season, week, game_date,
        day_of_week, away_team, home_team, local_time, et_time, tv_network,
        pfr_url, pfr_home_code, pfr_away_code and is_international. The
        columns are present even when no game was found.

    Raises:
        FileNotFoundError: if ``teams.json`` is missing from the working
            directory.
        ValueError: if a date line names an impossible calendar date.
    """
    # Load team codes for URL generation (maps team name -> PFR team code)
    with open('teams.json', 'r', encoding='utf-8') as f:
        team_codes = json.load(f)

    # Patterns are compiled once instead of once per line.
    week_re = re.compile(r'WEEK\s+(\d+)')
    date_re = re.compile(
        r'(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s+'
        r'(January|February|March|April|May|June|July|August|September|October|November|December)\s+'
        r'(\d{1,2}),?\s*(\d{4})'
    )
    home_split_re = re.compile(r'\s+\d+:\d+|\s+\([^)]*\)\s+\d|\s+[A-Z]{2,4}\s*$')
    trailing_paren_re = re.compile(r'\s+\([^)]*\)$')
    day_abbrev_re = re.compile(r'\((Thu|Mon|Tue|Wed|Fri|Sat|Sun)\)')
    # Accept both "a" and "p" suffixes: a morning ET kickoff (e.g. "9:30a")
    # was previously ignored, which made et_time fall back to the local time.
    time_re = re.compile(r'\d+:\d+[ap]')

    month_map = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12
    }

    # First match wins, so 'ESPN/ABC' must be tested before its prefix 'ESPN';
    # otherwise it can never be reported.
    networks = ['NBC', 'CBS', 'FOX', 'ESPN/ABC', 'ESPN', 'NFLN', 'Prime Video', 'Peacock']

    columns = [
        'game_id', 'season', 'week', 'game_date', 'day_of_week',
        'away_team', 'home_team', 'local_time', 'et_time', 'tv_network',
        'pfr_url', 'pfr_home_code', 'pfr_away_code', 'is_international',
    ]

    games = []
    game_number = 1
    current_week = 1
    current_date = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from page; pages without text are skipped
            text = page.extract_text()
            if not text:
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                # Check for week headers. The line still falls through to the
                # date/game checks below, exactly as before.
                week_match = week_re.search(line)
                if week_match:
                    current_week = int(week_match.group(1))

                # Check for date patterns; a date line is never a game line
                date_match = date_re.search(line)
                if date_match:
                    month = month_map[date_match.group(2)]
                    day = int(date_match.group(3))
                    year = int(date_match.group(4))
                    current_date = datetime(year, month, day).date()
                    continue

                # Games are only recorded once a date has been seen
                if current_date is None:
                    continue

                # Check for games - both "at" and "vs" (international)
                if ' at ' in line:
                    separator = ' at '
                    is_international = False
                elif ' vs ' in line:
                    separator = ' vs '
                    is_international = True
                else:
                    continue

                # Skip header lines
                if 'GAME' in line and 'LOCAL' in line and 'ET' in line:
                    continue

                parts = line.split(separator)
                if len(parts) != 2:
                    continue

                away_team = parts[0].strip()

                # Extract home team: cut at the first time / "(Day) n" /
                # trailing upper-case code, then drop a trailing "(...)".
                home_team = home_split_re.split(parts[1])[0].strip()
                home_team = trailing_paren_re.sub('', home_team)

                # Very short fragments are treated as parsing noise
                if len(away_team) <= 3 or len(home_team) <= 3:
                    continue

                # Generate PFR URL (only possible when the home team is known)
                home_code = team_codes.get(home_team)
                away_code = team_codes.get(away_team)

                pfr_url = None
                if home_code:
                    date_str = current_date.strftime('%Y%m%d')
                    pfr_url = f"https://www.pro-football-reference.com/boxscores/{date_str}0{home_code}.htm"

                # Extract additional details
                day_match = day_abbrev_re.search(line)
                day_of_week = day_match.group(1) if day_match else None

                # First time on the line is local, second (if any) is ET
                times = time_re.findall(line)
                local_time = times[0] if times else None
                if len(times) > 1:
                    et_time = times[1]
                else:
                    et_time = local_time

                tv_network = next((name for name in networks if name in line), None)

                games.append({
                    'game_id': f"{season_year}_{game_number:03d}",
                    'season': season_year,
                    'week': current_week,
                    'game_date': current_date,
                    'day_of_week': day_of_week,
                    'away_team': away_team,
                    'home_team': home_team,
                    'local_time': local_time,
                    'et_time': et_time,
                    'tv_network': tv_network,
                    'pfr_url': pfr_url,
                    'pfr_home_code': home_code,
                    'pfr_away_code': away_code,
                    'is_international': is_international
                })
                game_number += 1

    # Passing the column list keeps the schema stable when `games` is empty,
    # so callers indexing e.g. ['is_international'] do not hit a KeyError.
    return pd.DataFrame(games, columns=columns)

# Parse the full 2024 season, print a short summary, and write the schedule
# CSV that the batch scraper reads.
print("Parsing full 2024 NFL schedule with pdfplumber...")
games_2024_full = parse_nfl_with_pdfplumber_full('NFL-Regular-season-2024.pdf', 2024)

print(f"Found {len(games_2024_full)} games total")

# Check for international games (lines that used "vs" instead of "at")
international_games = games_2024_full[games_2024_full['is_international'] == True]
print(f"International games (vs): {len(international_games)}")

# Columns shown in the head/tail previews below
preview_columns = ['game_id', 'week', 'game_date', 'away_team', 'home_team', 'is_international']

print("\nFirst 5 games:")
print(games_2024_full[preview_columns].head(5))

print("\nLast 5 games:")
print(games_2024_full[preview_columns].tail(5))

# Save to CSV for batch scraper
csv_filename = 'nfl_2024_schedule.csv'
games_2024_full.to_csv(csv_filename, index=False)
# The status icon was stored as mojibake (UTF-8 bytes read as cp1252); restored.
print(f"\n📄 Saved {len(games_2024_full)} games to {csv_filename} for batch scraper")

# Also save a second, identical copy for auditing
games_2024_full.to_csv('nfl_2024_schedule_corrected.csv', index=False)
print("📄 Saved audit copy to nfl_2024_schedule_corrected.csv")

# NFL Player Stats Scraper

Test scraping player statistics from Pro Football Reference box scores.

In [ ]:
# REMOVED: Debug code for testing single game scraping
# This was used during development but no longer needed

In [ ]:
# REMOVED: Debug code for parsing table structure
# This was used during development but no longer needed

In [ ]:
# REMOVED: Debug code for finding advanced tables in HTML comments
# This was used during development but no longer needed

In [ ]:
# This cell removed - was just testing single game scraping and creating unnecessary files

# Batch NFL Stats Scraper

Scrape all 272 games from the 2024 season in random order, with a randomized delay between requests.

In [ ]:
import random
import time
import os
import shutil
from datetime import datetime, timedelta

def _extract_player_rows(rows, game_id):
    """Return one ``[game_id, cell, ...]`` list per data row, skipping header rows."""
    extracted = []
    for row in rows:
        # Repeated in-table header rows are marked by their CSS class.
        # str() of the class list is searched so the match stays a substring test.
        css_classes = row.get('class')
        if css_classes:
            class_text = str(css_classes)
            if 'thead' in class_text or 'over_header' in class_text:
                continue

        cells = [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]

        # A row whose first cell is the literal "Player" is a header row
        if cells and cells[0] == 'Player':
            continue

        # Keep rows that have a non-empty first cell and at least one more cell
        if len(cells) > 1 and cells[0]:
            extracted.append([game_id] + cells)
    return extracted


def scrape_game_tables_with_id(soup, game_id, game_url):
    """Extract the offense tables of one box-score page, tagging rows with game_id.

    Up to four tables are returned: the ``player_offense`` table found directly
    in the page, plus the first table inside the first HTML comment that
    mentions each of "Advanced Passing", "Advanced Rushing" and
    "Advanced Receiving" (case-insensitive).

    Args:
        soup: BeautifulSoup document of the box-score page.
        game_id: Value placed in the first column of every returned row.
        game_url: Accepted for interface compatibility; not used here.

    Returns:
        dict mapping a subset of 'basic_offense', 'advanced_passing',
        'advanced_rushing' and 'advanced_receiving' to a pandas.DataFrame.
        Tables that are missing or have no data rows are omitted.
    """
    # BeautifulSoup is needed to re-parse the commented-out tables; it was
    # previously used here without being imported.
    from bs4 import BeautifulSoup, Comment

    tables = {}

    # 1. Basic offense table
    basic_table = soup.find('table', {'id': 'player_offense'})
    if basic_table:
        tbody = basic_table.find('tbody')
        basic_data = _extract_player_rows(tbody.find_all('tr'), game_id) if tbody else []

        if basic_data:
            columns = ['game_id', 'Player', 'Tm', 'Pass_Cmp', 'Pass_Att', 'Pass_Yds', 'Pass_TD', 'Pass_Int',
                       'Pass_Sk', 'Pass_Sk_Yds', 'Pass_Lng', 'Pass_Rate', 'Rush_Att', 'Rush_Yds',
                       'Rush_TD', 'Rush_Lng', 'Rec_Tgt', 'Rec_Rec', 'Rec_Yds', 'Rec_TD', 'Rec_Lng',
                       'Fmb', 'FL']
            # The first data row decides the width: pad with generic names if
            # it is wider than the known columns, truncate if it is narrower.
            width = len(basic_data[0])
            while len(columns) < width:
                columns.append(f'Col_{len(columns)}')

            tables['basic_offense'] = pd.DataFrame(basic_data, columns=columns[:width])

    # 2-4. Advanced tables from comments
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    advanced_sections = {
        'advanced_passing': 'Advanced Passing',
        'advanced_rushing': 'Advanced Rushing',
        'advanced_receiving': 'Advanced Receiving'
    }

    for table_name, section_text in advanced_sections.items():
        needle = section_text.lower()
        for comment in comments:
            if needle not in comment.lower():
                continue

            comment_soup = BeautifulSoup(comment, 'html.parser')
            adv_table = comment_soup.find('table')

            if adv_table:
                headers = ['game_id']  # Start with game_id
                header_row = adv_table.find('thead') or adv_table.find('tr')
                if header_row:
                    headers.extend(
                        cell.get_text().strip() for cell in header_row.find_all(['th', 'td'])
                    )

                # The first <tr> is skipped as the header row
                data_rows = _extract_player_rows(adv_table.find_all('tr')[1:], game_id)

                if data_rows:
                    width = len(data_rows[0])
                    while len(headers) < width:
                        headers.append(f'Col_{len(headers)}')

                    tables[table_name] = pd.DataFrame(data_rows, columns=headers[:width])

            # Only the first matching comment is considered for each section,
            # whether or not it contained a usable table.
            break

    return tables

def log_error(message, log_file="scraper_errors.log"):
    """Append ``message`` to ``log_file`` as one ``[YYYY-MM-DD HH:MM:SS] message`` line.

    Args:
        message: Text to record.
        log_file: Path of the log file; created if it does not exist.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Explicit UTF-8: messages can embed exception text or team names with
    # non-ASCII characters, which a platform-default codec may fail to encode.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f"[{timestamp}] {message}\n")

def format_time_remaining(seconds):
    """Render a duration in seconds as "<h>h <m>m", or just "<m>m" under an hour.

    Leftover seconds are dropped (not rounded), so anything below one minute
    is reported as "0m".
    """
    whole_hours, leftover = divmod(seconds, 3600)
    hours = int(whole_hours)
    minutes = int(leftover // 60)
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"

def load_nfl_facts():
    """Load the list of NFL facts from ``facts.txt`` in the working directory.

    Each non-blank line becomes one fact (whitespace-stripped); lines that
    consist only of digits are dropped.

    Returns:
        A non-empty list of fact strings. A single built-in fact is returned
        when the file is missing or contains no usable lines, so callers can
        always pick a random element.
    """
    fallback = ["Did you know? The NFL was founded in 1920!"]  # Fallback fact

    try:
        with open('facts.txt', 'r') as f:
            content = f.read()
    except FileNotFoundError:
        return fallback

    # Split by lines and filter out empty and digit-only lines
    facts = [
        stripped
        for stripped in (line.strip() for line in content.split('\n'))
        if stripped and not stripped.isdigit()
    ]

    # An empty list would make random.choice() raise IndexError in the caller
    return facts or fallback

def _store_game_tables(game_tables, master_tables, game, game_id, url):
    """Append one game's non-empty tables to master_tables, or log that none were found."""
    total_players = sum(len(df) for df in game_tables.values() if not df.empty)

    if total_players == 0:
        # No data found - log as potential date/parsing issue
        error_msg = f"NO DATA FOUND - {game_id} | {url} | Teams: {game['away_team']} @ {game['home_team']} | Date: {game.get('game_date', 'Unknown')}"
        log_error(error_msg)
        print(f"  WARNING: No player data found - logged to errors")
        return

    # Accumulate results
    table_counts = {}
    for table_name, df in game_tables.items():
        if not df.empty:
            master_tables[table_name].append(df)
            table_counts[table_name] = len(df)
        else:
            table_counts[table_name] = 0

    # Display table counts - more concise
    total_records = sum(table_counts.values())
    print(f"  Data: {total_records} total records ({', '.join([f'{k}: {v}' for k, v in table_counts.items()])})")


def batch_scrape_season():
    """
    Scrape every game listed in nfl_2024_schedule.csv and save season-wide CSVs.

    WARNING: This will scrape ALL 272 games from 2024 season!
    Estimated time: 2-3 hours with 15-45 second delays

    Steps performed:
      1. Delete scraper_errors.log, any ``clean_game_*`` files in the working
         directory, and the whole ``nfl_2024_data`` directory from earlier runs.
      2. Read the schedule CSV and visit the games in random order.
      3. For each game fetch ``pfr_url``; on HTTP 404 retry once with a URL
         built from the game date and ``pfr_away_code``.
      4. Wait a random 15-45 seconds between games.
      5. Concatenate the per-game tables and write them to
         ``nfl_2024_data/`` plus a timestamped backup sub-directory.

    A game whose page loads but yields no rows is logged as "NO DATA FOUND"
    and still counted as a successful scrape.

    Returns:
        dict mapping table name ('basic_offense', 'advanced_passing',
        'advanced_rushing', 'advanced_receiving') to the combined
        pandas.DataFrame; tables with no collected data are omitted.
    """
    # Imported here because no visible cell of this notebook imports them,
    # although the code below has always relied on both names.
    import requests
    from bs4 import BeautifulSoup

    # Without a timeout a stalled connection would block the run indefinitely.
    request_timeout = 30  # seconds

    print("BATCH NFL SCRAPER STARTING")
    print("This will take 2-3 hours to complete all 272 games")

    output_dir = "nfl_2024_data"

    # Clean up previous runs
    files_to_remove = ["scraper_errors.log"] + [f for f in os.listdir('.') if f.startswith('clean_game_')]
    for file in files_to_remove:
        if os.path.exists(file):
            os.remove(file)

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        print("Cleaned up previous run data")

    # Load NFL facts for entertainment
    nfl_facts = load_nfl_facts()

    # Create output directory
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")

    # Initialize error log
    log_error("=== BATCH SCRAPING SESSION STARTED ===")

    start_time = datetime.now()
    print(f"\nSCRAPING STARTED at {start_time.strftime('%H:%M:%S')}")

    # Load schedule
    schedule_df = pd.read_csv('nfl_2024_schedule.csv')
    total_games = len(schedule_df)
    print(f"Loaded {total_games} games to scrape")

    # Calculate time estimates
    avg_delay = (15 + 45) / 2  # 30 seconds average
    estimated_total_seconds = total_games * avg_delay
    estimated_hours = estimated_total_seconds / 3600
    estimated_end = start_time + timedelta(seconds=estimated_total_seconds)

    print(f"Estimated completion: {estimated_end.strftime('%H:%M:%S')} ({estimated_hours:.1f} hours)")

    # Randomize order; the index is reset so `i` below counts 0..n-1
    games_to_scrape = schedule_df.sample(frac=1).reset_index(drop=True)
    print("Shuffled games randomly")

    # Per-table lists of per-game DataFrames, concatenated at the end
    master_tables = {
        'basic_offense': [],
        'advanced_passing': [],
        'advanced_rushing': [],
        'advanced_receiving': []
    }

    successful_scrapes = 0
    failed_scrapes = 0

    for i, game in games_to_scrape.iterrows():
        game_id = game['game_id']
        url = game['pfr_url']

        # Progress calculation
        progress_pct = (i / total_games) * 100
        games_remaining = total_games - i - 1

        print(f"\n[{i+1}/{total_games}] ({progress_pct:.1f}%) Scraping {game_id}")
        print(f"Game: {game['away_team']} @ {game['home_team']}")
        print(f"URL: {url}")

        try:
            # Make request
            response = requests.get(url, timeout=request_timeout)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                game_tables = scrape_game_tables_with_id(soup, game_id, url)
                _store_game_tables(game_tables, master_tables, game, game_id, url)

                successful_scrapes += 1
                success_rate = (successful_scrapes / (successful_scrapes + failed_scrapes)) * 100
                print(f"  SUCCESS ({successful_scrapes}/{successful_scrapes+failed_scrapes}) {success_rate:.1f}%")

            elif response.status_code == 404:
                # Try alternate URL with other team's code
                print(f"  404 on primary URL, trying alternate...")

                # Create alternate URL with away team code - handle both string and datetime
                if isinstance(game['game_date'], str):
                    date_str = game['game_date'].replace('-', '')  # Convert 2024-12-08 to 20241208
                else:
                    date_str = game['game_date'].strftime('%Y%m%d') if pd.notnull(game.get('game_date')) else 'UNKNOWN'

                alt_url = f"https://www.pro-football-reference.com/boxscores/{date_str}0{game['pfr_away_code']}.htm"
                print(f"  Alt URL: {alt_url}")

                alt_response = requests.get(alt_url, timeout=request_timeout)

                if alt_response.status_code == 200:
                    print(f"  SUCCESS with alternate URL")
                    soup = BeautifulSoup(alt_response.content, 'html.parser')
                    game_tables = scrape_game_tables_with_id(soup, game_id, alt_url)
                    _store_game_tables(game_tables, master_tables, game, game_id, alt_url)

                    successful_scrapes += 1
                    success_rate = (successful_scrapes / (successful_scrapes + failed_scrapes)) * 100
                    print(f"  SUCCESS ({successful_scrapes}/{successful_scrapes+failed_scrapes}) {success_rate:.1f}%")
                else:
                    # Both URLs failed - log error
                    error_msg = f"404 BOTH URLs - {game_id} | Primary: {url} | Alternate: {alt_url} | Teams: {game['away_team']} @ {game['home_team']} | Date: {game.get('game_date', 'Unknown')}"
                    log_error(error_msg)
                    print(f"  ERROR: Both URLs 404'd - logged to errors")
                    failed_scrapes += 1

            else:
                # Other HTTP error
                error_msg = f"HTTP {response.status_code} - {game_id} | {url}"
                log_error(error_msg)
                print(f"  ERROR: HTTP {response.status_code}")
                failed_scrapes += 1

        except Exception as e:
            # Deliberately broad: one bad game (network error, timeout,
            # parsing failure) must not abort a multi-hour run.
            error_msg = f"EXCEPTION - {game_id} | {url} | Error: {str(e)}"
            log_error(error_msg)
            print(f"  ERROR: {e}")
            failed_scrapes += 1

        # Calculate time remaining. Elapsed time is measured here, after this
        # game's request, so it really covers the i + 1 games it is divided by.
        if i > 0:
            elapsed = datetime.now() - start_time
            avg_time_per_game = elapsed.total_seconds() / (i + 1)
            estimated_remaining_seconds = games_remaining * avg_time_per_game
            estimated_completion = datetime.now() + timedelta(seconds=estimated_remaining_seconds)

            print(f"  ETA: {estimated_completion.strftime('%H:%M:%S')} ({format_time_remaining(estimated_remaining_seconds)} remaining)")

        # Random delay between requests with live countdown and random NFL fact
        if i < len(games_to_scrape) - 1:  # Don't delay after last game
            delay = random.randint(15, 45)

            # Show random NFL fact with subtle formatting
            random_fact = random.choice(nfl_facts)
            print(f"\nNFL FACT: {random_fact}\n")

            for countdown in range(delay, 0, -1):
                print(f"\r  Waiting {countdown} seconds...", end="", flush=True)
                time.sleep(1)
            print()  # New line after countdown

    # Combine and save all DataFrames
    total_time = datetime.now() - start_time
    total_time_text = str(total_time).split('.')[0]  # drop microseconds
    print(f"\n=== COMBINING RESULTS ===")
    print(f"Total scraping time: {total_time_text}")

    final_tables = {}

    for table_name, df_list in master_tables.items():
        if df_list:
            combined_df = pd.concat(df_list, ignore_index=True)
            final_tables[table_name] = combined_df

            # Save to protected subdirectory
            filename = os.path.join(output_dir, f"2024_season_{table_name}.csv")
            combined_df.to_csv(filename, index=False)
            print(f"SAVED: {table_name}: {len(combined_df)} records -> {filename}")
        else:
            print(f"ERROR: {table_name}: No data collected")

    # Save backup copies with timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_dir = os.path.join(output_dir, f"backup_{timestamp}")
    os.makedirs(backup_dir, exist_ok=True)

    for table_name, df in final_tables.items():
        backup_filename = os.path.join(backup_dir, f"2024_season_{table_name}_backup.csv")
        df.to_csv(backup_filename, index=False)

    print(f"BACKUP: All files backed up to {backup_dir}")

    # Final logging
    final_msg = f"SCRAPING COMPLETE - Success: {successful_scrapes} | Failed: {failed_scrapes} | Total time: {total_time_text}"
    log_error(final_msg)

    # Guard against an empty schedule, where no game was attempted at all
    attempted = successful_scrapes + failed_scrapes
    overall_rate = (successful_scrapes / attempted * 100) if attempted else 0.0

    print(f"\nSCRAPING COMPLETE!")
    print(f"Successful: {successful_scrapes}")
    print(f"Failed: {failed_scrapes}")
    print(f"Success rate: {overall_rate:.1f}%")
    print(f"Total time: {total_time_text}")
    print(f"Errors logged to: scraper_errors.log")
    print(f"Data saved to: {output_dir}")

    return final_tables

# DANGER ZONE: Uncomment to run full season scrape
# batch_scrape_season()

In [ ]:
# REMOVED: Debug code for parser troubleshooting
# This was used during development but no longer needed