# In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import concurrent.futures
from datetime import datetime
import time

# Browser-like User-Agent sent with every page request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Per-match columns that are summed over a team's history.
_SUM_COLS = [
    'xG', 'xGA', 'npxG', 'npxGA', 'deep', 'deep_allowed', 'scored',
    'missed', 'xpts', 'wins', 'draws', 'loses', 'pts', 'npxGD',
]

# Column order of the table returned by process_league_season.
_COL_ORDER = [
    'league', 'season', 'position', 'team', 'matches',
    'wins', 'draws', 'loses', 'scored', 'missed', 'pts',
    'xG', 'xG_diff', 'npxG', 'xGA', 'xGA_diff', 'npxGA',
    'npxGD', 'ppda_coef', 'oppda_coef',
    'deep', 'deep_allowed', 'xpts', 'xpts_diff',
]


def _parse_teams_json(script_text):
    """Extract and decode the quoted payload that follows 'teamsData'.

    Raises ValueError when the expected markers are missing or the payload
    is not valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    text = script_text.strip()
    # Anchor the search at the teamsData assignment so an unrelated "('"
    # or "')" earlier in the same script cannot shift the slice.
    anchor = text.index('teamsData')
    ind_start = text.index("('", anchor) + 2
    ind_end = text.index("')", ind_start)
    # NOTE(review): presumably the page embeds the JSON with backslash
    # escapes, which unicode_escape turns back into plain characters.
    payload = text[ind_start:ind_end].encode('utf8').decode('unicode_escape')
    return json.loads(payload)


def _fetch_teams_data(url, league, season, max_attempts=3):
    """Download *url* and return its parsed teamsData payload, or None.

    Request and parse failures are retried with exponential backoff
    (1s, 2s, ...); after the last attempt the error is printed and None is
    returned. A page without a teamsData script returns None immediately.
    """
    for attempt in range(max_attempts):
        try:
            with requests.Session() as session:
                res = session.get(url, headers=_REQUEST_HEADERS, timeout=15)
                res.raise_for_status()

                soup = BeautifulSoup(res.content, "lxml")
                scripts = soup.find_all(
                    'script',
                    string=lambda text: text and 'teamsData' in text,
                )
                if not scripts:
                    print(f"No data found for {league} {season}")
                    return None

                return _parse_teams_json(scripts[0].text)

        except (requests.RequestException, ValueError) as e:
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
            else:
                print(f"Failed to process {league} {season}: {e}")
                return None
    return None


def _ppda_ratio(entry):
    """Return entry['att'] / entry['def'], or 0 when 'def' is zero."""
    return entry['att'] / entry['def'] if entry['def'] != 0 else 0


def _build_league_table(data, league, season):
    """Aggregate per-team match histories into one table, or None if empty.

    *data* is expected to map team ids to dicts carrying 'title' and
    'history' (a list of per-match dicts holding the _SUM_COLS keys plus
    'ppda' and 'ppda_allowed').
    """
    rows = []
    for team_info in data.values():
        history = team_info['history']
        if not history:
            # A team with no recorded matches has nothing to aggregate;
            # skip it instead of failing the whole league/season.
            continue

        df = pd.DataFrame(history)
        row = df[_SUM_COLS].sum().to_dict()
        row['ppda_coef'] = df['ppda'].map(_ppda_ratio).mean()
        row['oppda_coef'] = df['ppda_allowed'].map(_ppda_ratio).mean()
        row['team'] = team_info['title']
        row['matches'] = len(df)
        rows.append(row)

    if not rows:
        return None

    # Building from plain dicts keeps numeric columns numeric and yields a
    # single 'team' column (no index/column name clash).
    full_stat = pd.DataFrame(rows)

    # Rank by points before numbering positions. The stable sort keeps the
    # payload order among teams level on points; league-specific
    # tie-break rules are not applied.
    full_stat = (
        full_stat.sort_values('pts', ascending=False, kind='stable')
        .reset_index(drop=True)
    )
    full_stat['season'] = season
    full_stat['league'] = league
    full_stat['position'] = range(1, len(full_stat) + 1)

    full_stat['xG_diff'] = full_stat['xG'] - full_stat['scored']
    full_stat['xGA_diff'] = full_stat['xGA'] - full_stat['missed']
    full_stat['xpts_diff'] = full_stat['xpts'] - full_stat['pts']

    return full_stat[_COL_ORDER]


def process_league_season(league, season, base_url):
    """Scrape one league/season page and return its aggregated team table.

    Args:
        league: League path segment appended to *base_url*.
        season: Season path segment; also stored in the 'season' column.
        base_url: URL prefix; the page fetched is
            ``{base_url}/{league}/{season}``.

    Returns:
        A DataFrame with one row per team and the columns listed in
        ``_COL_ORDER``, ordered by points (descending) with 'position'
        numbered from 1; or None when the page could not be fetched,
        contained no team data, or processing failed. Failures are
        printed, never raised.
    """
    url = f'{base_url}/{league}/{season}'
    try:
        data = _fetch_teams_data(url, league, season)
        if data is None:
            return None

        table = _build_league_table(data, league, season)
        if table is None:
            print(f"No data found for {league} {season}")
        return table

    except Exception as e:
        # Worker boundary: report and return None so one bad page cannot
        # abort the surrounding collection run.
        print(f"Comprehensive error in {league} {season}: {e}")
        return None

def comprehensive_data_collection(leagues, seasons, base_url, max_workers=10):
    """Scrape every league/season combination concurrently and combine them.

    Args:
        leagues: Iterable of league identifiers.
        seasons: Iterable of season identifiers; it is iterated once per
            league, so pass a re-iterable such as a list.
        base_url: URL prefix forwarded to process_league_season.
        max_workers: Size of the thread pool (default 10).

    Returns:
        One DataFrame concatenating every non-None result in submission
        order (leagues outer, seasons inner), or an empty DataFrame when
        nothing was collected.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front so all jobs run concurrently.
        jobs = [
            (league, season,
             executor.submit(process_league_season, league, season, base_url))
            for league in leagues
            for season in seasons
        ]

        # Read results in submission order rather than completion order so
        # the combined table has the same row order on every run.
        full_data = []
        for league, season, future in jobs:
            try:
                result = future.result()
            except Exception as e:
                print(f"Processing error for {league} {season}: {e}")
                continue
            if result is not None:
                full_data.append(result)

    if full_data:
        return pd.concat(full_data, ignore_index=True)
    return pd.DataFrame()

# Configuration
base_url = 'https://understat.com/league'
leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1', 'RFPL']

# Dynamic season generation: one entry per calendar year from 2014 through
# the current year. Combinations that yield no data are dropped by the
# collector.
current_year = datetime.now().year
seasons = [str(year) for year in range(2014, current_year + 1)]

# Collect and process data
comprehensive_football_data = comprehensive_data_collection(leagues, seasons, base_url)

if comprehensive_football_data.empty:
    # An empty frame has no 'league'/'season' columns, so the summary
    # groupby would raise KeyError, and exporting it would replace any
    # earlier CSV with an empty file.
    print("No data collected; skipping CSV export and summary")
else:
    # Optional: Save to CSV for further analysis
    comprehensive_football_data.to_csv(r'C:\Users\kevo\Desktop\Data_Analysis_Projects\epl_webscraping\comprehensive_football_stats.csv', index=False)

    # Display summary: number of team rows per league and season
    print(comprehensive_football_data.groupby(['league', 'season']).size())