In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Landing page of the team listing; further query parameters are appended to it.
BASE_URL = "https://sofifa.com/teams?hl=en-US"

# Browser-like request headers sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Extra pages to scrape, keyed by FIFA version and then by roster date label.
# Each value is the complete URL of the team listing to fetch for that date.
EXTRA_TEAM_REQUESTS = {
    "FIFA 17": {
        # All Premier League teams
        "Aug 3, 2017": "https://sofifa.com/teams?type=all&lg%5B0%5D=13&r=170086&set=true&col=oa&sort=desc",
        # Championship (filter: Newcastle, Brighton, Huddersfield)
        "Aug 10, 2017": "https://sofifa.com/teams?type=all&lg%5B0%5D=14&r=170087&set=true",
    },
    "FIFA 19": {
        # Narrowed to Sheffield United, Aston Villa and Norwich City by TEAM_FILTERS
        "Jul 18, 2019": "https://sofifa.com/teams?type=all&lg%5B0%5D=14&r=190072&set=true",
    },
    "FIFA 21": {
        # Championship (filter: Brentford, Nottingham Forest)
        "Aug 6, 2021": "https://sofifa.com/teams?type=all&lg%5B0%5D=14&r=210058&set=true",
    },
    "FIFA 22": {
        # Championship (filter: Nottingham Forest)
        "Aug 18, 2022": "https://sofifa.com/teams?type=all&lg%5B0%5D=14&r=220069&set=true",
    },
}

# Team names to keep for each (version, roster date) pair listed in
# EXTRA_TEAM_REQUESTS. ``None`` means every scraped team is kept.
TEAM_FILTERS = {
    "FIFA 17": {
        "Aug 3, 2017": None,
        "Aug 10, 2017": [
            "Newcastle United",
            "Brighton & Hove Albion",
            "Huddersfield Town",
        ],
    },
    "FIFA 19": {
        "Jul 18, 2019": [
            "Sheffield United",
            "Aston Villa",
            "Norwich City",
        ],
    },
    "FIFA 21": {
        "Aug 6, 2021": [
            "Brentford",
            "Nottingham Forest",
        ],
    },
    "FIFA 22": {
        "Aug 18, 2022": [
            "Nottingham Forest",
        ],
    },
}

def _fetch_select_options(url, select_name, timeout):
    """Fetch *url* and return ``{option text: option value}`` for one ``<select>``.

    Returns ``None`` when the request fails or the status code is not 200, and
    an empty dict when the page has no ``<select name=select_name>`` element.
    Options without a ``value`` attribute are skipped.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    select = soup.select_one(f'select[name="{select_name}"]')
    if not select:
        return {}

    options = {}
    for option in select.find_all('option'):
        value = option.get('value')
        if value is not None:
            options[option.text.strip()] = value
    return options


def get_versions_and_roster_dates():
    """Retrieve available FIFA versions (FIFA 18 - FC 25) and their roster dates.

    Returns:
        A ``(versions, version_rosters)`` tuple where ``versions`` maps a
        version label (e.g. ``"FIFA 23"``) to the ``value`` of its option in
        the ``version`` select, and ``version_rosters`` maps each version
        label to a ``{roster label: option value}`` dict read from the
        ``roster`` select of that version's page. A version whose page cannot
        be fetched gets an empty roster dict.
        ``(None, None)`` is returned when the landing page cannot be fetched.
    """
    # Built once instead of on every loop iteration.
    valid_versions = {
        "FIFA 18", "FIFA 19", "FIFA 20", "FIFA 21",
        "FIFA 22", "FIFA 23", "FC 24", "FC 25",
    }
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    all_versions = _fetch_select_options(BASE_URL, "version", request_timeout)
    if all_versions is None:
        return None, None

    # Keep only FIFA 18 to FC 25, preserving the order of the page.
    versions = {
        text: value for text, value in all_versions.items() if text in valid_versions
    }

    print(f"Filtered FIFA versions: {versions}")

    # Extract roster dates for each version
    version_rosters = {}
    for version_name, version_value in versions.items():
        # NOTE(review): the captured output shows the option values are relative
        # URLs (e.g. '/teams?hl=en-US&r=250026&set=true') rather than bare ids;
        # the URL shape is kept as-is here - confirm the site parses it as intended.
        roster_dates = _fetch_select_options(
            f"{BASE_URL}&version={version_value}", "roster", request_timeout
        )
        # A failed page is reported by the helper; record no dates for it.
        version_rosters[version_name] = roster_dates if roster_dates is not None else {}

    return versions, version_rosters

def scrape_premier_league_teams(version_name, version_value, roster_text, roster_value):
    """Scrape Premier League teams for a given FIFA version and roster date.

    Args:
        version_name: Version label stored in the 'FIFA Version' field.
        version_value: Value of the version option, placed in the ``version``
            query parameter.
        roster_text: Human-readable roster date stored in the 'Roster Date' field.
        roster_value: Value of the roster option, placed in the ``roster``
            query parameter.

    Returns:
        A list with one dict per table row, or ``None`` when the page could
        not be retrieved. Fetching and row parsing are delegated to
        ``scrape_additional_teams``, which handles any team listing URL.
    """
    # League ID 13 = Premier League
    url = f"{BASE_URL}&version={version_value}&roster={roster_value}&lg[]=13"
    return scrape_additional_teams(version_name, roster_text, url)

def _cell_text(row, selector):
    """Return the stripped text of the first *selector* match in *row*, or "N/A"."""
    element = row.select_one(selector)
    return element.text.strip() if element else "N/A"


def _parse_team_row(row, version_name, roster_text):
    """Build one output record (a dict) from a single team table row."""
    link = row.select_one('td.s20 > a')
    team_name = link.text.strip() if link else "N/A"
    # A link without an href is treated like a missing link.
    team_url = (link.get('href') or "N/A") if link else "N/A"

    # The id is the third '/'-separated segment of the href; fall back to
    # "N/A" when the href is too short to contain one.
    url_parts = team_url.split('/') if team_url != "N/A" else []
    team_id = url_parts[2] if len(url_parts) > 2 else "N/A"

    return {
        'FIFA Version': version_name,
        'Roster Date': roster_text,  # Assign human-readable date
        'Team ID': team_id,
        'Team Name': team_name,
        'Team URL': f"https://sofifa.com{team_url}" if team_url != "N/A" else "N/A",
        'League': _cell_text(row, 'td.s20 a.sub'),
        'Overall Rating': _cell_text(row, 'td[data-col="oa"]'),
        'Attack': _cell_text(row, 'td[data-col="at"]'),
        'Midfield': _cell_text(row, 'td[data-col="md"]'),
        'Defense': _cell_text(row, 'td[data-col="df"]'),
    }


def scrape_additional_teams(version_name, roster_text, url, filter_teams=None):
    """Scrape the team table at *url* for a given FIFA version and roster date.

    Args:
        version_name: Version label stored in the 'FIFA Version' field.
        roster_text: Human-readable roster date stored in the 'Roster Date' field.
        url: Full URL of the team listing page to fetch.
        filter_teams: Optional iterable of team names. When given, only rows
            whose 'Team Name' is in it are returned; ``None`` keeps every row.

    Returns:
        A list with one dict per ``tbody > tr`` row (missing cells are
        reported as ``"N/A"``), or ``None`` when the request fails or the
        status code is not 200.
    """
    try:
        # The timeout keeps a stalled connection from blocking the whole run.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    teams_data = []

    for row in soup.select('tbody > tr'):
        try:
            teams_data.append(_parse_team_row(row, version_name, roster_text))
        except (AttributeError, IndexError, KeyError, TypeError) as e:
            # Best effort: one malformed row must not abort the whole page.
            print(f"Error processing team: {e}")

    if filter_teams is not None:
        allowed = set(filter_teams)
        teams_data = [team for team in teams_data if team['Team Name'] in allowed]

    return teams_data

def filter_teams(df):
    """Keep only the rows selected by ``TEAM_FILTERS``.

    For every (FIFA version, roster date) pair in ``TEAM_FILTERS`` the rows of
    *df* with that version and date are selected; when the pair lists team
    names only those teams are kept, and ``None`` keeps all of them.

    Args:
        df: DataFrame with 'FIFA Version', 'Roster Date' and 'Team Name'
            columns. 'Roster Date' is compared against parsed timestamps, so
            it should already hold datetimes.

    Returns:
        A new DataFrame with the selected rows concatenated in
        ``TEAM_FILTERS`` order. An empty *df* is returned unchanged.
    """
    # An empty frame (e.g. every extra request failed) may not even have the
    # columns used below, so there is nothing to filter.
    if df.empty:
        return df

    filtered_data = []

    for version_name, version_filters in TEAM_FILTERS.items():
        for roster_text, allowed_teams in version_filters.items():
            # Convert text date to datetime for filtering
            roster_date = pd.to_datetime(roster_text, format='%b %d, %Y', errors='coerce')

            # Select rows matching the FIFA version and Roster Date
            subset = df[(df["FIFA Version"] == version_name) & (df["Roster Date"] == roster_date)]

            if allowed_teams is not None:  # Keep only specific teams
                subset = subset[subset["Team Name"].isin(allowed_teams)]
            filtered_data.append(subset)

    return pd.concat(filtered_data, ignore_index=True) if filtered_data else df


def _version_rank(version_name):
    """Return the trailing number of a version label ("FC 25" -> 25), or -1."""
    suffix = str(version_name).rpartition(' ')[2]
    return int(suffix) if suffix.isdigit() else -1


def main():
    """Scrape all configured team ratings and write them to 'fifa_team_rating.csv'.

    Premier League teams are scraped for every roster date of every retrieved
    version, the pages in ``EXTRA_TEAM_REQUESTS`` are scraped and narrowed
    with ``filter_teams``, and the combined rows are sorted (latest version,
    latest roster date, highest overall rating first) before being saved.
    Nothing is written when no version or no team data could be retrieved.
    """
    versions, version_rosters = get_versions_and_roster_dates()

    if not versions or not version_rosters:
        print("Failed to retrieve FIFA versions or roster dates.")
        return

    all_premier_league_data = []
    all_additional_team_data = []

    # Iterate through each FIFA version
    for version_name, version_value in versions.items():
        print(f"Scraping {version_name}...")

        # Get roster dates for this specific version
        roster_dates = version_rosters.get(version_name, {})

        print(f"Found {len(roster_dates)} roster dates for {version_name}")

        # Scrape data for each roster date
        for roster_text, roster_value in roster_dates.items():
            print(f"Scraping {version_name} - {roster_text} (Premier League)")
            team_data = scrape_premier_league_teams(version_name, version_value, roster_text, roster_value)

            if team_data:
                all_premier_league_data.extend(team_data)

            time.sleep(2)  # Avoid overwhelming the server

    # Scrape extra teams based on specific requests
    for version_name, details in EXTRA_TEAM_REQUESTS.items():
        for roster_text, url in details.items():
            print(f"Scraping {version_name} - {roster_text}")

            team_data = scrape_additional_teams(version_name, roster_text, url)
            if team_data:
                all_additional_team_data.extend(team_data)

            time.sleep(2)  # Avoid overwhelming the server

    # Convert to DataFrames. A frame built from an empty list has no columns,
    # so each one is only processed when it actually holds rows.
    frames = []

    premier_league_df = pd.DataFrame(all_premier_league_data)
    if not premier_league_df.empty:
        premier_league_df['Roster Date'] = pd.to_datetime(
            premier_league_df['Roster Date'], format='%b %d, %Y', errors='coerce'
        )
        frames.append(premier_league_df)

    additional_team_df = pd.DataFrame(all_additional_team_data)
    if not additional_team_df.empty:
        additional_team_df['Roster Date'] = pd.to_datetime(
            additional_team_df['Roster Date'], format='%b %d, %Y', errors='coerce'
        )
        # Keep only the teams requested for each extra roster date
        frames.append(filter_teams(additional_team_df))

    if not frames:
        print("No team data was scraped; nothing to save.")
        return

    # Combine both DataFrames
    df = pd.concat(frames, axis=0, ignore_index=True)

    # Sorting: Latest FIFA -> Latest Date -> Highest Overall Rating.
    # Version labels and ratings are text, so they are ranked through
    # temporary numeric columns: as plain strings "FC 25" would sort below
    # "FIFA 17". The helper columns are dropped again before saving.
    df['_version_rank'] = df['FIFA Version'].map(_version_rank)
    df['_overall_numeric'] = pd.to_numeric(df['Overall Rating'], errors='coerce')
    df = df.sort_values(
        by=['_version_rank', 'Roster Date', '_overall_numeric'],
        ascending=[False, False, False],
    ).drop(columns=['_version_rank', '_overall_numeric'])

    # Save to CSV
    df.to_csv('fifa_team_rating.csv', index=False)
    print("Data saved to 'fifa_team_rating.csv'")


if __name__ == "__main__":
    main()


Filtered FIFA versions: {'FC 25': '/teams?hl=en-US&r=250026&set=true', 'FC 24': '/teams?hl=en-US&r=240050&set=true', 'FIFA 23': '/teams?hl=en-US&r=230054&set=true', 'FIFA 22': '/teams?hl=en-US&r=220069&set=true', 'FIFA 21': '/teams?hl=en-US&r=210064&set=true', 'FIFA 20': '/teams?hl=en-US&r=200061&set=true', 'FIFA 19': '/teams?hl=en-US&r=190075&set=true', 'FIFA 18': '/teams?hl=en-US&r=180084&set=true'}
Scraping FC 25...
Found 25 roster dates for FC 25
Scraping FC 25 - Mar 12, 2025 (Premier League)
Scraping FC 25 - Mar 5, 2025 (Premier League)
Scraping FC 25 - Feb 24, 2025 (Premier League)
Scraping FC 25 - Feb 19, 2025 (Premier League)
Scraping FC 25 - Feb 14, 2025 (Premier League)
Scraping FC 25 - Feb 5, 2025 (Premier League)
Scraping FC 25 - Feb 4, 2025 (Premier League)
Scraping FC 25 - Jan 23, 2025 (Premier League)
Scraping FC 25 - Jan 16, 2025 (Premier League)
Scraping FC 25 - Jan 9, 2025 (Premier League)
Scraping FC 25 - Dec 18, 2024 (Premier League)
Scraping FC 25 - Dec 11, 2024 (P