In [7]:
import os
import sys
from httpx import AsyncClient
import re
import json
import asyncio
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
# Resolve the sibling `transfermarket` directory (relative to the current
# working directory) to an absolute path and add it to the import search path.
module_path = os.path.abspath('../transfermarket/')
sys.path.append(module_path)

In [4]:
# Shared async HTTP client: the fetch helpers defined in the following cells
# all issue their requests through `client.get(...)`.
# NOTE(review): the client is never closed in the visible cells.
client=AsyncClient()

In [5]:
# Running total (in seconds) that the next request has to wait before firing.
# It is only ever increased, never reset.
next_delay = 0

async def delayed_fetch(url:str,match_date:str,sleep_time_seconds:int = 3):
    """Sleep for a staggered delay, then GET `url` with the shared client.

    Each call adds `sleep_time_seconds` to the module-level `next_delay` and
    sleeps for the new total. Coroutines that are started together therefore
    send their requests one after another rather than all at once.

    Returns `(match_date, response)` so the caller can tell which match the
    response belongs to (e.g. to attach opponent name or match result later).

    NOTE(review): `get_all_shots` defines its own nested copy of this helper,
    and nothing in the visible cells calls this module-level version.
    """
    global next_delay
    next_delay = next_delay + sleep_time_seconds
    wait_seconds = next_delay
    await asyncio.sleep(wait_seconds)

    # one dot per request as a lightweight progress indicator
    print(".", end="")
    response = await client.get(url)
    return match_date, response

In [6]:
from httpx import Response

# Player id that also appears under the "nunez" key of the commented-out
# `player_ids` dict in the driver cell.
# NOTE(review): not referenced anywhere else in the visible cells.
nunez_player_id = "4d77b365"

async def get_match_data(player_id:str, season:str = '2022-2023')-> dict[str, dict[str,str]]:
    """Fetch a player's fbref match log for one season.

    Downloads the "summary" match-log page for `player_id` / `season`, keeps
    six columns of its first table (positions 0, 4, 5, 6, 7 and the last one)
    and returns them as a dict keyed by the 'Date' column. Every value is a
    dict of the remaining columns for that match; the last column holds the
    link (href) of the match report rather than its text.

    Rows containing any missing value are dropped, which includes rows whose
    match-report cell carries no link.

    Raises:
        httpx.HTTPStatusError: if the match-log request does not succeed.
    """
    match_summary_url = f"https://fbref.com/en/players/{player_id}/matchlogs/{season}/summary/"

    match_page = await client.get(match_summary_url)
    # Fail with a clear HTTP error (e.g. when blocked or rate limited) instead
    # of a confusing parsing error further down.
    match_page.raise_for_status()

    # extract_links="all" makes every header and body cell a (text, href) tuple.
    # Copy the column subset so the in-place writes below act on our own frame.
    matches = pd.read_html(match_page.content ,extract_links="all")[0].iloc[:, [0,4,5,6,7,-1]].copy()
    # the header has two rows; keep the text of the second one as column name
    matches.columns = [(col[1][0]) for col in matches.columns] #type:ignore

    # Keep only the text for everything but the match report ...
    # (column-wise Series.map instead of the deprecated DataFrame.applymap)
    matches.iloc[:, :-1] = matches.iloc[:, :-1].apply(
        lambda column: column.map(lambda cell: cell[0])
    )
    # ... and only the link for the match report itself.
    matches.iloc[:, -1] = matches.iloc[:, -1].map(lambda cell: cell[-1])

    match_dates:dict[str, dict] = matches.dropna().set_index('Date').to_dict('index') #type:ignore
    return match_dates

async def get_all_shots(match_dates:dict[str, dict[str,str]]):
    """Download the match-report page of every match in `match_dates`.

    `match_dates` maps a match date to that match's details; the value under
    'Match Report' is appended to the fbref host to build the request URL.

    All requests are scheduled at once, but each one sleeps 3 seconds longer
    than the one before it, so they reach the server one after another.

    Returns the gathered `(match_date, response)` pairs, in the iteration
    order of `match_dates`.
    """
    cumulative_delay = 0

    async def staggered_get(url:str,match_date:str,sleep_time_seconds:int = 3)->tuple[str,Response]:
        nonlocal cumulative_delay
        # every coroutine starts its timer at the same moment, so each one
        # has to wait for the sum of all delays handed out before it
        cumulative_delay = cumulative_delay + sleep_time_seconds
        await asyncio.sleep(cumulative_delay)

        # one dot per request as a lightweight progress indicator
        print(".", end="")
        page = await client.get(url)
        # the date travels with the response so match metadata
        # (opponent name, result, ...) can be joined back on later
        return match_date, page

    pending = [
        staggered_get(f"https://fbref.com/{details['Match Report']}", date)
        for date, details in match_dates.items()
    ]
    return await asyncio.gather(*pending)

def process_shot_data(match_pages:"list[tuple[str, Response]]"):
    """Extract the shot table from each downloaded match page.

    Args:
        match_pages: iterable of `(match_date, response)` pairs, as returned
            by `get_all_shots`. The annotation is quoted so that it is not
            evaluated when the function is defined.

    Returns:
        One DataFrame with the selected shot columns of every match stacked
        on top of each other plus a 'date' column holding the match date.
        An empty DataFrame is returned when no page contains an xG table
        (including when `match_pages` is empty).
    """
    match_data = []
    for match_date, data in match_pages:
        # Pick the xG table with the most rows, to make sure we are not
        # picking a filtered version of the same table.
        xg_df = None
        longest_xg_df = 0
        for df in pd.read_html(data.content):
            if df.columns.nlevels < 2:
                # a single header row cannot be stripped of a level,
                # so this is not an xG table
                continue
            if 'xG' in df.droplevel(level=0, axis=1).columns and len(df) > longest_xg_df:
                longest_xg_df = len(df)
                xg_df = df

        if xg_df is None:
            # no xG data available for this game
            continue

        # .assign returns a new frame, so the date column is never written
        # into a slice of the parsed table.
        shot_data = (
            xg_df.iloc[:, [0, 1, 3, 5]]
            .droplevel(0, axis=1)
            .assign(date=match_date)
        )
        match_data.append(shot_data)

    if not match_data:
        # pd.concat raises on an empty list; hand back an empty frame instead
        return pd.DataFrame()
    return pd.concat(match_data)

In [8]:
# player_ids = {
#     "kane":"21a66f6a", 
#     "haaland":"1f44ac21", 
#     "firmino": "4c370d81", 
#     "salah": "e342ad68", 
#     "nunez": "4d77b365",
#     "gakpo": "1971591f",
#     "jota": "178ae8f8", 
#     "diaz":"4a1a9578"
#     }

player_ids = {"salah": "e342ad68"}

for player, player_id in player_ids.items():
    print(f"grabbing {player} data")
    shot_data = []
    match_data = []

    for season in ["2020-2021","2021-2022","2022-2023"]:
        print(season)
        matches = await get_match_data(player_id, season=season)
        match_data.append(matches)
        shots = await get_all_shots(matches)
        processed_shots = process_shot_data(shots)
        shot_data.append(processed_shots)

    all_shot_data = pd.concat(shot_data).to_csv(f"{player}_shot_data.csv")
    all_match_data = pd.concat([pd.DataFrame.from_dict(match, orient='index') for match in match_data]).to_csv(f"{player}_match_data.csv")

grabbing salah data
2020-2021
.....................................................2021-2022
..................................................................2022-2023
............................