## Comparison Plot : Progressive Passes V Progressive Carries

In [1]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from scipy.stats import beta
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import re
from urllib.request import urlopen
from PIL import Image

# Function to scrape data from fbref
def scrape_category(category):
    url = f"https://fbref.com/en/comps/676/{category}/European-Championship-Stats"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    category_html = None
    for comment in comments:
        if f"div_stats_{category}" in comment:
            category_html = BeautifulSoup(comment, 'html.parser')
            break

    table = category_html.select_one(f"#div_stats_{category} table")
    df = pd.read_html(str(table))[0]
    df.columns = ['_'.join(col).strip() for col in df.columns.values]
    df.columns = [col.split('_')[-1] if col.startswith('Unnamed') else col for col in df.columns]

    urls = [a['href'] for a in category_html.select("td[data-stat='player'] a")]

    output = {'df': df, 'urls': urls}
    return output

categories = ["passing", "possession", "gca"]

data = {}
for category in categories:
    output = scrape_category(category)
    data[category] = output['df']
    data[f"{category}_urls"] = output['urls']

df_list = []
# Loop through each URL in the list
for url in data['passing_urls']:
  url_parts = url.split('/')
  
  # Extract player ID (assuming alphanumeric)
  player_id = url_parts[-2]
  
  # Extract player name (assuming everything after last '/')
  player_name = url_parts[-1]
  
  # Create a dictionary with extracted data
  player_dict = {'url': url, 'playerId': player_id, 'playerName': player_name}
  
  # Append the dictionary to the list
  df_list.append(player_dict)

# Create the DataFrame from the list of dictionaries
df_passing_urls = pd.DataFrame(df_list)

df_list = []
# Loop through each URL in the list
for url in data['possession_urls']:
  url_parts = url.split('/')
  
  # Extract player ID (assuming alphanumeric)
  player_id = url_parts[-2]
  
  # Extract player name (assuming everything after last '/')
  player_name = url_parts[-1]
  
  # Create a dictionary with extracted data
  player_dict = {'url': url, 'playerId': player_id, 'playerName': player_name}
  
  # Append the dictionary to the list
  df_list.append(player_dict)

# Create the DataFrame from the list of dictionaries
df_possession_urls = pd.DataFrame(df_list)

passing = data['passing']
possession = data['possession']
passing['Total_Cmp'] = pd.to_numeric(passing['Total_Cmp'], errors='coerce') # Cmp : Completed total passes
passing['PrgP'] = pd.to_numeric(passing['PrgP'], errors='coerce') # PrgP : Progressive passes
passing['prog_pct'] = passing['PrgP'] / passing['Total_Cmp']
passing = passing[['Player', 'Squad', 'Total_Cmp', 'PrgP', 'prog_pct']]
passing.columns = ['player', 'team', 'passes', 'prog_passes', 'prog_pct']
#passing = passing.dropna().reset_index(drop=True)
passing = passing.loc[passing['player'] != 'Player'].reset_index(drop=True) # Filtering out rows where player names are not found
passing = pd.concat([passing,df_passing_urls],axis=1)
passing = passing[passing['passes'] != 0].reset_index(drop=True) # Filtering out rows for players who made 0 passes so far

possession['Carries_Carries'] = pd.to_numeric(possession['Carries_Carries'], errors='coerce')
possession['Carries_PrgC'] = pd.to_numeric(possession['Carries_PrgC'], errors='coerce') # PrgC : Progressive Carries
possession['prog_pct'] = possession['Carries_PrgC'] / possession['Carries_Carries']
possession = possession[['Player', 'Squad', 'Carries_Carries', 'Carries_PrgC', 'prog_pct']]
possession.columns = ['player', 'team', 'carries', 'prog_carries', 'prog_pct']
#possession = possession.dropna().reset_index(drop=True)
possession = possession[possession['player'] != 'Player'].reset_index(drop=True)
possession = pd.concat([possession,df_possession_urls],axis=1)
possession = possession[possession['carries'] != 0].reset_index(drop=True)

passes_filt = passing[(passing['passes'] >= 50) & (passing['prog_pct'] > 0) & (passing['prog_pct'] < 1)]
carry_filt = possession[(possession['carries'] >= 50) & (possession['prog_pct'] > 0) & (possession['prog_pct'] < 1)]

pass_alpha, pass_beta, _, _ = beta.fit(passes_filt['prog_pct'], floc=0, fscale=1)
carry_alpha, carry_beta, _, _ = beta.fit(carry_filt['prog_pct'], floc=0, fscale=1)

passing['adj_prog_pct'] = (passing['prog_passes'] + pass_alpha) / (passing['passes'] + pass_alpha + pass_beta)
possession['adj_prog_pct'] = (possession['prog_carries'] + carry_alpha) / (possession['carries'] + carry_alpha + carry_beta)

passing['adj_prog_pct'] = (passing['adj_prog_pct'] - passing['adj_prog_pct'].mean()) / passing['adj_prog_pct'].std()
possession['adj_prog_pct'] = (possession['adj_prog_pct'] - possession['adj_prog_pct'].mean()) / possession['adj_prog_pct'].std()

passes_final = passing[['player', 'team', 'url', 'adj_prog_pct']].sort_values(by='adj_prog_pct', ascending=False)
possession_final = possession[['player', 'team', 'url', 'adj_prog_pct']].sort_values(by='adj_prog_pct', ascending=False)

passes_final.columns = ['player', 'team', 'url', 'pass_metric']
possession_final.columns = ['player', 'team', 'url', 'carry_metric']

passes_final['id'] = passes_final['url'].apply(lambda x: re.search(r"(?<=players/)\w+", x).group())
passes_final['headshot_url'] = "https://fbref.com/req/202302030/images/headshots/" + passes_final['id'] + "_2022.jpg"

possession_final['id'] = possession_final['url'].apply(lambda x: re.search(r"(?<=players/)\w+", x).group())
possession_final['headshot_url'] = "https://fbref.com/req/202302030/images/headshots/" + possession_final['id'] + "_2022.jpg"

flags = pd.read_csv("flags_iso.csv")[['Alpha-2 code', 'URL']]
flags.columns = ['team', 'flag']

passes_final['team'] = passes_final['team'].str.upper().str[:2]
possession_final['team'] = possession_final['team'].str.upper().str[:2]

passes_final = passes_final.merge(flags, how='left', left_on='team', right_on='team')
passes_final['flag'] = passes_final['flag'].fillna("https://cdn.britannica.com/44/344-004-494CC2E8/Flag-England.jpg")
passes_final['pass_metric'] = passes_final['pass_metric'].round(2)

possession_final = possession_final.merge(flags, how='left', left_on='team', right_on='team')
possession_final['flag'] = possession_final['flag'].fillna("https://cdn.britannica.com/44/344-004-494CC2E8/Flag-England.jpg")
possession_final['carry_metric'] = possession_final['carry_metric'].round(2)

t10_pass = passes_final.head(10)
t10_carry = possession_final.head(10)

final_stats = pd.merge(passes_final, possession_final, on=['headshot_url', 'player', 'flag'])

passes_100 = passing[passing['passes'] >= 100]
carries_100 = possession[possession['carries'] >= 100]

final_stats = final_stats[final_stats['player'].isin(passes_100['player']) & final_stats['player'].isin(carries_100['player'])]

fig, ax = plt.subplots(figsize=(16,9))
fig.set_facecolor('black')
ax.set_facecolor('black')
ax.grid(color='white', linestyle='--', linewidth=0.5, alpha=0.7)

for index,row in final_stats.iterrows():
  if ((row['pass_metric'] >= 0.0) & (row['carry_metric'] >= 0.5)) or (row['pass_metric'] >= 1.5) or (row['carry_metric'] >= 1.5):
    ax.scatter(row['pass_metric'], row['carry_metric'],color='red')
    ax.annotate(row['player'],xy = (row['pass_metric']+0.05,row['carry_metric']+0.05),color='white',fontsize=15)
  else:
    ax.scatter(row['pass_metric'], row['carry_metric'],color='red')


#ax.tick_params(axis='x', colors='white')
#ax.tick_params(axis='y', colors='white')
ax.spines['top'].set_color('white')
ax.spines['right'].set_color('white')
ax.spines['bottom'].set_color('white')
ax.spines['left'].set_color('white')

ax.set_xlabel("Progressive Playmaking Value (Passing)",fontsize=20,color='white')
ax.set_ylabel("Progressive Playmaking Value (Carrying)",fontsize=20,color='white')
ax.set_title("Comparing Progressive Playmaking Values: Passing vs Carrying",fontsize=20,color='white')
plt.suptitle("Players With 100+ Passes and Carries In The Group Stage",fontsize=25,color='white')
fig.text(0.5, 0, "Data from FBRef | Ishdeep Chahda | @indian_citizen", ha='center', fontsize=10,color='white')
plt.savefig("comp_plot.png")

AttributeError: 'NoneType' object has no attribute 'select_one'