# Scraping and checking data

In [371]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats

In [3]:
urls={'Premier League': 'https://fbref.com/en/comps/9/history/Premier-League-Seasons',
    'La Liga': 'https://fbref.com/en/comps/12/history/La-Liga-Seasons',
    'Serie A': 'https://fbref.com/en/comps/11/history/Serie-A-Seasons',
    'Bundesliga': 'https://fbref.com/en/comps/20/history/Bundesliga-Seasons',
    'Ligue 1': 'https://fbref.com/en/comps/13/history/Ligue-1-Seasons'}

#Get URLS from each season via history page
def get_urls(history_url, season_number=15):
    page=requests.get(history_url)
    soup=BeautifulSoup(page.content,'html.parser')
    table=soup.find('table')
    urls=[]
    
    for row in table.find('tbody').find_all('tr'):
        first_cell=row.find('th')
        if first_cell and first_cell.find('a'):
            link=first_cell.find('a')['href']
            full_link='https://fbref.com'+link
            urls.append(full_link)
    return urls[:season_number]

#Initialise list and year
all_data=[]
starting_year=2025

#Scraoe data 
for league, history_url in urls.items():
    print(f"\n Webscraping {league}")
    urls=get_urls(history_url)
    year=starting_year
    for season_url in urls:
        try:
            tables=pd.read_html(season_url)
            league_table=tables[0]
            time.sleep(random.randint(4, 8))
            #Organise table
            league_table2=league_table[['Rk', 'Squad', 'Pts', 'MP', 'Pts/MP',
                                        'W', 'D', 'L']]
            league_table2.columns=['Position', 'Team', 'Points', 
                                    'Matches Played', 'Points per Match',
                                   'Wins', 'Draws', 'Losses']
            league_table2=league_table2.dropna()
            #Create league table and season year
            league_table2['League']=league
            league_table2['Season']=year
            #Format columns
            league_table2=league_table2[['League', 'Season', 'Position',
                                         'Team', 'Points', 'Matches Played', 
                                         'Points per Match', 'Wins', 'Draws', 'Losses']]
            all_data.append(league_table2)

            print(f"Scraped {league} {year}")
            #Count down the seasons
            year-=1 
        except Exception as e:
            print(f"Error in {league} {year}: {e}")

#Put into one dataframe
combined_df=pd.concat(all_data, ignore_index=True)

#Create CSV
combined_df.to_csv('data.csv', index=False)

print("\n Data saved")


 webscraping Premier League
Scraped Premier League 2025
Scraped Premier League 2024
Scraped Premier League 2023
Scraped Premier League 2022
Scraped Premier League 2021
Scraped Premier League 2020
Scraped Premier League 2019
Scraped Premier League 2018
Scraped Premier League 2017
Scraped Premier League 2016
Scraped Premier League 2015
Scraped Premier League 2014
Scraped Premier League 2013
Scraped Premier League 2012
Scraped Premier League 2011

 webscraping La Liga
Scraped La Liga 2025
Scraped La Liga 2024
Scraped La Liga 2023
Scraped La Liga 2022
Scraped La Liga 2021
Scraped La Liga 2020
Scraped La Liga 2019
Scraped La Liga 2018
Scraped La Liga 2017
Scraped La Liga 2016
Scraped La Liga 2015
Scraped La Liga 2014
Scraped La Liga 2013
Scraped La Liga 2012
Scraped La Liga 2011

 webscraping Serie A
Scraped Serie A 2025
Scraped Serie A 2024
Scraped Serie A 2023
Scraped Serie A 2022
Scraped Serie A 2021
Scraped Serie A 2020
Scraped Serie A 2019
Scraped Serie A 2018
Scraped Serie A 2017
Scr

In [513]:
df=pd.read_csv('data.csv')
df.head(1500)

Unnamed: 0,League,Season,Position,Team,Points,Matches Played,Points per Match,Wins,Draws,Losses
0,Premier League,2025,1,Liverpool,79,33,2.39,24,7,2
1,Premier League,2025,2,Arsenal,67,34,1.97,18,13,3
2,Premier League,2025,3,Newcastle Utd,62,34,1.82,19,5,10
3,Premier League,2025,4,Manchester City,61,34,1.79,18,7,9
4,Premier League,2025,5,Chelsea,60,34,1.76,17,9,8
...,...,...,...,...,...,...,...,...,...,...
1461,Ligue 1,2011,16,Brest,46,38,1.21,11,13,14
1462,Ligue 1,2011,17,Nice,46,38,1.21,11,13,14
1463,Ligue 1,2011,18,Monaco,44,38,1.16,9,17,12
1464,Ligue 1,2011,19,Lens,35,38,0.92,7,14,17


In [338]:
#Check if any data cleaning is required
print(df.isnull().sum())
print(df.duplicated().sum())
#Check types are all numeric where they need to be
print(df.dtypes)

League              0
Season              0
Position            0
Team                0
Points              0
Matches Played      0
Points per Match    0
Wins                0
Draws               0
Losses              0
dtype: int64
0
League               object
Season                int64
Position              int64
Team                 object
Points                int64
Matches Played        int64
Points per Match    float64
Wins                  int64
Draws                 int64
Losses                int64
dtype: object


# Competitivity in Europes 5 biggest football leagues

## Part 1: which league title is the most competitive?

### Background
Competitiveness is a crucial part of football and is the second most important metric by which quality of a league is assessed, after team's ability. Among all forms of competition within a league, the title race holds the most significance. It makes headlines, fuels fairytale stories (such as Leicester City's Premier League win), and legitimises the existence of club football.

Previous studies investigating the competitiveness of the major European league titles have contradicted popular opinion. For example, a study by the well-respected sports newspaper *The Athletic* found Ligue 1 to be the most competitive of the major European leagues. This study flipped popular opinion on its head and demonstrates the necessity of empiricism when evaluating something as emotionally charged as sport.
### Aim
Following in the footsteps of the study by *The Athletic* this blog aims to quantitively measure the competitiveness of each major European league title. 
### Data
The data analysed covers the last 15 sesasons for each league. This time span was selected as it defines an era in the competitiveness of football. In 2011 Serie A introduced financial fairplay rules, and the other leagues shortly followed suit, teams could no longer buy their way out of competition.

### Competetiveness index
To assess the competition for a given league's title we will use an index made up of 3 components:

*0.25 x (number of title contenders) + 0.25 x (1-average winning margin) + 0.5 x (1-titles won by dominant team)* 
#### Definitions
Number of title contenders: The average number of teams within 3 points of the league winner in a given season

Average winning margin: The margin of points between 1st and second place at the end of a season

Titles won by dominant team: The proportion of titles won by the most successful team during the period
#### Weigthing
While number of title contenders and average winning margin are both important measures of competitiveness they are highly correlated, so they are weighted such that they hold the same weight together as "titles won by dominant team".
#### Normalisation and scaling
All variables are normalised such that the best performing league in each metric gets a score of 1, the worst gets 0, and others are scaled proportionally. 

The average winning margin is adjusted based on the number of matches played in a season, as more matches and therefore points will inflate a win margin proportionally.
#### Limitations
It is also important to acknowledge the limitations of this index. This index does not tell us how probable it is a team from the middle or bottom of the table can win the league, but rather the uncertainty with which you can predict one given title winner. For a dominant team this index captures on average whether they face competition, whether they win comfortably and how often they win.

The results for the index are below:

In [493]:
df=pd.read_csv('data.csv')
#Calculate (38/matches played) so the winning margin can be standardised
games_ratio=df.groupby('League')['Matches Played'].mean().reset_index()
games_ratio['Match Ratio']=38/games_ratio['Matches Played']

#Initialise lists
margins=[]
contenders=[]
dominant_teams=[]

#Calculate metrics
grouped=df.groupby(['League', 'Season'])

for (league, season), group in grouped:
    #Winning margin calculation
    margin=(group.iloc[0]['Points']-group.iloc[1]['Points'])
    margins.append({'League': league, 'Winning Margin': margin})
    
    #Title contenders calculation
    group_sorted=group.sort_values('Position')
    top_points=group_sorted.iloc[0]['Points']
    contender_count=(group_sorted['Points']>=top_points-3).sum()
    contenders.append({'League': league, 'Season': season, 
                       'Contenders': contender_count})

    #Proportion of titles won by dominant team
    if group_sorted.iloc[0]['Position']==1:
        dominant_teams.append({'League': league,
                               'Team': group_sorted.iloc[0]['Team']})


#Create data frames
margins_df=pd.DataFrame(margins)
contenders_df=pd.DataFrame(contenders)
dominant_team_df=pd.DataFrame(dominant_teams)

#Calculate averages across seasons
avg_margin=margins_df.groupby('League', as_index=False)['Winning Margin'].mean()
avg_contenders=contenders_df.groupby('League')['Contenders'].mean().reset_index()

#Scale winning margin
avg_margin=pd.merge(avg_margin, games_ratio, on='League')
avg_margin['Weighted Margin']=avg_margin['Winning Margin']*avg_margin['Match Ratio']

#Title count of most dominant team in each league
title_counts=dominant_team_df.groupby(['League',
                                       'Team']).size().reset_index(name='Titles')

#Identify most dominant team
dominant_team=title_counts.sort_values(['League', 'Titles']
                                ,ascending=[True, False]).drop_duplicates('League')
dominant_team['Dominance Ratio']=1-(dominant_team['Titles']/15)
dominance_ratio=dominant_team[['League', 'Dominance Ratio']].copy()

#Create normalisation function to easily normalise metrics
def normalise(df,column):
    min_val=df[column].min()
    max_val=df[column].max()
    df[f'Normalised {column}']=(df[column]-min_val)/(max_val-min_val)
    return df

#Normalise all metrics
avg_margin=normalise(avg_margin, 'Winning Margin')
avg_margin['Normalised Winning Margin']=1-avg_margin['Normalised Winning Margin']
avg_contenders=normalise(avg_contenders, 'Contenders')
avg_contenders['Normalised Title Contenders']=avg_contenders['Normalised Contenders']
dominance_ratio=normalise(dominance_ratio, 'Dominance Ratio')

#Merge all normalised data
merged=avg_contenders[['League', 'Normalised Title Contenders']]\
    .merge(dominance_ratio[['League', 'Normalised Dominance Ratio']], on='League')\
    .merge(avg_margin[['League', 'Normalised Winning Margin']], on='League')\
    .merge(games_ratio[['League', 'Match Ratio']], on='League')

#Calculate Competitiveness Score
competitive_title=(0.25*merged['Normalised Title Contenders']) + \
                    (0.5*merged['Normalised Dominance Ratio']) + \
                    (0.25*merged['Normalised Winning Margin'])

#Rename columns
merged.rename(columns={
    'Normalised Title Contenders': 'Average amount of contenders for title (Normalised)',
    'Normalised Dominance Ratio': 'Titles won by dominant team (Normalised)',
    'Normalised Winning Margin' : 'Average winning margin (Normalised)',
    'Match Ratio': 'Relative matches per season'}, inplace=True)

#Add competitiveness score to the dataframe
merged['Title competitiveness index']=competitive_title
merged=merged.sort_values(by='Title competitiveness index', ascending=False)
merged

Unnamed: 0,League,Average amount of contenders for title (Normalised),Titles won by dominant team (Normalised),Average winning margin (Normalised),Relative matches per season,Title competitiveness index
1,La Liga,1.0,1.0,1.0,1.009028,1.0
3,Premier League,0.6,1.0,0.842857,1.007601,0.860714
4,Serie A,0.2,0.75,0.642857,1.00885,0.585714
2,Ligue 1,0.2,0.25,0.1,1.038021,0.2
0,Bundesliga,0.0,0.0,0.0,1.124753,0.0


### Results
These results seem to largely affirm popular perception with La Liga and the Premier league dominanting the index. The unparalleled competitiveness between Barcelona and Real Madrid has placed La Liga on top, this rivalry makes it very hard to predict which of the two will win a given season, hence we see narrow winning margins and low levels of one team dominance in the data. 

The premier league in second benefits from the fact unlike other leagues it tends to go through eras of different teams being dominant, giving it a good score in the "titles won by dominant teams" metric. 

The unwavering dominance of PSG and Bayern Munich respectively place Ligue 1 and the Bundesliga at the bottom of the table, as would be expected. These two leagues are characterised by huge win margins by their top clubs, little variation in title winning teams and few contenders for titles.

We can see how each team performed relative to the others more clearly in the plot below:

In [494]:
#Get data from merged
labels=['Average amount of contenders for title (Normalised)',
        'Titles won by dominant team (Normalised)'
        , 'Average winning margin (Normalised)']
fig=go.Figure()


#Plot each league
for i, row in merged.iterrows():
    fig.add_trace(go.Scatterpolar(
        r=row[labels].tolist()+[row[labels].iloc[0]],theta=labels+[labels[0]], 
        fill='toself', name=row['League']))

fig.update_layout(title="League's Performance Across Title Competitiveness Metrics",
    template='plotly_dark')


fig.show()


### Graph interpretation
The graph shows a lot of agreement between the metrics in our index. No team with a higher score in the index has performed worse in any metric than a team with a lower score. However we choose to adjust the weights for any metric our results will remain the same so there is no ambiguity about the rankings this index has given us.

## Part 1 conclusion:
The index has given us quite clear results about which league titles are more competitive than others that have confirmed popular sentiment

## Part 2: how has overall competitiveness in the 5 major leagues changed over time?

### Background
The financial fair play regulations implemented at the beginning of the selected 15 year data period reflect a perennial worry in football, that league competitiveness is diminishing over time. In spite of these rules, more successful teams continue to get richer through performance-based access to hugely profitable competitions such as the champions league and merchandise sales meanwhile smaller teams fall behind with no opportunity to spend in order to catch up. As financial fair play permits spending according to revenues, it is ineffective at solving this problem. Over time this results in a decrease in the general competitiveness of footballing leagues, making the game uninteresting for those supporting teams not at the top.

### Aim
The aim of this section is to determine whether the decreasing competitiveness of footballing leagues is a real phenomenon, and by extension whether the implementation of financial fair play rules has been enough to mitigate the decline in competition across Europes 5 major leagues. 

### Methodology
#### HHI index
Competitiveness will be quantified using the Herfindahl-Hirschman Index (HHI) which is traditionally used in economics for measuring market competition by analysing the concerntration of market share, in this case market share will be substituted for points share. 
#### Normalisation
HHI will have to be normalised, as different leagues have different numbers of teams. For example, the Bundesliga has 18 teams meanwhile the premier league has 20 so one teams point share, which HHI measures, will be more diluted in the premier league than in the Bundesliga without normalisation.

In [545]:
#Get total points per league and season
df['Total Points']=df.groupby(['League', 'Season'])['Points'].transform('sum')

#Calculate HHI
#Create new columns necessary for calculation
df['Points Share']=df['Points']/df['Total Points']
df['Points Share Squared']=df['Points Share']**2

#Sum points
hhi_df=df.groupby(['League', 'Season'])['Points Share Squared'].sum().reset_index()
hhi_df.rename(columns={'Points Share Squared': 'HHI'}, inplace=True)

#Normalise HHI for number of teams in a league
#Get Number of teams per league and season
team_count=df.groupby(['League', 'Season'])['Team'].nunique().reset_index()
team_count.rename(columns={'Team': 'Number of Teams'}, inplace=True)

#Merge HHI with team count
hhi_df=hhi_df.merge(team_count, on=['League', 'Season'])

#Get normalised HHI
hhi_df['HHI Normalised']=(hhi_df['HHI']-1/
                          hhi_df['Number of Teams'])/(1-1/hhi_df['Number of Teams'])
#Plot HHI
fig = go.Figure()
#Scatter plot for league HHI normalised by season
for league in hhi_df['League'].unique():
    league_data=hhi_df[hhi_df['League']==league]
    fig.add_trace(go.Scatter(x=league_data['Season'], y=league_data['HHI Normalised']
                             , name=league))

#Line of best fit across all leagues
slope, intercept = stats.linregress(hhi_df['Season'], hhi_df['HHI Normalised'])[:2]
y2=slope*hhi_df['Season']+intercept

fig.add_trace(go.Scatter(x=hhi_df['Season'], y=y2, mode='lines',
                         name='Line of Best Fit'))

#Label
fig.update_layout(title='Normalised League HHI by Season',
    xaxis_title='Season',
    yaxis_title='Normalised League HHI',
    xaxis=dict(tickangle=45),
    legend_title='League',
    template='plotly_dark')

fig.show()

#Correlation coefficient
corr=hhi_df['Season'].corr(hhi_df['HHI Normalised'])
print("Correlation coefficient:", round(corr,3))

Correlation coefficient: 0.401


### Graph interpretation
Two things are interesting about this graph. Firstly, there is a lot of noise in the HHI index for any given league across the seasons and no discernable divergence between leagues. This suggests there aren't league specific trends to changes in competitiveness, and rather any trends identified span all leagues. The only exception to this is La Liga which had a unique and somewhat sustained boost to competitiveness after 2015, potentially attributable to the financial issues faced by Barcelona. Secondly, we can see a increase in HHI across all leagues, this indicates a full in competitiveness and more specifically the increasing dominance of a select few teams. The trend does appear to be a decrease in competitivity. According to this metric declining competitiveness in football is a real phenomenon, and financial fairplay has been insufficient at completely stopping it.

The correlation coefficient indicates a moderate positive relationship between time, increased HHI and decreased competitivity.

The following graphs will serve to support this point:

In [546]:
#Get Mean and SD ready for calculating CV
cv_df=df.groupby(['League', 'Season']).agg(
    Mean_Points=('Points', 'mean'),
    Std_Points=('Points', 'std')).reset_index()

#Calculate CV
cv_df['CV']=cv_df['Std_Points']/cv_df['Mean_Points']

#Plot CV
fig=go.Figure()

#Scatter plot for CV by league and season
for league in cv_df['League'].unique():
    league_data=cv_df[cv_df['League']==league]
    fig.add_trace(go.Scatter(x=league_data['Season'], y=league_data['CV'],
        mode='markers',name=league))

#Add line of best fit
slope, intercept=stats.linregress(cv_df['Season'], cv_df['CV'])[:2]
y1=slope*cv_df['Season']+intercept
fig.add_trace(go.Scatter(x=cv_df['Season'], y=y1, mode='lines',
                         name='Line of best fit'))

#Label
fig.update_layout(title='Coefficient of Variation across all leagues by Season',
    xaxis_title='Season',
    yaxis_title='Coefficient of Variation',
    xaxis=dict(tickangle=45),
    legend_title='League',
    template='plotly_dark')

fig.show()

#Correlation coefficient
corr=cv_df['Season'].corr(cv_df['CV'])
print("Correlation coefficient:", round(corr,3))

Correlation coefficient: 0.394


### Graph interpretation
This graph demonstrates the trend is consistent across different metrics of competitiveness. The coefficient of variation measures disparity between top and bottom teams, a higher coefficient means a more disparity. So with these two measures in conjunction we can see both increasing dominance in points share and a indication of smaller teams "falling behind" due to the widening points gaps indicated by the increasing coefficient.

In [540]:
fig=go.Figure()

#Define colour map to be the same for scatter plot and line
colours=px.colors.qualitative.Plotly
colour_map={league: colours[i % len(colours)] for i,
             league in enumerate(hhi_df['League'].unique())}

#Loop through each league to plot multiple lines of best fit
for league in hhi_df['League'].unique():
    league_data = hhi_df[hhi_df['League']==league]
    
    #Scatter plots
    fig.add_trace(go.Scatter(x=league_data['Season'], y=league_data['HHI Normalised'],
        mode='markers', name=league, marker_color=colour_map[league], 
                             legendgroup=league))

    #Line of best fit for iterated league
    slope, intercept=stats.linregress(league_data['Season'],
                                      league_data['HHI Normalised'])[:2]
    y_fit=slope*league_data['Season']+intercept

    fig.add_trace(go.Scatter(x=league_data['Season'], y=y_fit, mode='lines',
        name=f'{league} Trend', showlegend=False, legendgroup=league,
        line_color=colour_map[league]))

#Label
fig.update_layout(
    title="Best Fit Lines for Each League's HHI by Season",
    xaxis_title='Season',
    yaxis_title='Normalised HHI',
    xaxis=dict(tickangle=45),
    template='plotly_dark')

fig.show()


### Graph interpretation
This graph shows clearly how uniform the decrease in competitivity across in all leagues barring La Liga is. This indicates issue spans leagues and does not occur within leagues. The decline in competitiveness is indicative of continent wide issues in football.

## Part 2 conclusion:
The decline in competitiveness across European football appears to be real and a continent wide issue, not confined to any single league. Addressing this issue requires a unilateral approach. Financial fair play regulations have proven insufficient in entirely mitigating increasing uncompetitiveness, suggesting that either stricter enforcement or greater support for smaller clubs is necessary. The recent introduction of the new Club World Cup, restricted to historically successful teams and offering them yet another exclusive revenue stream, will only increase this imbalance. The actions of European footballing authorities, such as UEFA, therefore seem to be in contradiction with the trends identified in this blog, and change is needed.