This repository uses data from **Liga F** (Spanish First Division).

This notebook shows how the data is collected, using *Python* and *BeautifulSoup*, and how it will be loaded into a *Power BI* dashboard.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import io

# Liga F standard stats page (squad, opponent and player tables); input to get_league_stats
url_league = 'https://fbref.com/en/comps/230/stats/Liga-F-Stats'

# Liga F overview page with the full set of stat tables; input to get_full_stats
url_full_stats = 'https://fbref.com/en/comps/230/Liga-F-Stats'

# Liga F fixtures and results page; input to get_matches_url
url_matches = 'https://fbref.com/en/comps/230/schedule/Liga-F-Scores-and-Fixtures'

# Single match report used by the examples for get_match_data and parse_match_report
url_example_match = 'https://fbref.com/en/matches/87c755cd/Alaves-Madrid-CFF-September-17-2022-Liga-F'

## League Standings
The first type of data consists of global data about the whole season.

In [2]:
def get_tables(url, element='table'):
    """Download a page and parse every matching element into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.
    element : str, default 'table'
        Tag name passed to ``soup.find_all``; each match is handed to
        ``pd.read_html`` and the first table found inside it is kept.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per matching element, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        Propagated from ``pd.read_html`` when a matched element contains
        no table.
    """
    # Fail fast on slow or failing responses instead of parsing an error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Drop the HTML comment delimiters so markup wrapped in comments is
    # parsed as regular content instead of being hidden from the parser.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    # read_html expects a path or file-like object; passing literal HTML
    # strings is deprecated, so each element is wrapped in StringIO.
    return [pd.read_html(io.StringIO(str(t)))[0] for t in soup.find_all(element)]

In [3]:
# Preview the first rows of the third table on the league stats page
# (index 2 is the table that get_league_stats returns as the players table).
get_tables(url_league)[2].head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Playing Time,Playing Time,Playing Time,...,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 32_level_0
Unnamed: 0_level_1,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches
0,1,Teresa Abilleira,es ESP,MF,Real Madrid,22-299,2000,6,4,397,...,0.4,0.4,0.8,1.2,0.08,0.18,0.26,0.08,0.26,Matches
1,2,Jessica Aby,ci CIV,"FW,MF",Alavés,24-141,1998,6,4,315,...,0.2,0.2,0.2,0.4,0.07,0.05,0.12,0.07,0.12,Matches
2,3,Júlia Aguado,,DF,Levante,22-186,2000,2,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
3,4,Yolanda Aguirre,es ESP,GK,Sevilla,24-012,1998,2,2,180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
4,5,Rasheedat Ajibade,ng NGA,FW,Atlético Madrid,22-331,1999,7,4,400,...,3.8,3.8,0.2,4.1,0.86,0.06,0.92,0.86,0.92,Matches


In [4]:
def get_league_stats(url):
    """Return the first three tables of the 'Standard Stats' page.

    The page is fetched with ``get_tables``; the result is a list holding,
    in order, the squad stats, the opponent ("against") stats and the
    players table.
    """
    tables = get_tables(url)
    squad, squad_against, player_table = tables[0], tables[1], tables[2]
    return [squad, squad_against, player_table]

## Full Stats by Squad
Stats for specific areas such as *'passing'*, *'shooting'*, *'defending'*... are also available.

In [5]:
def get_full_stats(url):
    """Return the first 24 tables found at ``url``.

    Parameters
    ----------
    url : str
        Address of the league overview page to download.

    Returns
    -------
    list of pandas.DataFrame
        At most 24 tables, in document order. Per the original comment,
        the remaining tables on the page are player leaderboards and are
        discarded.
    """
    # Use the caller's url; the previous version ignored the argument and
    # always read the module-level url_full_stats.
    full_stats = get_tables(url)
    return full_stats[0:24]

# Lookup from section name to the slice selecting its pair of tables in the
# list returned by get_full_stats: the i-th name maps to slice(2*i, 2*i + 2).
indexes_fs = {
    section: slice(2 * position, 2 * position + 2)
    for position, section in enumerate((
        'ranks',
        'stats',
        'gk',
        'adv_gk',
        'shooting',
        'passing',
        'pass_type',
        'gs_creation',
        'defensive',
        'possession',
        'play_time',
        'misc',
    ))
}
# Usage:
# rank, rank_op = full_stats[indexes_fs['ranks']]
# passing, passing_op = full_stats[indexes_fs['passing']]

## Matches Results
Another type of data collected consists of the results of finished matches. This also retrieves future fixtures.


In [6]:
def get_matches_url(url, element='table'):
    """Load the fixtures table and attach each match's report link.

    Parameters
    ----------
    url : str
        Address of the scores-and-fixtures page.
    element : str, default 'table'
        Tag name used to locate the schedule; the first match is used.

    Returns
    -------
    pandas.DataFrame
        The schedule with rows lacking a week number removed, ``Wk`` cast
        to int, and the ``Match Report`` column holding the report ``href``
        exactly as found in the page (empty string when a row has none).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no ``element`` at all.
    """
    # Fail fast on slow or failing responses instead of parsing an error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Drop the HTML comment delimiters so commented-out markup is parsed too.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    candidates = soup.find_all(element)
    if not candidates:
        raise IndexError(f"no <{element}> element found at {url}")
    schedule = candidates[0]

    # Wrap in StringIO: passing literal HTML strings to read_html is deprecated.
    df = pd.read_html(io.StringIO(str(schedule)))[0]

    def _is_report(link):
        # get_text also copes with anchors whose first child is another tag.
        return link.get_text(strip=True).startswith('Match Report')

    def _report_href(node):
        # First 'Match Report' link inside node, or '' when there is none.
        for link in node.find_all('a', href=True):
            if _is_report(link):
                return link['href']
        return ''

    body = schedule.find('tbody')
    body_rows = body.find_all('tr') if body is not None else []
    if len(body_rows) == df.shape[0]:
        # Pair links with rows one-to-one so spacer rows and matches without
        # a report (anywhere in the table) cannot shift later links.
        links = [_report_href(row) for row in body_rows]
    else:
        # Row counts disagree: fall back to document-order pairing, padded
        # or truncated so the column assignment cannot fail on length.
        links = [a['href'] for a in schedule.find_all('a', href=True) if _is_report(a)]
        links = (links + [''] * df.shape[0])[:df.shape[0]]

    df['Match Report'] = links

    # Drop rows without a week number; copy so the cast below writes to an
    # independent frame rather than a view of the filtered one.
    df = df[df['Wk'].notna()].copy()
    df['Wk'] = df['Wk'].astype(int)

    return df

## Match Report
Each match report url has a defined structure of tables:
1. Home Squad + Bench
2. Away Squad + Bench
3. Match Stats
4. For the Home and Away Team, special stats
    - Summary
    - Passing
    - Pass Types
    - Defensive Actions
    - Possession
    - Miscellaneous
    - Goalkeeper Stats
5. Shots info
    - Both teams
    - Home team
    - Away Team

In [7]:
def get_match_data(url, element='table'):
    """Split the tables of a match report into home, away and general groups.

    Parameters
    ----------
    url : str
        Address of the match report page.
    element : str, default 'table'
        Tag name forwarded to ``get_tables``.

    Returns
    -------
    tuple of (list, list, list)
        ``home_team_stats`` (tables 0, 3-9, 18), ``away_team_stats``
        (tables 1, 10-16, 19) and ``general_stats`` (tables 2 and 17).
        Per the notebook's description of the page these are the squad
        sheet, the per-team stat tables and the team shots, plus the match
        stats and the combined shots table. Confirm against the live page
        layout.

    Raises
    ------
    IndexError
        If the page yields fewer than the 20 tables this layout expects.
    """
    # Forward element; previously the parameter was accepted but ignored.
    dfs = get_tables(url, element)
    if len(dfs) < 20:
        raise IndexError(
            f"expected at least 20 <{element}> elements at {url}, found {len(dfs)}"
        )

    idx_home = [0, 3, 4, 5, 6, 7, 8, 9, 18]
    home_team_stats = [dfs[i] for i in idx_home]

    idx_away = [1, 10, 11, 12, 13, 14, 15, 16, 19]
    away_team_stats = [dfs[i] for i in idx_away]

    idx_general = [2, 17]
    general_stats = [dfs[i] for i in idx_general]
    return home_team_stats, away_team_stats, general_stats

In [8]:
def parse_match_report(url):
    """Extract the event timeline of a match report as CSV text.

    Parameters
    ----------
    url : str
        Address of the match report page.

    Returns
    -------
    str
        CSV text whose first line is
        ``Minute, Event, ScoreH, ScoreA, Player, 2nd Player`` followed by
        one line per event. Fields are separated by ``', '`` and are not
        quoted. The second player is whatever follows ``Assist:`` or
        ``for`` in the event text, and is empty otherwise.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no ``events_wrap`` section, or an event lacks a
        minute, a score or the `` — `` separator.
    """
    # Fail fast on slow or failing responses instead of parsing an error page.
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Drop the HTML comment delimiters so commented-out markup is parsed too.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    events_div = soup.find('div', {'id': 'events_wrap'})
    if events_div is None:
        raise ValueError(f"no 'events_wrap' section found at {url}")
    events = events_div.find_all('div', {'class': ['event a', 'event b', 'event_header']})
    headers = ['Kick Off', 'Half Time']

    # Collect lines and join once instead of growing a string in the loop.
    rows = ['Minute, Event, ScoreH, ScoreA, Player, 2nd Player\n']

    for e in events:
        # Period headers carry no event data.
        if e.text.strip() in headers:
            continue

        # Collapse runs of whitespace so the slices below see compact text.
        text = re.sub(r'\s{2,}', ' ', e.text)

        minute = re.search(r'(\d|\+){1,5}', text)   # e.g. "33" or "45+2"
        ev = re.search(r'\s—\s', text)               # separator before the event label
        score = re.search(r'(\d+):(\d+)', text)      # running score, any number of digits
        if minute is None or ev is None or score is None:
            raise ValueError(f"unrecognised match event: {text!r}")

        # Event label follows the separator; strip any score repeated in it.
        event_name = re.sub(r'\d+:\d+', '', text[ev.end():])

        # Take both sides from the same match; a greedy '.*(\d+):' prefix
        # would keep only the last digit of a multi-digit home score.
        home, away = score.groups()

        assist = re.search(r'(Assist:) | (for) ', text)
        if assist:
            player = text[score.end():assist.start()]
            second_player = text[assist.end():ev.start()]
        else:
            player = text[score.end():ev.start()]
            second_player = ''

        rows.append(
            f"{minute.group()}, {event_name}, {home}, {away}, {player}, {second_player}\n"
        )

    return ''.join(rows)

# Use Examples

In [9]:
from IPython.display import display, Markdown

# Each call below downloads its page through requests.get, so this cell
# needs network access to fbref.com.
league_stats = get_league_stats(url_league)
league_full_stats = get_full_stats(url_full_stats)
league_matches = get_matches_url(url_matches)

In [10]:
# Show a five-row preview of each league table under its own heading.
for position, heading in enumerate(('## General Stats', '## Opponent Stats', '## Players Stats')):
    display(Markdown(heading))
    display(league_stats[position].head())

## General Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,G+A-PK,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG
0,Alavés,20,27.0,40.3,7,77,630,7.0,5,3,...,1.14,4.5,4.5,2.6,7.2,0.65,0.38,1.02,0.65,1.02
1,Alhama,20,26.6,43.6,7,77,630,7.0,3,1,...,0.57,5.6,5.6,3.6,9.2,0.81,0.51,1.32,0.81,1.32
2,Athletic Club,20,24.6,47.9,7,77,630,7.0,6,4,...,1.43,6.6,6.6,4.7,11.2,0.94,0.66,1.6,0.94,1.6
3,Atlético Madrid,20,26.3,56.3,7,77,630,7.0,15,9,...,3.29,12.5,10.9,8.0,18.9,1.78,1.15,2.93,1.55,2.7
4,Barcelona,24,26.8,69.4,7,77,630,7.0,23,17,...,5.71,17.2,17.2,13.1,30.4,2.46,1.88,4.34,2.46,4.34


## Opponent Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,G+A-PK,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG
0,vs Alavés,20,25.0,59.7,7,77,630,7.0,21,16,...,5.29,13.4,13.4,10.8,24.2,1.91,1.54,3.45,1.91,3.45
1,vs Alhama,20,26.1,56.4,7,77,630,7.0,15,8,...,3.14,10.1,9.3,5.3,14.6,1.44,0.76,2.2,1.33,2.09
2,vs Athletic Club,20,26.8,52.1,7,77,630,7.0,13,10,...,3.29,8.2,8.2,5.7,13.9,1.17,0.82,1.99,1.17,1.99
3,vs Atlético Madrid,20,25.5,43.7,7,77,630,7.0,5,2,...,0.86,4.1,3.3,2.3,5.6,0.59,0.33,0.91,0.48,0.8
4,vs Barcelona,24,25.6,30.6,7,77,630,7.0,2,2,...,0.57,3.0,3.0,2.3,5.4,0.44,0.33,0.77,0.44,0.77


## Players Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Playing Time,Playing Time,Playing Time,...,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 32_level_0
Unnamed: 0_level_1,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches
0,1,Teresa Abilleira,es ESP,MF,Real Madrid,22-299,2000,6,4,397,...,0.4,0.4,0.8,1.2,0.08,0.18,0.26,0.08,0.26,Matches
1,2,Jessica Aby,ci CIV,"FW,MF",Alavés,24-141,1998,6,4,315,...,0.2,0.2,0.2,0.4,0.07,0.05,0.12,0.07,0.12,Matches
2,3,Júlia Aguado,,DF,Levante,22-186,2000,2,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
3,4,Yolanda Aguirre,es ESP,GK,Sevilla,24-012,1998,2,2,180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
4,5,Rasheedat Ajibade,ng NGA,FW,Atlético Madrid,22-331,1999,7,4,400,...,3.8,3.8,0.2,4.1,0.86,0.06,0.92,0.86,0.92,Matches


In [12]:
display(Markdown('## Full Stats'))
# The 'defensive' slice selects two consecutive tables, which the outputs
# below show as the squad table and its "vs" (opponent) counterpart.
defensive, defensive_op = league_full_stats[indexes_fs['defensive']]
# Preview the first rows of both tables.
display(defensive.head(), defensive_op.head())

## Full Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tackles,Tackles,Tackles,Tackles,Tackles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Blocks,Blocks,Blocks,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0
Unnamed: 0_level_1,Squad,# Pl,90s,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Tkl,Att,Tkl%,Past,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err
0,Alavés,20,7.0,112,71,59,42,11,46,92,50.0,46,69,20,49,68,180,159,5
1,Alhama,20,7.0,170,101,74,80,16,65,120,54.2,55,93,26,67,86,256,130,1
2,Athletic Club,20,7.0,124,75,69,39,16,53,104,51.0,51,67,17,50,92,216,124,2
3,Atlético Madrid,20,7.0,109,64,42,56,11,22,47,46.8,25,62,10,52,104,213,98,3
4,Barcelona,24,7.0,105,72,34,48,23,27,50,54.0,23,47,6,41,52,157,56,1


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tackles,Tackles,Tackles,Tackles,Tackles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Blocks,Blocks,Blocks,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0
Unnamed: 0_level_1,Squad,# Pl,90s,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Tkl,Att,Tkl%,Past,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err
0,vs Alavés,20,7.0,119,74,40,55,24,39,65,60.0,26,66,10,56,65,184,104,1
1,vs Alhama,20,7.0,97,72,39,43,15,33,54,61.1,21,64,17,47,85,182,140,3
2,vs Athletic Club,20,7.0,93,63,38,36,19,43,81,53.1,38,61,11,50,80,173,138,1
3,vs Atlético Madrid,20,7.0,147,86,81,53,13,46,99,46.5,53,60,16,44,107,254,132,1
4,vs Barcelona,24,7.0,150,91,92,40,18,59,99,59.6,40,104,41,63,96,246,149,3


In [12]:
# NOTE(review): this cell repeats the previous cell verbatim (same In [12]
# label) — likely a leftover duplicate; confirm before removing.
display(Markdown('## Full Stats'))
# The 'defensive' slice selects two consecutive tables (squad and opponent).
defensive, defensive_op = league_full_stats[indexes_fs['defensive']]
display(defensive.head(), defensive_op.head())

## Full Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tackles,Tackles,Tackles,Tackles,Tackles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Blocks,Blocks,Blocks,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0
Unnamed: 0_level_1,Squad,# Pl,90s,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Tkl,Att,Tkl%,Past,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err
0,Alavés,20,7.0,112,71,59,42,11,46,92,50.0,46,69,20,49,68,180,159,5
1,Alhama,20,7.0,170,101,74,80,16,65,120,54.2,55,93,26,67,86,256,130,1
2,Athletic Club,20,7.0,124,75,69,39,16,53,104,51.0,51,67,17,50,92,216,124,2
3,Atlético Madrid,20,7.0,109,64,42,56,11,22,47,46.8,25,62,10,52,104,213,98,3
4,Barcelona,24,7.0,105,72,34,48,23,27,50,54.0,23,47,6,41,52,157,56,1


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tackles,Tackles,Tackles,Tackles,Tackles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Blocks,Blocks,Blocks,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0
Unnamed: 0_level_1,Squad,# Pl,90s,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Tkl,Att,Tkl%,Past,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err
0,vs Alavés,20,7.0,119,74,40,55,24,39,65,60.0,26,66,10,56,65,184,104,1
1,vs Alhama,20,7.0,97,72,39,43,15,33,54,61.1,21,64,17,47,85,182,140,3
2,vs Athletic Club,20,7.0,93,63,38,36,19,43,81,53.1,38,61,11,50,80,173,138,1
3,vs Atlético Madrid,20,7.0,147,86,81,53,13,46,99,46.5,53,60,16,44,107,254,132,1
4,vs Barcelona,24,7.0,150,91,92,40,18,59,99,59.6,40,104,41,63,96,246,149,3


In [13]:
display(Markdown('## Match Data'))

# Split the example match report into home, away and general table lists.
home_team_stats, away_team_stats, general_stats = get_match_data(url_example_match)
# Index 1 of the home list is table 3 of the page, i.e. the first of the
# home team's stat tables after the squad sheet.
pd.DataFrame(home_team_stats[1])

## Match Data

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Performance,Performance,Performance,Performance,...,Expected,Expected,SCA,SCA,Passes,Passes,Passes,Passes,Dribbles,Dribbles
Unnamed: 0_level_1,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,PKatt,...,npxG,xAG,SCA,GCA,Cmp,Att,Cmp%,Prog,Succ,Att
0,Sara Carrillo,19.0,,FW,20-028,67,0,0,0,0,...,0.0,0.0,1,0,12,24,50.0,1,1,3
1,Lice Chamorro,18.0,py PAR,FW,23-269,23,0,0,0,0,...,0.0,0.0,0,0,4,8,50.0,0,0,0
2,Ane Miren Martínez,7.0,,LW,30-012,67,0,0,0,0,...,0.0,0.0,0,0,9,16,56.3,0,0,3
3,Carla Morera,21.0,,RW,27-184,23,1,0,0,0,...,0.5,0.0,0,0,2,4,50.0,0,0,0
4,Carla Armengol,11.0,es ESP,"RW,LW",24-168,77,0,0,0,0,...,0.0,0.0,1,0,11,16,68.8,0,1,2
5,Alba Aznar,10.0,,LW,29-015,13,0,0,0,0,...,0.0,0.0,0,0,3,4,75.0,1,0,0
6,Gema Soliveres,6.0,,LM,21-318,57,0,0,0,0,...,0.0,0.0,0,0,11,20,55.0,0,0,0
7,Jessica Aby,20.0,ci CIV,LM,24-093,33,0,0,0,0,...,0.0,0.0,0,0,8,11,72.7,0,2,3
8,Miriam Diéguez,14.0,es ESP,CM,36-136,90,0,0,0,0,...,0.0,0.1,2,0,15,25,60.0,3,0,0
9,Fátima Pinto,17.0,pt POR,RM,26-244,90,0,0,0,0,...,0.1,0.0,2,0,19,26,73.1,0,0,2


In [14]:
display(Markdown('## Match Events'))

# Optional: persist the CSV text to disk (left disabled here).
# f = open('report.csv', 'w', encoding='UTF-8')
info = parse_match_report(url_example_match)
# f.write(info)
# f.close()

# Parse the CSV text in memory to preview the events as a DataFrame.
pd.read_csv(io.StringIO(info))

## Match Events

Unnamed: 0,Minute,Event,ScoreH,ScoreA,Player,2nd Player
0,33,Goal,0,1,Lucia Pardo,Gabi Nunes
1,58,Substitute,0,1,Jessica Aby,Gema Soliveres
2,68,Substitute,0,1,Lice Chamorro,Sara Carrillo
3,68,Substitute,0,1,Carla Morera,Ane Miren Martínez
4,69,Substitute,0,1,Karen Araya,Estela Fernández
5,77,Yellow Card,0,1,Carla Morera,
6,78,Substitute,0,1,Alba Aznar,Carla Armengol
7,78,Substitute,0,1,Osinachi Ohale,Cristina Auñón
8,79,Substitute,0,1,Racheal Kundananji,Lucia Pardo
9,82,Goal,0,2,Florencia Bonsegundo,
