This repository uses data from **Liga F** (the Spanish women's first division). 

This notebook shows how the data is collected using *Python* and *BeautifulSoup*, and how it will be used in a *Power BI* dashboard.

In [1]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Liga F season statistics page (passed to get_league_stats below)
url_league = 'https://fbref.com/en/comps/230/stats/Liga-F-Stats'

# Liga F scores and fixtures page (passed to get_matches_url below)
url_matches = 'https://fbref.com/en/comps/230/schedule/Liga-F-Scores-and-Fixtures'

# Single match report used as the example input for the match-level functions
url_example_match = 'https://fbref.com/en/matches/87c755cd/Alaves-Madrid-CFF-September-17-2022-Liga-F'

## League Standings
The first type of data consists of global statistics covering the whole season.

In [2]:
def get_tables(url, element='table'):
    """Download a page and parse every ``element`` tag into DataFrames.

    Parameters
    ----------
    url : str
        Address of the page to download.
    element : str, default 'table'
        Tag name handed to ``soup.find_all``.

    Returns
    -------
    list of list of pandas.DataFrame
        One entry per matching tag; each entry is the list that
        ``pd.read_html`` returns for that tag's markup.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    res = requests.get(url)
    # Stop on an error response instead of trying to parse an error page.
    res.raise_for_status()

    # Remove the HTML comment delimiters, which otherwise break the parsing.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    # read_html wants a file-like object; passing literal HTML is deprecated.
    return [pd.read_html(StringIO(str(t))) for t in soup.find_all(element)]

In [3]:
def get_league_stats(url_league_stats):
    """Collect the 'Standard Stats' tables from the league stats page.

    Returns a list ``[squad_stats, squad_stats_against, players]``: the first
    DataFrame parsed from the 1st, 2nd and 3rd table that ``get_tables`` finds.
    """
    parsed = get_tables(url_league_stats)
    # Every entry of `parsed` is itself a list of DataFrames; keep its first one.
    return [parsed[position][0] for position in range(3)]

## Matches Results
The other type of data collected consists of the results of finished matches. Future fixtures are retrieved as well.


In [4]:
def get_matches_url(url, element='table'):
    """Download the fixtures page and return its first table with report links.

    Parameters
    ----------
    url : str
        Address of the scores-and-fixtures page.
    element : str, default 'table'
        Tag name to look for; only the first match is used.

    Returns
    -------
    pandas.DataFrame
        The parsed table, where the 'Match Report' column holds the href of
        each row's "Match Report" link ('' when the row has none), rows
        without a 'Wk' value are removed and 'Wk' is cast to int.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page contains no ``element`` tag.
    """
    res = requests.get(url)
    # Stop on an error response instead of trying to parse an error page.
    res.raise_for_status()

    # Remove the HTML comment delimiters, which otherwise break the parsing.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    tables = soup.find_all(element)
    if not tables:
        raise ValueError(f"No <{element}> element found at {url}")
    table = tables[0]

    # read_html wants a file-like object; passing literal HTML is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]

    # Pair each link with the row it sits in. A flat list of links padded at
    # the end would drift as soon as a row in the middle of the table has no
    # report (e.g. the rows dropped below for lacking a 'Wk' value).
    body_rows = [tr for body in table.find_all('tbody') for tr in body.find_all('tr')]
    links = [(_match_report_links(tr) or [''])[0] for tr in body_rows]

    if len(links) != df.shape[0]:
        # NOTE(review): the body rows did not map one-to-one onto the parsed
        # frame, so fall back to positional filling padded with '' at the end.
        links = _match_report_links(table)
        links.extend([''] * (df.shape[0] - len(links)))

    df['Match Report'] = links

    # Drop invalid rows and convert the week number to integer. The copy keeps
    # the conversion from writing into a slice of the unfiltered frame.
    df = df[df['Wk'].notna()].copy()
    df['Wk'] = df['Wk'].astype(int)

    return df


def _match_report_links(node):
    """Return the hrefs of the <a> tags under ``node`` whose text starts with 'Match Report'."""
    match_link = re.compile("Match Report")
    # get_text() also copes with anchors that are empty or start with a nested tag.
    return [a['href'] for a in node.find_all('a', href=True) if match_link.match(a.get_text())]

## Match Report
Each match report url has a defined structure of tables:
1. Local Squad + Bench
2. Away Squad + Bench
3. Match Stats
4. For the Local and Away Team, special stats
    - Summary
    - Passing
    - Pass Types
    - Defensive Actions
    - Possession
    - Miscellaneous
    - Goalkeeper Stats
5. Shots info
    - Both teams
    - Local team
    - Away Team

In [5]:
def get_match_data(url, element='table'):
    """Return every ``element`` of a match report page, parsed by ``get_tables``.

    Parameters
    ----------
    url : str
        Address of the match report.
    element : str, default 'table'
        Tag name forwarded to ``get_tables``.

    Returns
    -------
    list of list of pandas.DataFrame
        Exactly what ``get_tables`` returns for the page.
    """
    # NOTE(review): an earlier draft meant to group the tables by position as
    #   local team: 0, 3-9, 18 / away team: 1, 10-16, 19 / general: 2, 17
    # These indices are unverified; confirm against the page before relying on them.
    return get_tables(url, element)


# Run once against the example match; the trailing ';' keeps the cell from echoing the result.
get_match_data(url_example_match);

In [6]:
def parse_match_report(url, filename):
    """Write the events of a match report to a comma-separated text file.

    The file starts with the header line
    'Minute, Event, Score, Player, 2nd Player' followed by one line per event
    found inside the page's 'events_wrap' section. Entries from which no
    minute, score or ' — ' separator can be extracted (such as section
    headers) are skipped.

    Parameters
    ----------
    url : str
        Address of the match report.
    filename : str
        Path of the file to write; it is overwritten if it already exists.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page has no 'events_wrap' section.
    """
    res = requests.get(url)
    # Stop on an error response instead of trying to parse an error page.
    res.raise_for_status()

    # Remove the HTML comment delimiters, which otherwise break the parsing.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    events_div = soup.find('div', {'id' : 'events_wrap'})
    if events_div is None:
        raise ValueError(f"No 'events_wrap' section found at {url}")
    events = events_div.find_all('div', {'class' : ['event a', 'event b', 'event_header']})
    headers = ['Kick Off', 'Half Time']

    # The context manager closes the file even if an event fails to parse.
    with open(filename, 'w', encoding='UTF-8') as f:
        f.write('Minute, Event, Score, Player, 2nd Player\n')
        for e in events:
            # Strip so surrounding whitespace does not hide a header.
            if e.text.strip() in headers:
                continue
            row = _event_to_row(e.text)
            if row is not None:
                f.write(row + '\n')


def _event_to_row(raw_text):
    """Build the 'minute, event, score, player, 2nd player' line for one event, or None if it cannot be parsed."""
    text = re.sub(r'\s{2,}', ' ', raw_text)

    minute = re.search(r'(\d|\+){1,5}', text)
    ev = re.search(r'\s—\s', text)
    score = re.search(r'\d+:\d+', text)
    if minute is None or ev is None or score is None:
        return None

    row = minute.group(0) + ', '
    # The event name follows the dash. Strip the score with the same pattern
    # used to locate it, so double-digit scores leave no stray digit behind.
    row += re.sub(r'\d+:\d+', '', text[ev.end():] + ', ')
    row += score.group(0) + ', '

    # The player sits between the score and the dash; an 'Assist:' or 'for'
    # marker splits that span into player and second player.
    assist = re.search(r'(Assist:) | (for) ', text)
    if assist:
        row += text[score.end():assist.start()] + ', '
        row += text[assist.end():ev.start()]
    else:
        row += text[score.end():ev.start()] + ', '
    return row

# Write the example match's events to 'prueba.csv'.
parse_match_report(url_example_match, 'prueba.csv')

# Usage Examples

In [7]:
from IPython.display import display, Markdown

# [squad_stats, squad_stats_against, players], as returned by get_league_stats
league_stats = get_league_stats(url_league)
# Fixtures table with one 'Match Report' link column, as returned by get_matches_url
league_matches = get_matches_url(url_matches)

In [8]:
# Preview each league table under its own heading, in the order get_league_stats returns them.
section_titles = ('## General Stats', '## Opponent Stats', '## Players Stats')
for title, table in zip(section_titles, league_stats):
    display(Markdown(title))
    display(table.head())

## General Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,G+A-PK,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG
0,Alavés,20,27.1,41.7,6,66,540,6.0,4,3,...,1.17,4.2,4.2,2.5,6.6,0.69,0.41,1.11,0.69,1.11
1,Alhama,20,26.6,43.6,7,77,630,7.0,3,1,...,0.57,5.6,5.6,3.6,9.2,0.81,0.51,1.32,0.81,1.32
2,Athletic Club,20,24.6,47.9,7,77,630,7.0,6,4,...,1.43,6.6,6.6,4.7,11.2,0.94,0.66,1.6,0.94,1.6
3,Atlético Madrid,20,26.3,56.3,7,77,630,7.0,15,9,...,3.29,12.5,10.9,8.0,18.9,1.78,1.15,2.93,1.55,2.7
4,Barcelona,21,27.0,67.3,6,66,540,6.0,19,14,...,5.5,15.3,15.3,11.8,27.1,2.55,1.97,4.52,2.55,4.52


## Opponent Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,G+A-PK,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG
0,vs Alavés,20,25.1,58.3,6,66,540,6.0,19,14,...,5.5,11.6,11.6,9.7,21.3,1.94,1.62,3.56,1.94,3.56
1,vs Alhama,20,26.1,56.4,7,77,630,7.0,15,8,...,3.14,10.1,9.3,5.3,14.6,1.44,0.76,2.2,1.33,2.09
2,vs Athletic Club,20,26.8,52.1,7,77,630,7.0,13,10,...,3.29,8.2,8.2,5.7,13.9,1.17,0.82,1.99,1.17,1.99
3,vs Atlético Madrid,20,25.5,43.7,7,77,630,7.0,5,2,...,0.86,4.1,3.3,2.3,5.6,0.59,0.33,0.91,0.48,0.8
4,vs Barcelona,21,25.5,32.7,6,66,540,6.0,2,2,...,0.67,3.0,3.0,2.3,5.3,0.5,0.39,0.88,0.5,0.88


## Players Stats

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Playing Time,Playing Time,Playing Time,...,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 32_level_0
Unnamed: 0_level_1,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xG,npxG,xAG,npxG+xAG,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches
0,1,Teresa Abilleira,es ESP,MF,Real Madrid,22-297,2000,5,4,352,...,0.4,0.4,0.5,0.8,0.09,0.12,0.21,0.09,0.21,Matches
1,2,Jessica Aby,ci CIV,"FW,MF",Alavés,24-139,1998,6,4,315,...,0.2,0.2,0.2,0.4,0.07,0.05,0.12,0.07,0.12,Matches
2,3,Júlia Aguado,,DF,Levante,22-184,2000,2,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
3,4,Yolanda Aguirre,es ESP,GK,Sevilla,24-010,1998,2,2,180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
4,5,Rasheedat Ajibade,ng NGA,FW,Atlético Madrid,22-329,1999,7,4,400,...,3.8,3.8,0.2,4.1,0.86,0.06,0.92,0.86,0.92,Matches


In [9]:
# Heading followed by the first rows of the fixtures table.
display(Markdown('## League Matches'), league_matches.head())

## League Matches

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,2,Sat,2022-09-17,12:00,Alavés,1.1,1–2,1.2,Madrid CFF,,Ciudad Deportiva José Luis Compañón,Manuel Pascali,/en/matches/87c755cd/Alaves-Madrid-CFF-Septemb...,
1,2,Sat,2022-09-17,12:00,Barcelona,1.9,2–0,0.4,UDG Tenerife,,Estadi Johan Cruyff,María Dolores Martínez Madrona,/en/matches/4df3a732/Barcelona-UDG-Tenerife-Se...,
2,2,Sat,2022-09-17,16:00,Real Madrid,1.6,2–0,0.8,Valencia,,Estadio Alfredo Di Stéfano,Marta Huerta de Aza,/en/matches/d0329f46/Real-Madrid-Valencia-Sept...,
3,2,Sat,2022-09-17,16:00,Real Sociedad,0.7,2–0,0.3,Villarreal,,Estadio Zubieta XXI,Alicia Espinosa Ríos,/en/matches/abfde9d9/Real-Sociedad-Villarreal-...,
4,2,Sat,2022-09-17,18:00,Sevilla,1.1,1–3,1.4,Atlético Madrid,,Estadio Viejo Nervión,Bruno Gallo,/en/matches/f4452586/Sevilla-Atletico-Madrid-S...,
