# Parsing html using Beautiful Soup

We will download the HTML of our page using the `requests` library.

In [None]:
import requests

In [None]:
# La Liga standings page on fbref.com; the squad links are read from this page.
standings_url = "https://fbref.com/en/comps/12/La-Liga-Stats"

The downloaded page is now stored in `data`; its HTML is available as `data.text`.

In [None]:
# Download the standings page. The timeout stops the cell from hanging on a
# stalled connection, and raise_for_status() reports an HTTP error status
# here instead of letting it show up as a parsing failure in a later cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

Looking at the page with the browser's Inspect tool, we see that we need the `href` link from each team's `<a>` tag.

we'll use beautiful soup to parse the html

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parse can differ
# between machines.
soup = BeautifulSoup(data.text, "html.parser")

We notice that the data we want is inside a `table` tag with the class name "stats_table".
We use BeautifulSoup's `select()` method, which accepts CSS selectors (tags, ids, classes, ...) and returns every matching element; we take only the first one.

`find_all()`, by contrast, looks elements up by tag name (optionally filtered by attributes) rather than by CSS selector.

In [None]:
# First element matching the CSS selector "table.stats_table"; the [0] raises
# IndexError if the page contains no such table.
standings_table = soup.select('table.stats_table')[0]

In [None]:
# Collect every <a> tag found inside the standings table.
links = standings_table.find_all('a')

Now `links` holds every `<a>` tag from the table. Next we take the `href` of each tag, and then keep only the squad links.

In [None]:
# Rebind `links` from the tag objects to their href values; .get("href")
# yields None for an <a> tag that has no href attribute.
links = [l.get("href") for l in links]

We keep only the links that contain '/squads/' and drop the rest.

In [None]:
# Keep only the squad pages. The `l and` guard skips anchors without an href
# (for which .get("href") returned None); testing `'/squads/' in None` would
# raise TypeError.
links = [l for l in links if l and '/squads/' in l]

In [None]:
links

['/en/squads/53a2f082/Real-Madrid-Stats',
 '/en/squads/9024a00a/Girona-Stats',
 '/en/squads/206d90db/Barcelona-Stats',
 '/en/squads/db3b9613/Atletico-Madrid-Stats',
 '/en/squads/2b390eca/Athletic-Club-Stats',
 '/en/squads/fc536746/Real-Betis-Stats',
 '/en/squads/e31d1cd9/Real-Sociedad-Stats',
 '/en/squads/0049d422/Las-Palmas-Stats',
 '/en/squads/dcc91a7b/Valencia-Stats',
 '/en/squads/7848bd64/Getafe-Stats',
 '/en/squads/03c57e2b/Osasuna-Stats',
 '/en/squads/8d6fd021/Alaves-Stats',
 '/en/squads/2a8183b3/Villarreal-Stats',
 '/en/squads/98e8af82/Rayo-Vallecano-Stats',
 '/en/squads/ad2be733/Sevilla-Stats',
 '/en/squads/f25da7fb/Celta-Vigo-Stats',
 '/en/squads/2aa12281/Mallorca-Stats',
 '/en/squads/ee7c297c/Cadiz-Stats',
 '/en/squads/a0435291/Granada-Stats',
 '/en/squads/78ecf4bb/Almeria-Stats']

We notice that the links are relative (they have no domain), so we complete them with an f-string that adds our domain to the beginning of each link.

In [None]:
# Prefix each relative link with the site origin. https matches the scheme
# used for every other fbref.com URL in this notebook, so the request is not
# sent over plain http.
team_urls = [f"https://fbref.com{l}" for l in links]

In [None]:
team_urls

['http://fbref.com/en/squads/53a2f082/Real-Madrid-Stats',
 'http://fbref.com/en/squads/9024a00a/Girona-Stats',
 'http://fbref.com/en/squads/206d90db/Barcelona-Stats',
 'http://fbref.com/en/squads/db3b9613/Atletico-Madrid-Stats',
 'http://fbref.com/en/squads/2b390eca/Athletic-Club-Stats',
 'http://fbref.com/en/squads/fc536746/Real-Betis-Stats',
 'http://fbref.com/en/squads/e31d1cd9/Real-Sociedad-Stats',
 'http://fbref.com/en/squads/0049d422/Las-Palmas-Stats',
 'http://fbref.com/en/squads/dcc91a7b/Valencia-Stats',
 'http://fbref.com/en/squads/7848bd64/Getafe-Stats',
 'http://fbref.com/en/squads/03c57e2b/Osasuna-Stats',
 'http://fbref.com/en/squads/8d6fd021/Alaves-Stats',
 'http://fbref.com/en/squads/2a8183b3/Villarreal-Stats',
 'http://fbref.com/en/squads/98e8af82/Rayo-Vallecano-Stats',
 'http://fbref.com/en/squads/ad2be733/Sevilla-Stats',
 'http://fbref.com/en/squads/f25da7fb/Celta-Vigo-Stats',
 'http://fbref.com/en/squads/2aa12281/Mallorca-Stats',
 'http://fbref.com/en/squads/ee7c297c/

# Extract Match Stats using Pandas and Requests

In [None]:
# Work with the first squad URL in the list (Real Madrid in the output shown
# above) before generalising to every team.
team_url = team_urls[0]

In [None]:
# Download the team's page; fail fast on a timeout or an HTTP error status
# rather than parsing an error page in the cells below.
data = requests.get(team_url, timeout=30)
data.raise_for_status()

So we selected the Real Madrid page and downloaded its HTML into `data`.

Now open the link and look for the table we want. We notice that we want the table that contains the string "Scores & Fixtures".

We need to convert that table into a pandas DataFrame, so we use the pandas function `read_html()`.

In [None]:
import pandas as pd

In [None]:
from io import StringIO

# read_html returns a list with every table whose text matches `match`.
# Wrapping the markup in StringIO hands pandas a file-like object; newer
# pandas versions deprecate passing a literal HTML string.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")

`matches` is a list of the tables that contain the matching string (we use its first element below); we could also have located the table by its id "div_matchlogs_for".

In [None]:
matches[0].head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,0.0,Athletic Club,0.9,0.4,54.0,48927.0,Dani Carvajal,4-1-2-1-2◆,Jesús Gil,Match Report,
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3.0,1.0,Almería,2.0,1.3,57.0,17561.0,Dani Carvajal,4-3-1-2,José Sánchez,Match Report,
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1.0,0.0,Celta Vigo,1.4,1.2,63.0,23057.0,Dani Carvajal,4-1-2-1-2◆,Isidro Díaz de Mera,Match Report,
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2.0,1.0,Getafe,2.8,0.4,76.0,66747.0,Luka Modrić,4-3-1-2,Mario Melero,Match Report,
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2.0,1.0,Real Sociedad,2.0,1.6,52.0,70092.0,Dani Carvajal,4-1-2-1-2◆,César Soto,Match Report,


Now we'll take the data in the Shooting table, which gives us extra features for predicting. We fetch it in a similar manner as before.

In [None]:
# Parse the team page. The parser is named explicitly so the result does not
# depend on which optional parser libraries are installed.
soup = BeautifulSoup(data.text, "html.parser")

In [None]:
# Every <a> tag on the team page (not just those inside one table).
links = soup.find_all('a')

In [None]:
# Replace each tag with its href value (None when the tag has no href).
links = [l.get("href") for l in links]

In [None]:
# Keep the links to the all-competitions shooting match log; `l and` skips
# the None entries left by anchors without an href.
links = [l for l in links if l and 'all_comps/shooting/' in l]

In [None]:
links

['/en/squads/53a2f082/2023-2024/matchlogs/all_comps/shooting/Real-Madrid-Match-Logs-All-Competitions',
 '/en/squads/53a2f082/2023-2024/matchlogs/all_comps/shooting/Real-Madrid-Match-Logs-All-Competitions',
 '/en/squads/53a2f082/2023-2024/matchlogs/all_comps/shooting/Real-Madrid-Match-Logs-All-Competitions',
 '/en/squads/53a2f082/2023-2024/matchlogs/all_comps/shooting/Real-Madrid-Match-Logs-All-Competitions']

In [None]:
# Fetch the shooting match-log page using the first matching link (the output
# above shows the same URL repeated). Fail fast on a timeout or an HTTP error
# status instead of parsing an error page below.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

now we'll save shooting data in "shooting" variable using "read_html"

In [None]:
from io import StringIO

# Take the first table whose text matches "Shooting". The markup is wrapped
# in StringIO because newer pandas versions deprecate passing a literal HTML
# string to read_html.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

In [None]:
shooting.head()  # head() lets us see first 5 elements

Unnamed: 0_level_0,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,For Real Madrid,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,0.0,Athletic Club,...,16.0,2.0,0,0,0.9,0.9,0.07,1.1,1.1,Match Report
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3.0,1.0,Almería,...,17.0,1.0,0,0,2.0,2.0,0.08,1.0,1.0,Match Report
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1.0,0.0,Celta Vigo,...,19.4,0.0,0,1,1.4,0.6,0.06,-0.4,0.4,Match Report
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2.0,1.0,Getafe,...,17.7,0.0,0,0,2.8,2.8,0.11,-0.8,-0.8,Match Report
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2.0,1.0,Real Sociedad,...,15.9,1.0,0,0,2.0,2.0,0.13,0.0,0.0,Match Report


# Cleaning and Merging Dataframes using Pandas

The shooting table has two levels of column headers. We don't need the top one, so we remove it with `droplevel()`.

In [None]:
# The shooting table comes back with a two-level column index (see the
# head() output above). Drop the top level, but only while it is still
# there, so that re-running this cell does not raise on the already
# flattened columns.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [None]:
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,0.0,Athletic Club,...,16.0,2.0,0,0,0.9,0.9,0.07,1.1,1.1,Match Report
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3.0,1.0,Almería,...,17.0,1.0,0,0,2.0,2.0,0.08,1.0,1.0,Match Report
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1.0,0.0,Celta Vigo,...,19.4,0.0,0,1,1.4,0.6,0.06,-0.4,0.4,Match Report
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2.0,1.0,Getafe,...,17.7,0.0,0,0,2.8,2.8,0.11,-0.8,-0.8,Match Report
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2.0,1.0,Real Sociedad,...,15.9,1.0,0,0,2.0,2.0,0.13,0.0,0.0,Match Report


In [None]:
shooting["Date"] # gives us dates column

0     2023-08-12
1     2023-08-19
2     2023-08-25
3     2023-09-02
4     2023-09-17
5     2023-09-20
6     2023-09-24
7     2023-09-27
8     2023-09-30
9     2023-10-03
10    2023-10-07
11    2023-10-21
12    2023-10-24
13    2023-10-28
14    2023-11-05
15    2023-11-08
16    2023-11-11
17    2023-11-26
18    2023-11-29
19    2023-12-02
20    2023-12-09
21    2023-12-12
22    2023-12-17
23    2023-12-21
24    2024-01-03
25    2024-01-06
26    2024-01-10
27    2024-01-14
28    2024-01-18
29    2024-01-21
30    2024-01-27
31    2024-02-01
32    2024-02-04
33    2024-02-10
34           NaN
Name: Date, dtype: object

Both of our DataFrames describe the same matches, so we merge them into one using pandas `merge()`, taking only some of the columns from the `shooting` DataFrame.

In [None]:
# Join on "Date" using merge's default inner join, so only dates present in
# both tables are kept; only the listed shooting columns are brought across.
team_data = matches[0].merge(shooting[["Date","Sh","SoT","Dist","FK","PK","PKatt"]], on = "Date")

In [None]:
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,0.0,Athletic Club,...,4-1-2-1-2◆,Jesús Gil,Match Report,,14,8,16.0,2.0,0,0
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3.0,1.0,Almería,...,4-3-1-2,José Sánchez,Match Report,,25,9,17.0,1.0,0,0
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1.0,0.0,Celta Vigo,...,4-1-2-1-2◆,Isidro Díaz de Mera,Match Report,,9,2,19.4,0.0,0,1
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2.0,1.0,Getafe,...,4-3-1-2,Mario Melero,Match Report,,26,12,17.7,0.0,0,0
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2.0,1.0,Real Sociedad,...,4-1-2-1-2◆,César Soto,Match Report,,17,8,15.9,1.0,0,0


In [None]:
matches[0].shape #check no of rows,cols of merges dataframes.. if they match or not

(50, 19)

In [None]:
shooting.shape

(35, 26)

# Scraping Data for Multiple Seasons and Teams using Loop

In [None]:
# Season labels to scrape, newest first: [2024, 2023].
years = list(range(2024, 2022, -1))
# Collects one DataFrame per team and season; filled by the scraping loop.
all_matches = []

In [None]:
years

[2024, 2023]

In [None]:
# Standings page from which the scraping loop starts.
standings_url = "https://fbref.com/en/comps/12/La-Liga-Stats"

In [None]:
# Derive the team name from the last URL segment: drop the "-Stats" suffix
# and turn the hyphens into spaces.
# Note: the scraping loop wraps the merge in a try block so that a team whose
# shooting data cannot be merged is skipped.
team_url.split("/")[-1].replace("-Stats","").replace("-"," ")

'Real Madrid'

In [None]:
import time
from io import StringIO

# Same 10-second pause the notebook used before, now applied after every
# request (previously it ran once per team and was skipped when a team was
# skipped).
REQUEST_DELAY_SECONDS = 10
REQUEST_TIMEOUT_SECONDS = 30
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch_html(url):
    """Download ``url`` and return the response body as text.

    Sleeps for REQUEST_DELAY_SECONDS after every request, including failed
    ones, and raises ``requests.HTTPError`` for an HTTP error status so that
    an error page is never handed to the parsers.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    time.sleep(REQUEST_DELAY_SECONDS)
    response.raise_for_status()
    return response.text


def _hrefs(node):
    """Return the href of every <a> tag under ``node``, skipping missing ones."""
    hrefs = (a.get("href") for a in node.find_all("a"))
    return [h for h in hrefs if h]


def _scrape_team(team_url):
    """Return one team's fixtures merged with its shooting stats.

    Returns None when the team page has no shooting link or when the shooting
    table cannot be merged (expected columns missing), so the caller can skip
    that team.
    """
    team_html = _fetch_html(team_url)
    fixtures = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

    team_soup = BeautifulSoup(team_html, "html.parser")
    shooting_links = [h for h in _hrefs(team_soup) if "all_comps/shooting/" in h]
    if not shooting_links:
        return None

    shooting_html = _fetch_html(f"https://fbref.com{shooting_links[0]}")
    shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
    if shooting.columns.nlevels > 1:
        shooting.columns = shooting.columns.droplevel()

    try:
        # Selecting absent columns raises KeyError; merge itself can raise
        # ValueError. Either way this team has no usable shooting data.
        return fixtures.merge(shooting[SHOOTING_COLUMNS], on="Date")
    except (KeyError, ValueError):
        return None


# Start from a clean list so that re-running this cell does not append
# duplicate frames.
all_matches = []
# Walk back through the seasons with a local URL; standings_url itself is
# left untouched so a re-run starts from the same page again.
season_url = standings_url

for year in years:
    if season_url is None:
        break  # the previous standings page had no "previous season" link

    standings_soup = BeautifulSoup(_fetch_html(season_url), "html.parser")
    standings_table = standings_soup.select("table.stats_table")[0]
    team_urls = [
        f"https://fbref.com{h}" for h in _hrefs(standings_table) if "/squads/" in h
    ]

    prev_links = standings_soup.select("a.prev")
    if prev_links:
        previous_season = prev_links[0].get("href")
        season_url = f"https://fbref.com{previous_season}"
    else:
        season_url = None

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_data = _scrape_team(team_url)
        if team_data is None:
            continue

        # .copy() turns the filtered rows into an independent frame, so adding
        # the columns below does not trigger SettingWithCopyWarning.
        team_data = team_data[team_data["Comp"] == "La Liga"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Season"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Team"] = team_name


In [None]:
# Stack the per-team frames into one DataFrame. Each frame keeps its own row
# labels (no ignore_index), so index values repeat across teams.
match_df = pd.concat(all_matches)

In [None]:
# Normalise every column name to lowercase.
match_df.columns = list(map(str.lower, match_df.columns))

In [None]:
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-08-12,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,0.0,Athletic Club,...,Match Report,,14.0,8.0,16.0,2.0,0,0,2024,Real Madrid
1,2023-08-19,19:30,La Liga,Matchweek 2,Sat,Away,W,3.0,1.0,Almería,...,Match Report,,25.0,9.0,17.0,1.0,0,0,2024,Real Madrid
2,2023-08-25,21:30,La Liga,Matchweek 3,Fri,Away,W,1.0,0.0,Celta Vigo,...,Match Report,,9.0,2.0,19.4,0.0,0,1,2024,Real Madrid
3,2023-09-02,16:15,La Liga,Matchweek 4,Sat,Home,W,2.0,1.0,Getafe,...,Match Report,,26.0,12.0,17.7,0.0,0,0,2024,Real Madrid
4,2023-09-17,21:00,La Liga,Matchweek 5,Sun,Home,W,2.0,1.0,Real Sociedad,...,Match Report,,17.0,8.0,15.9,1.0,0,0,2024,Real Madrid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2023-05-14,16:15,La Liga,Matchweek 34,Sun,Home,W,1,0,Atlético Madrid,...,Match Report,,15.0,4.0,13.4,0.0,0,0,2023,Elche
37,2023-05-20,18:30,La Liga,Matchweek 35,Sat,Away,D,1,1,Getafe,...,Match Report,,13.0,4.0,12.8,0.0,0,0,2023,Elche
38,2023-05-24,19:30,La Liga,Matchweek 36,Wed,Home,D,1,1,Sevilla,...,Match Report,,20.0,7.0,17.5,0.0,0,0,2023,Elche
39,2023-05-28,19:00,La Liga,Matchweek 37,Sun,Away,W,1,0,Athletic Club,...,Match Report,,6.0,2.0,14.2,0.0,0,0,2023,Elche


In [None]:
len(all_matches)

40

In [None]:
# Save the combined data to matches.csv in the current working directory.
# With the default settings the row index is written as the first column.
match_df.to_csv("matches.csv")