In [27]:
import requests
from bs4 import BeautifulSoup

In [28]:
# Entry page for the scrape: the Série A statistics page on fbref (Portuguese site).
scraping_url = "https://fbref.com/pt/comps/24/Serie-A-Estatisticas"

# Browser-like User-Agent string that is sent along with the HTTP requests.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
headers = {'User-Agent': user_agent}

In [29]:
# Download the competition landing page.
# Any HTTP error status (429 Too Many Requests included) is raised as an
# exception rather than calling exit(): exit() inside a notebook ends the
# kernel session and discards all state, and it only covered the 429 case.
# The timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(scraping_url, headers=headers, timeout=30)
res.raise_for_status()

In [30]:
# Parse the landing page. The parser is named explicitly so the resulting tree
# does not depend on which optional parser libraries happen to be installed
# (and so bs4 does not warn about having to guess one).
soup = BeautifulSoup(res.text, "html.parser")

In [31]:
import pandas as pd
from io import StringIO

# Extract the first table matching "Regular season" from the page downloaded
# earlier and relabel three of its column headers.
# rename() returns a new frame, so the result is stored back: the variable now
# holds the same headers that the cell displays instead of the stale ones.
competition_table = (
    pd.read_html(StringIO(res.text), match="Regular season")[0]
    .rename(columns={
        'Cl': '#',
        'MP': 'PJ',
        'GD': 'SG',
    })
)

competition_table

Unnamed: 0,#,Equipe,PJ,V,E,D,GP,GC,SG,Pt,Pts/PPJ,xG,xGA,xGD,xGD/90,Últimos 5,Público,Artilheiro da equipe,Goleiro,Notas
0,1,Bragantino,3,2,1,0,5,3,2,7,233,2.6,4.1,-1.6,-53,E V V,4.733,Vitinho - 2,"Cleiton Schwengber, Lucão",
1,2,Flamengo,3,2,1,0,4,2,2,7,233,2.7,2.7,0.0,2,V V E,,Nicolás De La Cruz - 2,Agustín Rossi,
2,3,Botafogo (RJ),3,2,0,1,8,4,4,6,200,5.6,2.8,2.8,93,D V V,,"Danilo Barbosa, Tiquinho Soares - 2",Gatito Fernández,
3,4,Ath Paranaense,3,2,0,1,5,2,3,6,200,3.0,3.0,0.0,-1,V D V,14.892,Agustín Canobbio - 2,Bento,
4,5,Grêmio,3,2,0,1,4,2,2,6,200,3.0,2.4,0.6,19,D V V,6.618,Franco Cristaldo - 2,Agustín Marchesín,
5,6,Internacional,3,2,0,1,3,2,1,6,200,4.3,2.9,1.4,46,V V D,,Wesley - 2,Sergio Rochet,
6,7,Atlético Mineiro,3,1,2,0,4,1,3,5,167,2.4,1.6,0.8,26,E E V,19.771,"Guilherme Arana, Gustavo Scarpa... - 1",Éverson,
7,8,Fortaleza,2,1,1,0,3,2,1,4,200,1.0,1.3,-0.4,-18,V E,,"Juan Martín Lucero, Imanol Machuca... - 1",João Ricardo Riedi,
8,9,Bahia,3,1,1,1,5,5,0,4,133,3.8,3.6,0.2,7,D V E,23.08,Biel - 2,Marcos Felipe,
9,10,Fluminense,3,1,1,1,5,5,0,4,133,3.4,4.1,-0.7,-23,E D V,8.823,Lima - 2,Fábio,


In [32]:
# Seasons to scrape, most recent first: 2024 down to 2020 inclusive.
years = list(reversed(range(2020, 2025)))

In [33]:
import time

REQUEST_DELAY_SECONDS = 3  # pause applied after every single HTTP request
SHOOTING_COLUMNS = ["Data", "TC", "CaG", "Dist", "FK", "PB", "PT"]


def fetch_html(url):
    """Download ``url`` and return the response body as text.

    The pause happens after every request (season page, team page and shooting
    page alike), so skipped teams no longer bypass the throttle.

    Raises:
        requests.HTTPError: for any error status (e.g. 429), so an error page
            is never handed to the HTML/table parsers.
    """
    response = requests.get(url, headers=headers, timeout=30)
    time.sleep(REQUEST_DELAY_SECONDS)
    response.raise_for_status()
    return response.text


all_matches = []

# Walk backwards through the seasons using a local URL. The module-level
# `scraping_url` is left untouched, so re-running this cell starts again from
# the same (most recent) season and the `year` labels stay correct.
season_url = scraping_url

for year in years:
    if season_url is None:
        print(f"No previous-season link found; stopping before {year}.")
        break

    season_soup = BeautifulSoup(fetch_html(season_url), "html.parser")
    stats_table = season_soup.select('table.stats_table')[0]

    # Keep only the anchors that point at team pages; anchors without an href
    # are skipped instead of raising TypeError in the substring check.
    hrefs = [a.get("href") for a in stats_table.find_all('a')]
    team_urls = [f"https://fbref.com{h}" for h in hrefs if h and '/equipes/' in h]

    # Remember where the previous season lives for the next iteration.
    prev_links = season_soup.select('a.prev')
    season_url = f"https://fbref.com{prev_links[0].get('href')}" if prev_links else None

    for team_url in team_urls:
        # ".../Botafogo-RJ-Estatisticas" -> "Botafogo RJ"
        team_name = team_url.split("/")[-1].replace("-Estatisticas", "").replace("-", " ")

        team_html = fetch_html(team_url)
        matches = pd.read_html(StringIO(team_html), match="Resultados e Calendários")[0]

        team_soup = BeautifulSoup(team_html, "html.parser")
        team_hrefs = [a.get("href") for a in team_soup.find_all('a')]
        shooting_hrefs = [h for h in team_hrefs if h and 'all_comps/shooting/' in h]
        if not shooting_hrefs:
            # No shooting page linked from this team page: skip the team
            # rather than crash the whole scrape with IndexError.
            print(f"No shooting link for {team_name} ({year}); skipping.")
            continue

        shooting_html = fetch_html(f"https://fbref.com{shooting_hrefs[0]}")
        shooting = pd.read_html(StringIO(shooting_html), match="Chutes")[0]
        # The shooting table has a two-level header; keep only the inner level.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Data")
        except ValueError:
            # Best-effort, as before: teams whose tables cannot be merged are skipped.
            continue

        # assign() builds a new frame, avoiding column assignment on a filtered
        # slice (SettingWithCopyWarning) while producing the same columns.
        team_data = team_data[team_data["Camp."] == "Série A"].assign(
            Temporada=year,
            Time=team_name,
        )
        all_matches.append(team_data)

In [34]:
# Stack every per-team, per-season frame into one long table. Row labels are
# carried over from the individual frames, so they repeat across teams.
match_df = pd.concat(objs=all_matches, axis=0, ignore_index=False)
match_df

Unnamed: 0,Data,Horário,Camp.,Rodada,Dia,Local,Resultado,GP,GC,Oponente,...,Relatório da Partida,Notas,TC,CaG,Dist,FK,PB,PT,Temporada,Time
6,2024-04-13,21:00,Série A,Rodada da semana 1,sáb,Visitante,E,2,2,Fluminense,...,Relatório da Partida,,16,8,187.0,0.0,0,0,2024,Bragantino
7,2024-04-17,19:00,Série A,Rodada da semana 2,qua,Em casa,V,2,1,Vasco da Gama,...,Relatório da Partida,,17,5,199.0,1.0,0,0,2024,Bragantino
8,2024-04-20,18:30,Série A,Rodada da semana 3,sáb,Em casa,V,1,0,Corinthians,...,Relatório da Partida,,10,3,225.0,0.0,0,0,2024,Bragantino
2,2024-04-14,16:00,Série A,Rodada da semana 1,dom,Visitante,V,2.0,1.0,Atl Goianiense,...,Relatório da Partida,,14,1,242.0,1.0,1,1,2024,Flamengo
3,2024-04-17,21:30,Série A,Rodada da semana 2,qua,Em casa,V,2.0,1.0,São Paulo,...,Relatório da Partida,,12,6,163.0,0.0,0,0,2024,Flamengo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,2021-02-05,20:00,Série A,Rodada da semana 34,sex,Em casa,D,0,1,Sport Recife,...,Relatório da Partida,,25,5,200.0,2.0,0,0,2020,Botafogo RJ
34,2021-02-08,20:00,Série A,Rodada da semana 35,seg,Em casa,D,2,5,Grêmio,...,Relatório da Partida,,18,6,174.0,1.0,0,0,2020,Botafogo RJ
35,2021-02-13,17:00,Série A,Rodada da semana 36,sáb,Visitante,D,0,2,Goiás,...,Relatório da Partida,,11,3,179.0,0.0,0,0,2020,Botafogo RJ
36,2021-02-22,20:00,Série A,Rodada da semana 37,seg,Em casa,V,1,0,São Paulo,...,Relatório da Partida,,19,8,213.0,1.0,0,0,2020,Botafogo RJ


In [35]:
# Remove the 'Notas' column, lower-case every header and write the result out.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
match_df = match_df.drop(columns='Notas', errors='ignore')
match_df.columns = [c.lower() for c in match_df.columns]
# The index is written as the first CSV column (pandas default), as before.
match_df.to_csv('matches.csv')