In [1]:
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz, process
import json
import logging
import time
import pandas as pd
import os

# Configuración de logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)

# Headers y función make_request
request_headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}


In [2]:
def make_request(url, retries=5, timeout=20, retry_delay=5):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=request_headers, timeout=timeout)
            response.raise_for_status()
            return response
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            logging.warning(f"Intento {attempt + 1}: Error de conexión o tiempo agotado. Reintentando...")
        except requests.exceptions.HTTPError as e:
            logging.warning(f"HTTPError {e.response.status_code} para {url}. Reintentando...")
        time.sleep(retry_delay)
    logging.error(f"No se pudo obtener respuesta de {url} después de {retries} intentos.")
    return None

In [3]:
def scrape_transfermarkt_league(league_id, base_year, hemisphere="north"):
    """
    Scrapea clubes desde Transfermarkt para una liga y temporada específicas.
    """
    tm_year = base_year if hemisphere == "north" else base_year - 1
    url = f"https://www.transfermarkt.es/-/startseite/wettbewerb/{league_id}/plus/?saison_id={tm_year}"

    logging.info(f"Scrapeando Transfermarkt: {url}")
    response = make_request(url)
    if response is None:
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    tbody_league = soup.find_all('span')
    country_league = tbody_league[0].find('a').text.strip()
    
    tbody = soup.find_all('table')[-1]
    if not tbody:
        logging.warning("No se encontró la tabla de equipos en Transfermarkt.")
        return []

    clubs = []
    for row in tbody.find_all("tr"):
        team_cell = row.find('td', class_="no-border-links hauptlink")
        if not team_cell:
            continue

        link = team_cell.find('a')
        if not link:
            continue

        team_name = link.text.strip()
        team_id = link['href'].split('/')[4]
        clubs.append({'id': team_id, 'name': team_name, 'liga_pais':country_league})

    return clubs

In [4]:
def scrape_fbref_league(league_id, base_year, hemisphere="north"):
    """
    Scrapea clubes desde FBRef para una liga y temporada específicas.
    """
    fbref_season = f"{base_year}-{base_year + 1}" if hemisphere == "north" else str(base_year)
    url = f"https://fbref.com/en/comps/{league_id}/{fbref_season}/{fbref_season}-Stats"

    logging.info(f"Scrapeando FBRef: {url}")
    response = make_request(url)
    if response is None:
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    tbody = soup.select_one('table tbody')
    if not tbody:
        logging.warning("No se encontró la tabla de equipos en FBRef.")
        return []

    clubs = []
    for row in tbody.find_all("tr"):
        team_cell = row.find('td', {'data-stat': 'team'})
        if not team_cell:
            continue

        link = team_cell.find('a')
        if not link:
            continue

        team_name = link.text.strip()
        team_id = link['href'].split('/')[3]
        clubs.append({'id': team_id, 'name': team_name})

    return clubs

In [5]:
def match_clubs(transfermarkt_clubs, fbref_clubs, threshold=100):
    """
    Compara los clubes de Transfermarkt y FBRef utilizando fuzzy matching.
    """
    matched_clubs = {}
    unmatched_clubs = []

    for tm_club in transfermarkt_clubs:
        best_match = process.extractOne(
            tm_club['name'],
            [club['name'] for club in fbref_clubs],
            scorer=fuzz.token_set_ratio
        )

        if best_match and best_match[1] >= threshold:
            match = next(club for club in fbref_clubs if club['name'] == best_match[0])
            matched_clubs[tm_club['id']] = {
                'Club_Name': tm_club['name'],
                'FBRef_name': match['name'],
                'liga_pais': tm_club['liga_pais'],
                'FBRef_id': match['id'],
                'match_score': best_match[1]
            }
        else:
            unmatched_clubs.append({
                'id_transfermarkt': tm_club['id'],
                'name_transfermarkt': tm_club['name'],
                'best_match_name_fbref': best_match[0] if best_match else None,
                'best_match_score': best_match[1] if best_match else None,
                'liga_pais': tm_club['liga_pais']
            })

    return matched_clubs, unmatched_clubs

def save_results(file_path, data):
    """
    Guarda resultados acumulativos en JSON sin sobrescribir claves existentes.
    También imprime cuántos clubes nuevos se agregaron.
    """
    # Cargar datos existentes si el archivo ya existe
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            existing_data = json.load(f)
    else:
        existing_data = {}

    # Contar claves antes de la actualización
    initial_count = len(existing_data)

    # Agregar solo claves nuevas
    for key, value in data.items():
        if key not in existing_data:
            existing_data[key] = value

    # Contar claves después de la actualización
    final_count = len(existing_data)
    new_count = final_count - initial_count

    # Guardar resultados actualizados
    with open(file_path, 'w') as f:
        json.dump(existing_data, f, indent=4)

    # Imprimir la cantidad de nuevos clubes añadidos
    print(f"Se agregaron {new_count} clubes nuevos al archivo '{file_path}'.")



def save_unmatched_to_csv(file_path, unmatched):
    """
    Guarda los resultados no emparejados en un archivo CSV, conservando la mejor coincidencia.
    """
    unmatched_df = pd.DataFrame(unmatched)

    # Si el archivo ya existe, combinar los datos existentes con los nuevos
    if os.path.exists(file_path):
        existing = pd.read_csv(file_path)
        combined = pd.concat([existing, unmatched_df], ignore_index=True)
        unmatched_df = (
            combined.sort_values('best_match_score', ascending=False)
            .drop_duplicates(subset=['id_transfermarkt'], keep='first')
        )
    else:
        unmatched_df = unmatched_df.drop_duplicates(subset=['id_transfermarkt'], keep='first')

    # Guardar los datos actualizados en el archivo
    unmatched_df.to_csv(file_path, index=False)

In [21]:
# Ejecución
# league_id_tm = "BRA1"
# league_id_fbref = 24
base_years = range(2020, 2025)
hemisphere = "north"
leagues_id_tm = ['NL1', 'MLS1', 'FR1', 'L1']
leagues_id_fbref = [23, 22, 13, 20]

file_path = 'club_mapping_with_code.json'
unmatched_file = 'unmatched_clubs.csv'
for league_id_tm, league_id_fbref in zip (leagues_id_tm, leagues_id_fbref):
    for year in base_years:
        tm_clubs = scrape_transfermarkt_league(league_id_tm, year, hemisphere)
        fbref_clubs = scrape_fbref_league(league_id_fbref, year, hemisphere)
    
        matched, unmatched = match_clubs(tm_clubs, fbref_clubs, threshold=80)
    
        save_results(file_path, matched)
        save_unmatched_to_csv(unmatched_file, unmatched)

logging.info(f"Procesamiento completo para temporadas: {list(base_years)}")

2024-12-02 10:08:55,118 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/NL1/plus/?saison_id=2020
2024-12-02 10:09:02,605 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/23/2020-2021/2020-2021-Stats
2024-12-02 10:09:09,416 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/NL1/plus/?saison_id=2021


Se agregaron 18 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:09:17,925 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/23/2021-2022/2021-2022-Stats
2024-12-02 10:09:24,334 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/NL1/plus/?saison_id=2022


Se agregaron 3 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:09:31,380 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/23/2022-2023/2022-2023-Stats
2024-12-02 10:09:39,342 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/NL1/plus/?saison_id=2023


Se agregaron 2 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:09:41,022 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/23/2023-2024/2023-2024-Stats
2024-12-02 10:09:47,405 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/NL1/plus/?saison_id=2024


Se agregaron 1 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:09:49,092 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/23/2024-2025/2024-2025-Stats
2024-12-02 10:09:57,652 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/MLS1/plus/?saison_id=2020


Se agregaron 1 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:10:04,495 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/22/2020-2021/2020-2021-Stats
2024-12-02 10:10:14,035 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/MLS1/plus/?saison_id=2021


Se agregaron 0 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:10:20,894 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/22/2021-2022/2021-2022-Stats
2024-12-02 10:10:29,761 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/MLS1/plus/?saison_id=2022


Se agregaron 1 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:10:37,263 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/22/2022-2023/2022-2023-Stats
2024-12-02 10:10:47,581 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/MLS1/plus/?saison_id=2023


Se agregaron 0 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:10:51,134 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/22/2023-2024/2023-2024-Stats
2024-12-02 10:11:01,643 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/MLS1/plus/?saison_id=2024


Se agregaron 0 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:11:04,247 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/22/2024-2025/2024-2025-Stats
2024-12-02 10:11:13,107 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/FR1/plus/?saison_id=2020


Se agregaron 0 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:11:21,180 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/13/2020-2021/2020-2021-Stats
2024-12-02 10:11:28,761 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/FR1/plus/?saison_id=2021


Se agregaron 15 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:11:35,831 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/13/2021-2022/2021-2022-Stats
2024-12-02 10:11:42,388 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/FR1/plus/?saison_id=2022


Se agregaron 2 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:11:58,104 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/13/2022-2023/2022-2023-Stats
2024-12-02 10:12:04,671 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/FR1/plus/?saison_id=2023


Se agregaron 3 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:12:11,841 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/13/2023-2024/2023-2024-Stats
2024-12-02 10:12:18,555 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/FR1/plus/?saison_id=2024


Se agregaron 1 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:12:27,213 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/13/2024-2025/2024-2025-Stats
2024-12-02 10:12:34,039 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/L1/plus/?saison_id=2020


Se agregaron 0 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:12:35,859 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/20/2020-2021/2020-2021-Stats
2024-12-02 10:12:42,485 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/L1/plus/?saison_id=2021


Se agregaron 12 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:12:51,791 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/20/2021-2022/2021-2022-Stats
2024-12-02 10:12:57,698 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/L1/plus/?saison_id=2022


Se agregaron 2 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:13:05,169 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/20/2022-2023/2022-2023-Stats
2024-12-02 10:13:11,427 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/L1/plus/?saison_id=2023


Se agregaron 0 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:13:19,156 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/20/2023-2024/2023-2024-Stats
2024-12-02 10:13:25,330 - INFO - Scrapeando Transfermarkt: https://www.transfermarkt.es/-/startseite/wettbewerb/L1/plus/?saison_id=2024


Se agregaron 2 clubes nuevos al archivo 'club_mapping_with_code.json'.


2024-12-02 10:13:27,350 - INFO - Scrapeando FBRef: https://fbref.com/en/comps/20/2024-2025/2024-2025-Stats
2024-12-02 10:13:34,555 - INFO - Procesamiento completo para temporadas: [2020, 2021, 2022, 2023, 2024]


Se agregaron 2 clubes nuevos al archivo 'club_mapping_with_code.json'.


In [18]:
matched

{'3711': {'Club_Name': 'Cruz Azul',
  'FBRef_name': 'Cruz Azul',
  'liga_pais': 'México',
  'FBRef_id': '632f1838',
  'match_score': 100},
 '1804': {'Club_Name': 'Toluca',
  'FBRef_name': 'Toluca',
  'liga_pais': 'México',
  'FBRef_id': '44b88a4e',
  'match_score': 100},
 '7055': {'Club_Name': 'Tigres UANL',
  'FBRef_name': 'UANL',
  'liga_pais': 'México',
  'FBRef_id': 'd9e1bd51',
  'match_score': 100},
 '7633': {'Club_Name': 'Pumas UNAM',
  'FBRef_name': 'UNAM',
  'liga_pais': 'México',
  'FBRef_id': 'c9d59c6c',
  'match_score': 100},
 '2407': {'Club_Name': 'Monterrey',
  'FBRef_name': 'Monterrey',
  'liga_pais': 'México',
  'FBRef_id': 'dd5ca9bd',
  'match_score': 100},
 '13353': {'Club_Name': 'Club Tijuana',
  'FBRef_name': 'Tijuana',
  'liga_pais': 'México',
  'FBRef_id': 'a42ddf2f',
  'match_score': 100},
 '3631': {'Club_Name': 'CF América',
  'FBRef_name': 'América',
  'liga_pais': 'México',
  'FBRef_id': '18d3c3a3',
  'match_score': 100},
 '8590': {'Club_Name': 'Atlas',
  'FBRe

In [19]:
pd.read_csv(unmatched_file).drop_duplicates()

Unnamed: 0,id_transfermarkt,name_transfermarkt,best_match_name_fbref,best_match_score,liga_pais
0,19775,Dep. Riestra,Deportivo Riestra,79,Argentina
1,8781,Dep. Cuenca,Deportivo Cuenca,77,Ecuador
2,940,RC Celta,Celta Vigo,77,España
3,18099,Manta FC,Manta Fútbol Club,77,Ecuador
4,31284,Central Córdoba,Rosario Central,76,Argentina
5,9961,Deportivo Cali,AD Cali,73,Colombia
6,2402,Def. y Justicia,Defensa y Just,71,Argentina
7,10093,Indep. Medellín,Independiente,69,Colombia
8,6195,Nápoles,Napoli,67,Italia
9,679,Athletico-PR,Atlético Mineiro,67,Brasil


## Descartado