In [177]:
from bs4 import BeautifulSoup
from dataclasses import dataclass
import pandas as pd
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re


## Fiche combattant 

In [161]:
driver = webdriver.Chrome()

url = "https://www.ufc.com/athlete/brandon-moreno"

driver.get(url)

driver.implicitly_wait(10)

html_content = driver.page_source

driver.quit()

In [162]:
soup = BeautifulSoup(html_content, "html.parser")

In [187]:
@dataclass
class Fighter:
    name: str
    age: int
    height: str
    weight: str
    division : str
    reach: int
    leg_reach: int
    combat_style: str
    win : int
    loss : int
    draw : int
    ko_win : int
    sub_win : int
    decision_win : int
    first_round_finishes : int
    Title_holder : bool
    strike_precision : int
    take_down_precision : int
    sig_str_atteri : int
    sig_frappes_encaissées : int
    takedown_avg : int
    envoi_avg : int
    sig_str_defense : float
    take_down_defense : float
    knock_down_avg : int
    temps_de_combat_moyen : int
    position_permanente : int
    position_clinch : int
    position_sol : int
    sig_str_head: int
    sig_str_body: int
    sig_str_leg: int



In [186]:
def extraire_info_combattant(soup):
    dictio = defaultdict(str)
    recap_combattant = soup.select_one("div.hero-profile > div.hero-profile__info")
    info_combatant = soup.select("div.c-bio__field")
    fiche_combattant, cbt_name = recap_combattant.find_all("p"), recap_combattant.find("h1").text
    required = ["Style de combat","Âge","La Taille","Poids","Reach","Portée de la jambe"]
    dictio['Name'] = cbt_name

    infos_principal_combattant(fiche_combattant, dictio)
    Bio_combatant(info_combatant, dictio,required)
    Tenant_titre(soup,dictio)
    stats_combatant(soup,dictio)
    stats_corps_combatant(soup,dictio)
    pourcentage_touche_takedown(soup,dictio)
    mesures_combattant(soup,dictio)

    return dictio

def infos_principal_combattant(fiche_combattant, dictio):
    for item in fiche_combattant:
        if any(clss in ['hero-profile__division-title', 'hero-profile__division-body'] for clss in item.get('class', [])):
            text = item.text.strip()
            if ' (W-L-D)' in text:
                record, _ = text.split(' (')
                wins, losses, draws = record.split('-')
                dictio['Win'] = int(wins)
                dictio['Losses'] = int(losses)
                dictio['Draws'] = int(draws)
            else:
                dictio["Division"] = text
                if "Women's" in text:
                    dictio["Genre"] = "Female"
                else:
                    dictio["Genre"] = "Male"

def Bio_combatant(info_combatant, dictio, required):
    for item in info_combatant:
        label = item.find("div", class_="c-bio__label")
        text = item.find("div", class_="c-bio__text")
    
        if label and text:
            if label.text.strip() in required:
                if text.find("div"):
                    text = text.find("div") #cas de couche caché 
                val = text.text.strip() if text else None
                dictio[label.text.strip()] = float(val) if bool(re.fullmatch(r'\d+(\.\d+)?',val)) else val

def Tenant_titre(soup,dictio):
    soup.find_all("p", class_="hero-profile__tag")
    if any('Title Holder' in tag.text for tag in soup.find_all("p", class_="hero-profile__tag")):
        dictio["Title_holder"] = True
    else : 
        dictio["Title_holder"] = False

def stats_combatant(soup,dictio) :
    liste_objective = ['Permanent', 'Clinch', 'Sol', 'KO/TKO', 'DEC', 'SUB']
    groups = soup.find_all("div", class_="c-stat-3bar__group")
    if groups:
        for group in groups:
            label = group.find("div", class_="c-stat-3bar__label") #case bas gauche et bas droite de la section stats
            value = group.find("div", class_="c-stat-3bar__value")
            if label and value:
                cleaned_value = re.sub(r'\s*\(.*?\)', '', value.text).strip()
                dictio[label.text.strip()] = int(cleaned_value)
            else:
                dictio[label.text.strip()] = None
    else :
        for obj in liste_objective:
            dictio[obj] = None

def stats_corps_combatant(soup,dictio):
    # ['sig_str_head', 'sig_str_body', 'sig_str_leg']
    body_part = ["head", "body", "leg"]
    for part in body_part:
        small_soup = soup.find("g", id=f"e-stat-body_x5F__x5F_{part}-txt")
        if small_soup:
            texts = small_soup.find_all('text')
            if len(texts) > 1:
                dictio[f"sig_str_{part}"] = int(texts[1].text.strip()) # 1 On prend l'entier , mettre 0 pour le pourcentage
        else:
            dictio[f"sig_str_{part}"] = None

def pourcentage_touche_takedown(soup,dictio):
    liste_objective = ["Précision_saisissante", "Précision_de_Takedown"]
    percentage_text = soup.select('svg.e-chart-circle > title')
    pattern = re.compile(r'([a-zA-Zéèêàç\s]+)(\d+%)')

    if not percentage_text:
        dictio["Précision_saisissante"] = None
        dictio["Précision_de_Takedown"] = None
    else :
        for chaine in percentage_text:
            match = pattern.match(chaine.text)
            if match:
                mots = match.group(1).strip().replace(' ', '_')
                pourcentage = match.group(2).strip()
                dictio[mots] = float(pourcentage.rstrip('%'))     
        mot_manquants = [mot for mot in liste_objective if mot not in dictio.keys()]
        if mot_manquants:
            dictio[f"{mot_manquants[0]}"] = None

def mesures_combattant(soup, dictio):
    liste_objective = [
        'Sig. Str. A atterri', 'Sig. Frappes Encaissées', 
        'Takedown avg', 'Envoi avg', 
        'Sig. Str.défense', 'Défense de démolition', 
        'Knockdown Avg', 'Temps de combat moyen'
    ]

    def extract_number(element):
        if not element:
            return None
        try:
            if element.find("div", class_="c-stat-compare__percent"):
                element.find("div", class_="c-stat-compare__percent").extract()
            text = element.text.strip()
            return float(re.sub(r'[^\d.]+', '', text))
        except ValueError:
            return None

    def convert_minutes(time_str):
        try:
            hours, minutes = map(int, time_str.split(':'))
            return hours * 60 + minutes
        except ValueError:
            return None

    groups = soup.find_all("div", class_="c-stat-compare__group")

    temp_data = {}

    for group in groups:
        label = group.find("div", class_="c-stat-compare__label")
        value = group.find("div", class_="c-stat-compare__number")

        if label:
            label_text = label.text.strip()
            if value:
                value_text = value.text.strip()
                if ":" in value_text:  
                    temp_data[label_text] = convert_minutes(value_text)
                else:
                    temp_data[label_text] = extract_number(value)
            else:
                temp_data[label_text] = None  

    for obj in liste_objective:
        dictio[obj] = temp_data.get(obj, None) # Pour eviter les eventuelles decalages

dictio = extraire_info_combattant(soup)
dictio


AttributeError: 'NoneType' object has no attribute 'find_all'

In [165]:
len(dictio)

32

voir pour implémenter l'anciennete du combattant en prenant la date de son premier combat

Cas combattant sans stats 

In [175]:
cbt = "https://www.ufc.com/athlete/jose-aldo"

driver = webdriver.Chrome()

driver.get(cbt)

driver.implicitly_wait(10)

html_content = driver.page_source

driver.quit()


In [176]:
soup = BeautifulSoup(html_content, "html.parser")

dictio = extraire_info_combattant(soup)

dictio

defaultdict(str,
            {'Name': 'Jose Aldo',
             'Division': 'Bantamweight Division',
             'Genre': 'Male',
             'Win': 32,
             'Losses': 9,
             'Draws': 0,
             'Style de combat': 'Jiu-Jitsu',
             'Âge': 38.0,
             'La Taille': 67.0,
             'Poids': 136.0,
             'Reach': 70.0,
             'Portée de la jambe': 40.0,
             'Title_holder': False,
             'Permanent': None,
             'Clinch': None,
             'Sol': None,
             'KO/TKO': None,
             'DEC': None,
             'SUB': None,
             'sig_str_head': None,
             'sig_str_body': None,
             'sig_str_leg': None,
             'Précision_saisissante': None,
             'Précision_de_Takedown': None,
             'Sig. Str. A atterri': None,
             'Sig. Frappes Encaissées': None,
             'Takedown avg': None,
             'Envoi avg': None,
             'Sig. Str.défense': None,
   

Voir pour implémenter une recherche de tous les combats d'un combattant ne possédant pas de stats sur sa main page afin de recolter des stats.

## Page principale

In [179]:
driver = webdriver.Chrome()

driver.get("https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23")

driver.implicitly_wait(5)

front_content = driver.page_source

driver.quit()


In [185]:
def recolte_pages_combattants(soup):
    elements = soup.find_all("a", href = re.compile(r'/athlete/[\w]+-[\w]+') ,class_="e-button--black")
    hrefs = [f'https://www.ufc.com{element['href']}' for element in elements]
    return hrefs

In [180]:
soup = BeautifulSoup(front_content, "html.parser")

hrefs = recolte_pages_combattants(soup)

hrefs

['https://www.ufc.com/athlete/nariman-abbassov',
 'https://www.ufc.com/athlete/hamdy-abdelwahab',
 'https://www.ufc.com/athlete/mansur-abdul-malik',
 'https://www.ufc.com/athlete/israel-adesanya',
 'https://www.ufc.com/athlete/fabio-agu',
 'https://www.ufc.com/athlete/jesus-aguilar',
 'https://www.ufc.com/athlete/nick-aguirre',
 'https://www.ufc.com/athlete/abdul-kareem-al-selwady',
 'https://www.ufc.com/athlete/herdem-alacabek',
 'https://www.ufc.com/athlete/amir-albazi',
 'https://www.ufc.com/athlete/irene-aldana']

In [None]:
def visite_page_combattant(driver, url):
    driver.get(url)
    driver.implicitly_wait(3)
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, "html.parser")
    dictio = extraire_info_combattant(soup)
    return dictio

In [171]:
driver = webdriver.Chrome()
for url in hrefs:
    visite_page_combattant(driver, url)
driver.quit()

In [183]:
driver = webdriver.Chrome()

driver.get("https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23")

driver.implicitly_wait(5)

front_content = driver.page_source

front_soup = BeautifulSoup(front_content, "html.parser")

liste_test = []
hrefs = recolte_pages_combattants(front_soup)
for url in hrefs:
    dictio = visite_page_combattant(driver, url)
    liste_test.append(dictio)

# driver.find_element(By.XPATH, "//a[@title='Load more items']").click()

driver.quit()

pd.DataFrame(liste_test)

Unnamed: 0,Name,Division,Genre,Win,Losses,Draws,Âge,Poids,Title_holder,Permanent,...,Takedown avg,Envoi avg,Sig. Str.défense,Défense de démolition,Knockdown Avg,Temps de combat moyen,Style de combat,La Taille,Reach,Portée de la jambe
0,Nariman Abbassov,Lightweight Division,Male,0,1,0,29.0,156.0,False,45,...,0.0,0.0,46.0,67.0,0.0,900,,,,
1,Hamdy Abdelwahab,Heavyweight Division,Male,5,0,0,30.0,264.5,False,30,...,3.0,0.0,59.0,,1.0,900,Wrestler,74.0,72.0,41.0
2,Mansur Abdul-Malik,Middleweight Division,Male,7,0,0,27.0,186.0,False,28,...,0.0,0.0,54.0,75.0,1.29,350,Freestyle,74.0,79.5,43.0
3,Israel Adesanya,Middleweight Division,Male,24,4,0,35.0,184.0,False,1144,...,0.05,0.14,56.0,76.0,0.63,1098,Freestyle,76.0,80.0,44.5
4,Fabio Agu,Middleweight Division,Male,0,0,0,35.0,,False,0,...,,,,,,0,,,,
5,Jesus Aguilar,Flyweight Division,Male,11,2,0,28.0,127.5,False,26,...,1.75,1.75,58.0,36.0,0.44,410,Freestyle,64.0,62.5,36.0
6,Nick Aguirre,Bantamweight Division,Male,7,2,0,27.0,142.0,False,18,...,1.16,0.58,46.0,33.0,0.0,779,Freestyle,69.0,74.0,40.0
7,Abdul-Kareem Al-Selwady,Lightweight Division,Male,15,4,0,28.0,172.4,False,102,...,2.32,0.0,59.0,20.0,0.0,775,MMA,68.0,69.0,39.0
8,Herdem Alacabek,Light Heavyweight Division,Male,0,0,0,32.0,205.0,False,24,...,2.06,0.0,23.0,43.0,0.0,874,,74.0,74.5,41.0
9,Amir Albazi,Flyweight Division,Male,17,2,0,31.0,125.5,False,202,...,1.39,0.52,62.0,50.0,0.35,865,Jiu-Jitsu,65.0,68.0,38.0


In [None]:
df = pd.DataFrame(liste_test)
df.describe()

Unnamed: 0,Win,Losses,Draws,Âge,Poids,Permanent,Clinch,Sol,KO/TKO,DEC,...,Sig. Frappes Encaissées,Takedown avg,Envoi avg,Sig. Str.défense,Défense de démolition,Knockdown Avg,Temps de combat moyen,La Taille,Reach,Portée de la jambe
count,11.0,11.0,11.0,11.0,10.0,11.0,11.0,11.0,11.0,11.0,...,10.0,10.0,10.0,10.0,9.0,10.0,11.0,9.0,9.0,9.0
mean,9.181818,2.090909,0.0,30.727273,169.89,233.363636,14.909091,20.545455,4.727273,2.090909,...,4.098,1.188,0.321,51.9,53.111111,0.393,711.818182,70.333333,72.0,40.111111
std,7.947555,2.467977,0.0,3.349355,42.863334,408.030458,23.304311,21.713423,4.839234,2.736953,...,1.736758,1.099382,0.548421,11.464438,21.566435,0.457166,321.69483,4.330127,5.678908,2.607415
min,0.0,0.0,0.0,27.0,125.5,0.0,0.0,0.0,0.0,0.0,...,1.26,0.0,0.0,23.0,20.0,0.0,0.0,64.0,62.5,36.0
25%,2.5,0.0,0.0,28.0,137.5,25.0,1.5,0.0,0.5,0.0,...,3.15,0.075,0.0,48.0,36.0,0.0,592.5,68.0,68.5,38.5
50%,7.0,2.0,0.0,30.0,164.2,30.0,3.0,18.0,5.0,0.0,...,3.655,1.275,0.07,56.0,50.0,0.285,865.0,69.0,72.0,40.0
75%,15.0,3.0,0.0,33.5,185.5,152.0,16.0,33.5,7.0,3.5,...,5.305,1.9825,0.445,58.75,75.0,0.5825,889.5,74.0,74.5,41.0
max,24.0,8.0,0.0,36.0,264.5,1144.0,64.0,62.0,16.0,8.0,...,6.66,3.0,1.75,62.0,78.0,1.29,1098.0,76.0,80.0,44.5


Deroulage de page