In [None]:
# tournament.py
#========================
import os
import time
import re
from datetime import datetime
import pandas as pd
import traceback

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

#from badnet.sql_connection import badminton_db
#from badnet.utils import logger

class Tournament:
    """A badminton tournament listing, optionally enriched by scraping its page.

    The constructor only stores what the listing provides and fills every
    other attribute with a default.  ``get_additional_info`` opens the
    tournament page in headless Chrome to fill in the description, referee,
    shuttle, disciplines, age groups, categories, registration dates and
    poster URL.  ``save_to_db`` appends the tournament to the ``tournaments``
    table unless its id is already present.

    NOTE(review): ``logger`` and ``badminton_db`` are used as module-level
    names, but the imports that would provide them are commented out at the
    top of this file -- make sure they are in scope before calling
    ``get_additional_info``, ``is_in_db`` or ``save_to_db``.
    """

    # Full sets of labels, in the order their check marks are read from the
    # page.  They are also the fallback when no box is ticked in a section.
    ALL_CATEGORIES = ('N', 'R', 'D', 'P', 'NC')
    ALL_DISCIPLINES = ('simple', 'double', 'mixte')
    ALL_AGE_GROUPS = ('jeunes', 'seniors', 'veterans')

    def __init__(self, name: str, url: str, date: str, ville: str, departement = "99", source="badnet"):
        """Store the listing data and initialise the scraped fields to defaults.

        Args:
            name: Tournament name.
            url: Tournament page URL; the id is whatever follows its last "=".
            date: Tournament date, kept as the string found in the listing.
            ville: City of the tournament.
            departement: Département the tournament was listed under.
            source: Name of the site the listing comes from.
        """
        self.name = name
        self.url = url
        self.id = url.split("=")[-1]
        self.departement = departement
        self.date = date
        self.description = ''
        # Fresh lists per instance so one tournament never shares state
        # with another.
        self.category = list(self.ALL_CATEGORIES)
        self.disciplines = list(self.ALL_DISCIPLINES)
        self.age_group = list(self.ALL_AGE_GROUPS)
        self.source = source
        self.date_publication = datetime.now()
        self.ville = ville
        self.date_registration_opening = ''
        self.date_registration_closed = ''
        self.ja = 'Non renseigné'
        self.url_affiche='https://badnet.fr/Img/poster/affiche_default.png'
        self.volant = 'Non renseigné'

    def __repr__(self) -> str:
        return f"Tournament {self.id} {self.name}"

    def get_additional_info(self):
        """Scrape the tournament page and update this instance in place.

        A headless Chrome session is started from the ``chromedriver`` binary
        in the current working directory and stored on ``self.driver``.  The
        session is always shut down, even when scraping raises, so a failed
        page does not leave a browser process behind.

        Sections that cannot be parsed are logged and keep their previous
        value; a missing page element outside those sections still raises.
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--window-size=1920x6080')
        chromedriver = os.getcwd() + "/chromedriver"
        self.driver = webdriver.Chrome(options = chrome_options, service=Service(chromedriver))
        try:
            self._scrape_details()
        finally:
            self.driver.quit()

    def _scrape_details(self):
        """Load the tournament page with ``self.driver`` and parse its sections."""
        driver = self.driver
        driver.get(self.url)
        time.sleep(10)
        # Click the first entry of the page's navigation bar, then wait for
        # the content to render before reading it.
        driver.find_element("xpath", "/html/body/div[2]/div/section/div/div/nav/ul/li[1]/span").click()
        time.sleep(5)
        infos = driver.find_element("class name", "infos")
        dates = driver.find_element("class name", "limit")

        try:
            notes = driver.find_element("class name", "text")
            self.description = re.search('Notes des organisateurs\n((.*\n*)*)',notes.text).group(1)
        except Exception:
            self._log_failure("organizer's notes")

        # Read the block's text once; every section below parses this snapshot.
        info_text = infos.text
        info_lines = info_text.split('\n')

        ja = re.search('Juge-arbitre : (.+)', info_text)
        self.ja = ja.group(1) if ja else 'unknown'
        volant = re.search('Volant officiel : (.+)', info_text)
        self.volant = volant.group(1) if volant else 'unknown'

        try:
            ticked = self._ticked(
                self._line_after(info_lines, 'SIMPLE DOUBLE MIXTE'),
                self.ALL_DISCIPLINES,
            )
            self.disciplines = ticked or list(self.ALL_DISCIPLINES)
        except Exception:
            self._log_failure("disciplines")

        try:
            # Only the first three columns of this five-column header are read.
            ticked = self._ticked(
                self._line_after(info_lines, 'JEUNES SÉNIORS VÉTÉRANS HANDIBAD INCLUSIF'),
                self.ALL_AGE_GROUPS,
            )
            print(','.join(ticked))
            self.age_group = ticked or list(self.ALL_AGE_GROUPS)
        except Exception:
            self._log_failure("age groups")

        try:
            # The category check marks are taken from the last line of the block.
            ticked = self._ticked(info_lines[-1], self.ALL_CATEGORIES)
            self.category = ticked or list(self.ALL_CATEGORIES)
        except Exception:
            self._log_failure("category")

        try:
            date_lines = dates.text.split('\n')
            self.date_registration_opening = date_lines[1]
            self.date_registration_closed = date_lines[3]
        except Exception:
            self._log_failure("registration dates")

        poster = driver.find_element("class name", 'flex.top')
        link = poster.find_element("tag name", "figure").find_element("tag name", "a")
        self.url_affiche = link.get_attribute("href")

    def _log_failure(self, what):
        """Log that section *what* could not be extracted, with the traceback.

        Must be called from inside an ``except`` block so that
        ``traceback.format_exc()`` has an exception to format.
        """
        logger.error(f"Could not extract {what} for {self.name} ID={self.id}")
        logger.error(traceback.format_exc())

    @staticmethod
    def _line_after(lines, header):
        """Return the line following *header*.

        Raises ValueError when *header* is absent and IndexError when it is
        the last line.
        """
        return lines[lines.index(header) + 1]

    @staticmethod
    def _ticked(flag_line, labels):
        """Return the labels whose whitespace-separated token is ``'check'``.

        Tokens are matched to *labels* by position; extra tokens are ignored.
        Raises IndexError when the line has fewer tokens than labels, so that
        a malformed line is reported instead of being half-parsed.
        """
        tokens = flag_line.split()
        if len(tokens) < len(labels):
            raise IndexError(
                f"expected at least {len(labels)} flags, found {len(tokens)}"
            )
        return [label for label, token in zip(labels, tokens) if token == 'check']

    def is_in_db(self):
        """Return True when a row with this tournament's id already exists."""
        # NOTE(review): the id comes from the scraped URL and is interpolated
        # into the SQL text.  Switch to a bound parameter once the paramstyle
        # of ``badminton_db`` is confirmed.
        row = badminton_db.execute(f"SELECT True FROM tournaments WHERE id={self.id}").fetchone()
        return bool(row)

    def _as_record(self):
        """Return the column -> value mapping written by ``save_to_db``."""
        return {
            'id': self.id,
            'name': self.name,
            'url': self.url,
            'departement': self.departement,
            'date': self.date,
            # Persist the source given to the constructor (default "badnet")
            # rather than a hard-coded value.
            'source': self.source,
            'description': self.description,
            'date_publication': self.date_publication,
            'disciplines': ','.join(self.disciplines),
            'agegroup': ','.join(self.age_group),
            'categories': ','.join(self.category),
        }

    def save_to_db(self):
        """Append this tournament to the ``tournaments`` table if it is new."""
        if self.is_in_db():
            return
        tournament = pd.DataFrame(self._as_record(), index=[0])
        tournament.to_sql('tournaments', badminton_db, if_exists='append', index=False)

In [None]:
# scraper.py
#=============================
import os
import time
import html
import re
import pandas as pd
from datetime import datetime
import logging

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.select import Select


from badnet.tournament import Tournament

class BadnetScraper:
    """Collect tournaments from a BadNet search page, one département at a time.

    Creating an instance starts a headless Chrome session from the
    ``chromedriver`` binary in the current working directory; call ``quit``
    when done to shut it down.  Results accumulate both as ``Tournament``
    objects in ``self.tournaments`` and as rows of ``self.tournaments_df``.
    """

    # Département number -> value passed to ``select_by_value`` on the
    # "departement" dropdown of the search page.
    DEPARTEMENTS = {'75':'62','77':'63','78':'64','91':'65','92':'66','93':'67','94':'68','95':'69'}

    def __init__(self, url):
        """Start the browser and prepare empty result containers.

        Args:
            url: Address of the search page to scrape.
        """
        self.url=url
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--window-size=1920x6080')
        chromedriver = os.getcwd() + "/chromedriver"

        self.driver = webdriver.Chrome(options = chrome_options, service=Service(chromedriver))
        self.tournaments = []
        self.tournaments_df = pd.DataFrame({'id':pd.Series(dtype='str'), 
            'name': pd.Series(dtype='str'),
            'url':pd.Series(dtype='str'),
            'departement': pd.Series(dtype='str'), 
            'date': pd.Series(dtype='str'),
            'source': pd.Series(dtype='str'),
            'description': pd.Series(dtype='str'),
            "date_publication": pd.Series(dtype='datetime64[ns]')})

    def extract_tournaments(self, departements=None):
        """Walk every result page of each département and record its tournaments.

        Args:
            departements: Optional mapping of département number to dropdown
                option value.  Defaults to ``DEPARTEMENTS``.
        """
        if departements is None:
            departements = self.DEPARTEMENTS

        self.driver.get(self.url)
        time.sleep(10)

        for departement, departement_code in departements.items():
            print(departement)
            print("page d'acceuil chargée")

            departement_selector = Select(self.driver.find_element("id","departement"))
            departement_selector.select_by_value(departement_code)
            time.sleep(1)

            rows = self._result_rows()
            # A missing pager simply means there is a single page of results.
            try:
                pages = self.driver.find_element('class name', 'pager')
            except Exception:
                next_page = None
            else:
                next_page = self._next_page_link(pages)
            self._record_rows(rows, departement)

            while next_page is not None:
                next_page.click()
                time.sleep(3)
                rows = self._result_rows()
                pages = self.driver.find_element('class name', 'pager')
                next_page = self._next_page_link(pages)
                self._record_rows(rows, departement)
                print(pages.text, next_page is not None)

            time.sleep(1)

    def _result_rows(self):
        """Return the row elements of the first events list in the results."""
        results = self.driver.find_element('id', 'search_results')
        return results.find_elements('class name', 'events')[0].find_elements('class name', 'row')

    @staticmethod
    def _next_page_link(pages):
        """Return the "›" link when *pages* offers a next page, else None.

        Any lookup failure is treated as "no next page".  Only ``Exception``
        is caught so that Ctrl-C and interpreter exit still propagate.
        """
        try:
            # NOTE(review): "//a" is matched against the whole document, not
            # just *pages*; ".//a" would restrict it to the pager -- confirm
            # against the live page before changing.
            link = pages.find_element('xpath', '//a[text()="›"]')
            has_next = "›" in pages.text
        except Exception:
            return None
        return link if has_next else None

    @staticmethod
    def _tournament_row(url, name, departement, date):
        """Return the ``tournaments_df`` record for one listed tournament."""
        return {
            'id': url.split("=")[-1],
            'name': name,
            'url': url,
            'departement': departement,
            'date': date,
            'source': 'badnet',
            'description': '',
            'date_publication': datetime.now(),
        }

    def _record_rows(self, rows, departement):
        """Add every tournament in *rows* to ``tournaments`` and ``tournaments_df``."""
        for row in rows:
            url = row.get_attribute("href")
            name = html.unescape(row.find_element('class name', 'name').text)
            date = row.find_element('class name', 'date').text
            ville = row.find_element('class name', 'location').text
            self.tournaments.append(Tournament(name=name, url=url, departement = departement, date=date, ville = ville))
            # The newest row goes first, so the frame lists tournaments in
            # reverse order of discovery.
            new_row = pd.DataFrame(self._tournament_row(url, name, departement, date), index=[0])
            self.tournaments_df = pd.concat([new_row, self.tournaments_df]).reset_index(drop=True)

    def quit(self):
        """Shut down the browser session."""
        self.driver.quit()

: 

In [None]:
# Build the scraper for the BadNet home page; the constructor starts a
# headless Chrome session.
badnet = BadnetScraper(
    url="https://badnet.fr/accueil",
)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.select import Select

# Stand-alone headless Chrome session, separate from the one BadnetScraper owns.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--window-size=1920x6080')

# The chromedriver binary is expected in the current working directory.
chromedriver = f"{os.getcwd()}/chromedriver"

driver = webdriver.Chrome(
    service=Service(chromedriver),
    options=chrome_options,
)

: 

In [None]:
import pandas as pd

: 