In [2]:
import pandas as pd
import multiprocessing as mp

from bs4 import BeautifulSoup
import requests
import logging
import numpy as np
import gc

# mp.set_start_method('spawn', force=True)
# Module-level logger; scrap_for_points reports its scraping errors through it.
logger = logging.getLogger(__name__)

In [3]:
# Load the race results table; later cells read its `_url` and `cyclist` columns.
races_t = pd.read_csv('dataset/races.csv')

In [5]:
base_url = "https://www.procyclingstats.com/race/"


def _header_positions(header_cells, label):
    """Return the positions of the header cells whose text is exactly ``label``."""
    return [i for i, th in enumerate(header_cells) if th.text == label]


def scrap_for_points(sub_directory, timeout=30):
    """Scrape rank, rider, points and UCI points from one race result page.

    Parameters
    ----------
    sub_directory : str
        Path appended to ``base_url``, e.g. ``'paris-nice/1985/stage-7b'``.
    timeout : float, optional
        Seconds handed to ``requests.get`` so that a stalled connection cannot
        block a worker process indefinitely.

    Returns
    -------
    dict or None
        ``{sub_directory: {'uci': [...], 'pnt': [...], 'rnk': [...], 'riders': [...]}}``.
        ``pnt``, ``rnk`` and ``riders`` always have the same length and describe
        the same rows. ``uci`` is empty when the table has no ``UCI`` column,
        otherwise it has that same length; it is filled with NaN when every
        row carries the same UCI value.
        ``None`` is returned (after logging) when the expected table, its body
        or its ``Pnt`` / ``Rnk`` / ``Rider`` columns cannot be identified.

    Raises
    ------
    Exception
        Any other failure (network error, timeout, a points cell that is not
        an integer, ...) is logged and re-raised unchanged.
    """
    try:
        complete_url = base_url + sub_directory
        response = requests.get(complete_url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        tables_with_pnt = soup.select('div[class=result-cont] div.subTabs[data-subtab="1"] table.results.basic')
        len_tables = len(tables_with_pnt)
        if len_tables != 1:
            logger.error(f"Found none, or too many {sub_directory}. Found {len_tables}")
            return None

        table = tables_with_pnt[0]
        all_th = table.find_all('th')

        get_pnt_index = _header_positions(all_th, 'Pnt')
        get_rnk_index = _header_positions(all_th, 'Rnk')
        get_rider_index = _header_positions(all_th, 'Rider')
        get_ucipnt_index = _header_positions(all_th, 'UCI')

        # Pnt, Rnk and Rider must each appear exactly once.
        for label, positions in (('Pnt', get_pnt_index),
                                 ('Rnk', get_rnk_index),
                                 ('Rider', get_rider_index)):
            if len(positions) != 1:
                logger.error(f"Found none, or too many {label}. Found {len(positions)}. Subdirectory: {sub_directory}")
                return None

        # The UCI column is optional, but it must not be ambiguous.
        if len(get_ucipnt_index) > 1:
            logger.error(f"Found too many UCI. Found {len(get_ucipnt_index)}. Subdirectory: {sub_directory}")
            return None

        if table.tbody is None:
            logger.error(f"Found no table body. Subdirectory: {sub_directory}")
            return None

        pnt_col = get_pnt_index[0]
        rnk_col = get_rnk_index[0]
        rider_col = get_rider_index[0]
        uci_col = get_ucipnt_index[0] if get_ucipnt_index else None

        all_pnt = []
        all_rnk = []
        all_uci = []
        all_riders = []

        for tr in table.tbody.find_all('tr'):
            all_tds = tr.find_all('td')
            # Only rows whose first cell is a plain number are result rows.
            if not all_tds or not all_tds[0].text.isdigit():
                continue

            # Parse the whole row first and append afterwards: a row that fails
            # half-way must not leave the four lists with different lengths,
            # otherwise every later rider is paired with the wrong rank/points.
            try:
                rank = int(all_tds[rnk_col].text)
            except Exception as e:
                logger.error(f"Error in getting a specific rank number in {sub_directory}. All tds: {all_tds}. Error {e}.")
                continue
            points = int(all_tds[pnt_col].text or "0")
            try:
                rider = all_tds[rider_col].find('a')['href'].split('/')[-1]
            except Exception as e:
                logger.error(f"Error in getting a specific rider in {sub_directory}. All tds: {all_tds}. Error {e}.")
                continue
            if uci_col is not None:
                uci_points = int(all_tds[uci_col].text or "0")

            all_rnk.append(rank)
            all_pnt.append(points)
            all_riders.append(rider)
            if uci_col is not None:
                all_uci.append(uci_points)

        # A UCI column holding one single repeated value is treated as
        # carrying no information and is replaced by NaN.
        if len(np.unique(all_uci)) == 1:
            all_uci = [np.nan] * len(all_uci)

        return {sub_directory: {'uci': all_uci, 'pnt': all_pnt, 'rnk': all_rnk, 'riders': all_riders}}
    except Exception as e:
        logger.error(f"Error in {sub_directory}. {e}")
        raise



In [6]:
# Run the scraper on a single stage page to inspect the structure it returns.
scrap_for_points('paris-nice/1985/stage-7b')

{'paris-nice/1985/stage-7b': {'uci': [nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan],
  'pnt': [50, 30, 18, 13, 10, 7, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'rnk': [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20],
  'riders': ['stephen-roche',
   'sean-kelly',
   'jean-francois-bernard',
   'robert-millar',
   'pedro-munoz-machin',
   'phil-anderson',
   'pascal-simon',
   'charly-berard',
   'eric-caritoux',
   'frederic-vichot',
   'jerome-simon',
   'jean-marie-grezet',
   'alain-vigneron',
   'martin-earley',
   'jokin-mujika',
   'jean-claude-bagot',
   'jorg-muller',
   'dominique-arnaud',
   'charly-mottet',
   'kim-andersen']}}

In [7]:
# Scrape every distinct race URL of the dataset with a pool of worker processes.
all_urls = races_t['_url'].unique().tolist()
with mp.Pool() as pool:
    results = pool.map(scrap_for_points, all_urls, chunksize=200)

    # Each usable result is a dict keyed by race URL; merge them into a single
    # mapping and leave out the None results.
    all_results = {
        race_url: fields
        for scraped in results
        if scraped is not None
        for race_url, fields in scraped.items()
    }
    # Transpose so that each race becomes a row and each scraped field a column.
    wb_races = pd.DataFrame(all_results).T

wb_races

Unnamed: 0,uci,pnt,rnk,riders
tour-de-france/1978/stage-6,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[100, 70, 50, 40, 32, 26, 22, 18, 14, 10, 8, 6...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[sean-kelly, gerrie-knetemann, rene-bittinger,..."
vuelta-a-espana/2016/stage-14,"[100, 40, 20, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[80, 50, 35, 25, 18, 15, 12, 10, 8, 6, 5, 4, 3...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[robert-gesink, kenny-elissonde, egor-silin, g..."
tour-de-france/2019/stage-21,"[120, 50, 25, 15, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[100, 70, 50, 40, 32, 26, 22, 18, 14, 10, 8, 6...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[caleb-ewan, dylan-groenewegen, niccolo-bonifa..."
volta-a-catalunya/1999/prologue,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[50, 30, 18, 13, 10, 7, 4, 3, 2, 1, 0, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[angel-luis-casero, andrea-peron-1, abraham-ol..."
tour-de-france/2022/stage-9,"[120, 50, 25, 15, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[100, 70, 50, 40, 32, 26, 22, 18, 14, 10, 8, 6...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[bob-jungels, jonathan-castroviejo, carlos-ver..."
...,...,...,...,...
giro-d-italia/2017/stage-3,"[100, 40, 20, 12, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[80, 50, 35, 25, 18, 15, 12, 10, 8, 6, 5, 4, 3...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[fernando-gaviria, rudiger-selig, giacomo-nizz..."
paris-roubaix/2000/result,[],"[275, 200, 150, 120, 100, 90, 80, 70, 60, 50, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[johan-museeuw, peter-van-petegem, erik-zabel,..."
paris-nice/1976/stage-2,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[50, 30, 13, 10, 7, 4, 3, 2, 1, 0, 0, 0, 0, 0,...","[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...","[freddy-maertens, jan-raas, jean-jacques-fussi..."
volta-a-catalunya/2016/stage-7,"[50, 20, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[50, 30, 18, 13, 10, 7, 4, 3, 2, 1, 0, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[alexey-tsatevich, primoz-roglic, jarlinson-pa..."


In [9]:
# Trigger a garbage-collection pass before continuing.
gc.collect()
# Look up the scraped entry of a single race by its URL path.
wb_races.loc['paris-nice/1985/stage-7b']

uci       [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
pnt       [50, 30, 18, 13, 10, 7, 4, 3, 2, 1, 0, 0, 0, 0...
rnk       [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
riders    [stephen-roche, sean-kelly, jean-francois-bern...
Name: paris-nice/1985/stage-7b, dtype: object

In [10]:
# Copy of the loaded table; parallel_process reads this module-level name.
races_ = races_t.copy()

def parallel_process(indx):
    """Update the rows of one race with the values scraped for that race.

    Reads two module-level names: ``wb_races`` (indexed by race URL, holding
    the scraped ``riders``, ``pnt``, ``rnk`` and ``uci`` lists) and ``races_``
    (the results table, with at least ``_url`` and ``cyclist`` columns).

    Parameters
    ----------
    indx : str
        Race URL path: a label of ``wb_races.index`` and a value found in
        ``races_['_url']``.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``races_`` belonging to this race. Rows whose
        cyclist is not among the scraped riders are removed. For the remaining
        riders ``points``, ``position`` and ``uci_points`` are overwritten with
        the scraped values (``uci_points`` becomes NaN when no UCI values were
        scraped). ``races_`` itself is not modified.
    """
    scraped = wb_races.loc[indx]
    riders = scraped['riders']
    pnt = scraped['pnt']
    rnk = scraped['rnk']
    uci = scraped['uci']

    # No UCI values were scraped for this race: use NaN for every rider.
    if len(uci) == 0:
        uci = [np.nan] * len(riders)

    riders_dict = {
        rider: {'points': pnt[i], 'position': rnk[i], 'uci_points': uci[i]}
        for i, rider in enumerate(riders)
    }

    race_rows = races_.loc[races_['_url'] == indx]
    cyclists = race_rows['cyclist']
    # Keep the rows whose cyclist was scraped. Rows with a missing cyclist
    # cannot be compared with a rider name; they are kept and left untouched.
    keep = cyclists.isin(list(riders_dict)) | cyclists.isna()
    df_race = race_rows.loc[keep].copy()

    for rider in set(df_race['cyclist']) & riders_dict.keys():
        rider_rows = df_race['cyclist'] == rider
        scraped_values = riders_dict[rider]
        df_race.loc[rider_rows, 'points'] = scraped_values['points']
        df_race.loc[rider_rows, 'position'] = scraped_values['position']
        df_race.loc[rider_rows, 'uci_points'] = scraped_values['uci_points']

    return df_race


# Only races present in wb_races are processed. Note that this rebinds
# `all_urls`, which an earlier cell filled from races_t['_url'].
all_urls = [indx for indx in wb_races.index]

with mp.Pool() as pool:
    # The parent's `races_` is dropped only after the pool has been created and
    # the name is rebound to the combined result on the next line.
    # NOTE(review): this relies on the worker processes having inherited
    # `races_` when the pool was created (fork start method) - confirm on
    # platforms that spawn workers instead.
    del races_
    # Each call returns the updated rows of one race; concatenate them all.
    races_ =  pd.concat(pool.map(parallel_process, all_urls, chunksize=200))

# Last expression of the cell, so the value returned by gc.collect() is displayed.
gc.collect()

0

In [11]:
# Compare row counts: races_ holds only the races present in wb_races and, within
# each race, the riders found in the scraped list, so it can be shorter than races_t.
print(len(races_))
print(len(races_t))

# Write the updated table to disk without the DataFrame index.
races_.to_csv('races_updated.csv', index=False)


586982
589865


In [12]:
# Report how many values are missing in each column of interest.
# A single double-quoted f-string is used for every line: reusing the enclosing
# quote character inside an f-string is only valid from Python 3.12 onwards.
for column in ('points', 'uci_points', 'position', 'climb_total'):
    print(f"Total nan {column}: {races_[column].isna().sum()}")

Total nan points: 0
Total nan uci_points: 336927
Total nan position: 0
Total nan climb_total: 146447
