# Library imports

In [1]:
from urllib.request import urlopen

# import library to look for height and weight text on web pages
import re
from bs4 import BeautifulSoup
import requests
import time, os
import lxml.html as lh

# import library to push height weight into spreadsheets & open file
import csv
output_file = open('height_weight.csv', 'w', newline = '')
output_writer = csv.writer(output_file)

# import numpy, pandas, seaborn, matplotlib just because
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
from matplotlib import pyplot as plt

# scikitlearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


---
## Links for end-of-2019 UCI point rankings(top 200 cyclists per category)
### Time-trialist:
https://www.procyclingstats.com/rankings.php?id=50939&nation=&team=&page=0&prev_id=50939&younger=&older=&limit=200&filter=Filter&morefilters=

(to get ranks 201-400, 401-600, etc., just change 'page=0' to 'page=200', 400, etc.)

### Sprinter:
https://www.procyclingstats.com/rankings.php?id=50938&nation=&team=&page=0&prev_id=50938&younger=&older=&limit=200&filter=Filter&morefilters=

### Climber:
https://www.procyclingstats.com/rankings.php?id=49059&nation=&team=&page=0&prev_id=49058&younger=&older=&limit=200&filter=Filter&morefilters=

### Overall men:
https://www.procyclingstats.com/rankings.php?id=49002&nation=&team=&page=0&prev_id=prev&younger=&older=&limit=200&filter=Filter&morefilters=

### Overall women:
https://www.procyclingstats.com/rankings.php?id=49034&nation=&team=&page=0&prev_id=prev&younger=&older=&limit=200&filter=Filter&morefilters=


---

# Initial scrape of rider rankings

Setting up URLs to create soups

In [2]:
# Creating soups (This takes about 20 seconds for 1000 cyclists x 3 categories)

# time trialists
# 200
url_200_tt = 'https://www.procyclingstats.com/rankings.php?id=50939&nation=&team=&page=0&prev_id=50939&younger=&older=&limit=200&filter=Filter&morefilters='
response_200_tt = requests.get(url_200_tt)
page_200_tt = response_200_tt.text
soup_200_tt = BeautifulSoup(page_200_tt,"lxml")
# 400
url_400_tt = 'https://www.procyclingstats.com/rankings.php?id=50939&nation=&team=&page=200&prev_id=50939&younger=&older=&limit=200&filter=Filter&morefilters='
response_400_tt = requests.get(url_400_tt)
page_400_tt = response_400_tt.text
soup_400_tt = BeautifulSoup(page_400_tt,"lxml")
# 600
url_600_tt = 'https://www.procyclingstats.com/rankings.php?id=50939&nation=&team=&page=400&prev_id=50939&younger=&older=&limit=200&filter=Filter&morefilters='
response_600_tt = requests.get(url_600_tt)
page_600_tt = response_600_tt.text
soup_600_tt = BeautifulSoup(page_600_tt,"lxml")
# 800
url_800_tt = 'https://www.procyclingstats.com/rankings.php?id=50939&nation=&team=&page=600&prev_id=50939&younger=&older=&limit=200&filter=Filter&morefilters='
response_800_tt = requests.get(url_800_tt)
page_800_tt = response_800_tt.text
soup_800_tt = BeautifulSoup(page_800_tt,"lxml")
# 1000
url_1000_tt = 'https://www.procyclingstats.com/rankings.php?id=50939&nation=&team=&page=800&prev_id=50939&younger=&older=&limit=200&filter=Filter&morefilters='
response_1000_tt = requests.get(url_1000_tt)
page_1000_tt = response_1000_tt.text
soup_1000_tt = BeautifulSoup(page_1000_tt,"lxml")

# sprinters
# 200
url_200_sp = 'https://www.procyclingstats.com/rankings.php?id=50938&nation=&team=&page=0&prev_id=50938&younger=&older=&limit=200&filter=Filter&morefilters='
response_200_sp = requests.get(url_200_sp)
page_200_sp = response_200_sp.text
soup_200_sp = BeautifulSoup(page_200_sp,"lxml")
# 400
url_400_sp = 'https://www.procyclingstats.com/rankings.php?id=50938&nation=&team=&page=200&prev_id=50938&younger=&older=&limit=200&filter=Filter&morefilters='
response_400_sp = requests.get(url_400_sp)
page_400_sp = response_400_sp.text
soup_400_sp = BeautifulSoup(page_400_sp,"lxml")
# 600
url_600_sp = 'https://www.procyclingstats.com/rankings.php?id=50938&nation=&team=&page=400&prev_id=50938&younger=&older=&limit=200&filter=Filter&morefilters='
response_600_sp = requests.get(url_600_sp)
page_600_sp = response_600_sp.text
soup_600_sp = BeautifulSoup(page_600_sp,"lxml")
# 800
url_800_sp = 'https://www.procyclingstats.com/rankings.php?id=50938&nation=&team=&page=600&prev_id=50938&younger=&older=&limit=200&filter=Filter&morefilters='
response_800_sp = requests.get(url_800_sp)
page_800_sp = response_800_sp.text
soup_800_sp = BeautifulSoup(page_800_sp,"lxml")
# 1000
url_1000_sp = 'https://www.procyclingstats.com/rankings.php?id=50938&nation=&team=&page=800&prev_id=50938&younger=&older=&limit=200&filter=Filter&morefilters='
response_1000_sp = requests.get(url_1000_sp)
page_1000_sp = response_1000_sp.text
soup_1000_sp = BeautifulSoup(page_1000_sp,"lxml")

# climbers
# 200
url_200_cl = 'https://www.procyclingstats.com/rankings.php?id=49059&nation=&team=&page=0&prev_id=49058&younger=&older=&limit=200&filter=Filter&morefilters='
response_200_cl = requests.get(url_200_cl)
page_200_cl = response_200_cl.text
soup_200_cl = BeautifulSoup(page_200_cl,"lxml")
# 400
url_400_cl = 'https://www.procyclingstats.com/rankings.php?id=49059&nation=&team=&page=200&prev_id=49058&younger=&older=&limit=200&filter=Filter&morefilters='
response_400_cl = requests.get(url_400_cl)
page_400_cl = response_400_cl.text
soup_400_cl = BeautifulSoup(page_400_cl,"lxml")
# 600
url_600_cl = 'https://www.procyclingstats.com/rankings.php?id=49059&nation=&team=&page=400&prev_id=49058&younger=&older=&limit=200&filter=Filter&morefilters='
response_600_cl = requests.get(url_600_cl)
page_600_cl = response_600_cl.text
soup_600_cl = BeautifulSoup(page_600_cl,"lxml")
# 800
url_800_cl = 'https://www.procyclingstats.com/rankings.php?id=49059&nation=&team=&page=600&prev_id=49058&younger=&older=&limit=200&filter=Filter&morefilters='
response_800_cl = requests.get(url_800_cl)
page_800_cl = response_800_cl.text
soup_800_cl = BeautifulSoup(page_800_cl,"lxml")
# 1000
url_1000_cl = 'https://www.procyclingstats.com/rankings.php?id=49059&nation=&team=&page=800&prev_id=49058&younger=&older=&limit=200&filter=Filter&morefilters='
response_1000_cl = requests.get(url_1000_cl)
page_1000_cl = response_1000_cl.text
soup_1000_cl = BeautifulSoup(page_1000_cl,"lxml")

In [3]:
# time trialists
table_200_tt = soup_200_tt.find('table') # 200
table_400_tt = soup_400_tt.find('table') # 400
table_600_tt = soup_600_tt.find('table') # 600
table_800_tt = soup_800_tt.find('table') # 800
table_1000_tt = soup_1000_tt.find('table') # 1000

# sprinters
table_200_sp = soup_200_sp.find('table') # 200
table_400_sp = soup_400_sp.find('table') # 400
table_600_sp = soup_600_sp.find('table') # 600
table_800_sp = soup_800_sp.find('table') # 800
table_1000_sp = soup_1000_sp.find('table') # 1000

# climbers
table_200_cl = soup_200_cl.find('table') # 200
table_400_cl = soup_400_cl.find('table') # 400
table_600_cl = soup_600_cl.find('table') # 600
table_800_cl = soup_800_cl.find('table') # 800
table_1000_cl = soup_1000_cl.find('table') # 1000

In [4]:
# time trialists
rows_200_tt = [row for row in table_200_tt.find_all('tr')] # 200 , tr tag is for rows
rows_400_tt = [row for row in table_400_tt.find_all('tr')] # 400
rows_600_tt = [row for row in table_600_tt.find_all('tr')] # 600
rows_800_tt = [row for row in table_800_tt.find_all('tr')] # 800
rows_1000_tt = [row for row in table_1000_tt.find_all('tr')] # 1000

# sprinters
rows_200_sp = [row for row in table_200_sp.find_all('tr')] # 200 , tr tag is for rows
rows_400_sp = [row for row in table_400_sp.find_all('tr')] # 400
rows_600_sp = [row for row in table_600_sp.find_all('tr')] # 600
rows_800_sp = [row for row in table_800_sp.find_all('tr')] # 800
rows_1000_sp = [row for row in table_1000_sp.find_all('tr')] # 1000

# climbers
rows_200_cl = [row for row in table_200_cl.find_all('tr')] # 200 , tr tag is for rows
rows_400_cl = [row for row in table_400_cl.find_all('tr')] # 400
rows_600_cl = [row for row in table_600_cl.find_all('tr')] # 600
rows_800_cl = [row for row in table_800_cl.find_all('tr')] # 800
rows_1000_cl = [row for row in table_1000_cl.find_all('tr')] # 1000

Filling out dictionaries for each set of 200 per category

In [5]:
# time trialists
cyclists_dict_tt = {}
cyclists_200_tt = {} # 200
cyclists_400_tt = {} # 400
cyclists_600_tt = {} # 600
cyclists_800_tt = {} # 800
cyclists_1000_tt = {} # 1000

for row in rows_200_tt[1:200]:
    #the 4 lines below are saying the above, that url = rows[1].find_all('td')[3].find('a')['href']
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_200_tt[title] = [url] + [i.text for i in items]
    
for row in rows_400_tt[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_400_tt[title] = [url] + [i.text for i in items]
    
for row in rows_600_tt[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_600_tt[title] = [url] + [i.text for i in items]
    
for row in rows_800_tt[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_800_tt[title] = [url] + [i.text for i in items]
    
for row in rows_1000_tt[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_1000_tt[title] = [url] + [i.text for i in items]
    
# sprinters
cyclists_dict_sp = {}
cyclists_200_sp = {} # 200
cyclists_400_sp = {} # 400
cyclists_600_sp = {} # 600
cyclists_800_sp = {} # 800
cyclists_1000_sp = {} # 1000

for row in rows_200_sp[1:200]:
    #the 4 lines below are saying the above, that url = rows[1].find_all('td')[3].find('a')['href']
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_200_sp[title] = [url] + [i.text for i in items]
    
for row in rows_400_sp[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_400_sp[title] = [url] + [i.text for i in items]
    
for row in rows_600_sp[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_600_sp[title] = [url] + [i.text for i in items]
    
for row in rows_800_sp[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_800_sp[title] = [url] + [i.text for i in items]
    
for row in rows_1000_sp[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_1000_sp[title] = [url] + [i.text for i in items]
    
# climbers
cyclists_dict_cl = {}
cyclists_200_cl = {} # 200
cyclists_400_cl = {} # 400
cyclists_600_cl = {} # 600
cyclists_800_cl = {} # 800
cyclists_1000_cl = {} # 1000

for row in rows_200_cl[1:200]:
    #the 4 lines below are saying the above, that url = rows[1].find_all('td')[3].find('a')['href']
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_200_cl[title] = [url] + [i.text for i in items]
    
for row in rows_400_cl[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_400_cl[title] = [url] + [i.text for i in items]
    
for row in rows_600_cl[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_600_cl[title] = [url] + [i.text for i in items]
    
for row in rows_800_cl[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_800_cl[title] = [url] + [i.text for i in items]
    
for row in rows_1000_cl[1:200]:
    items = row.find_all('td')
    link = items[3].find('a')
    title = link.text
    url = link['href']
    cyclists_1000_cl[title] = [url] + [i.text for i in items]

Combining each dictionary set of 200 into single dictionaries per cyclist category

In [6]:
# time trialists
cyclists_dict_tt.update(cyclists_200_tt)
cyclists_dict_tt.update(cyclists_400_tt)
cyclists_dict_tt.update(cyclists_600_tt)
cyclists_dict_tt.update(cyclists_800_tt)
cyclists_dict_tt.update(cyclists_1000_tt)

# sprinters
cyclists_dict_sp.update(cyclists_200_sp)
cyclists_dict_sp.update(cyclists_400_sp)
cyclists_dict_sp.update(cyclists_600_sp)
cyclists_dict_sp.update(cyclists_800_sp)
cyclists_dict_sp.update(cyclists_1000_sp)

# climbers
cyclists_dict_cl.update(cyclists_200_cl)
cyclists_dict_cl.update(cyclists_400_cl)
cyclists_dict_cl.update(cyclists_600_cl)
cyclists_dict_cl.update(cyclists_800_cl)
cyclists_dict_cl.update(cyclists_1000_cl)

In [3]:
#Testing the dictionary
cyclists_dict_tt

NameError: name 'cyclists_dict_tt' is not defined

Turning dictionaries into pandas dataframes

In [8]:
#Putting the scrape into a pandas dataframe
# time trialists
df_cyclists_tt = pd.DataFrame(cyclists_dict_tt).T  #transpose
df_cyclists_tt.columns = ['link_stub', 'rank','previous_rank','diff','name','team','points','...',]
del df_cyclists_tt['previous_rank'], df_cyclists_tt['diff'], df_cyclists_tt['...']

# sprinters
df_cyclists_sp = pd.DataFrame(cyclists_dict_sp).T  #transpose
df_cyclists_sp.columns = ['link_stub', 'rank','previous_rank','diff','name','team','points','...',]
del df_cyclists_sp['previous_rank'], df_cyclists_sp['diff'], df_cyclists_sp['...']

# climbers
df_cyclists_cl = pd.DataFrame(cyclists_dict_cl).T  #transpose
df_cyclists_cl.columns = ['link_stub', 'rank','previous_rank','diff','name','team','points','...',]
del df_cyclists_cl['previous_rank'], df_cyclists_cl['diff'], df_cyclists_cl['...']

In [9]:
df_cyclists_cl.head() #preview

Unnamed: 0,link_stub,rank,name,team,points
Roglič Primož,rider/primoz-roglic,1,Roglič Primož,Team Jumbo-Visma,1386
Valverde Alejandro,rider/alejandro-valverde,2,Valverde Alejandro,Movistar Team,1174
Fuglsang Jakob,rider/jakob-fuglsang,3,Fuglsang Jakob,Astana Pro Team,1138
Quintana Nairo,rider/nairo-quintana,4,Quintana Nairo,Movistar Team,958
Pogačar Tadej,rider/tadej-pogacar,5,Pogačar Tadej,UAE-Team Emirates,942


# Cyclist profile scraping functions
Using dataframe created by initial rankings table scrape

### Small function

In [10]:
#Function to seek values in html
#This is nested in the profile webscraper

def get_cyclist_value(soup, field_name):
    
    '''Grab a value from Pro Cycling Stats HTML
    
    Takes a string attribute of a cuclist on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

### Main function

In [11]:
def get_cyclist_dict(link):
    '''
    From procyclingstats link stub, request cyclist html, parse with BeautifulSoup, and
    collect 
        - name
        - link
        - date of birth
        - age
        - nationality
        - weight
        - height
        - place of birth
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.procyclingstats.com/'
    
    url = base_url + link #Create full url to scrape
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['name', 'link_stub', 'date_of_birth', 'age', 'nationality', 'place_of_birth', 'weight', 'height']
    
    #Get name
    name = soup.find('title').text
    #Get link
    link_stub = link  
    #Get date of birth
    info_cont = (soup.find(class_='rdr-info-cont').text)
    dob_as_list = info_cont.split(' ')[3:6]
    date_of_birth = str(dob_as_list[2][:4] + " " + str(dob_as_list[1]) + " " + str(dob_as_list[0][:-2]))
    #Get age
    age = info_cont.split(' ')[6][:4].strip('()')
    try:
        if float(age) > 10:
            pass
        else:
            age = None
    except:
        age = None
    #Get place of birth
    place_of_birth = get_cyclist_value(soup, 'Place of birth')
    #Get nationality
    nationality = (soup.find(class_='rdr-info-cont')
                    .find_all('a', class_='black')[0]
                    .text)
    #Get weight
    try:
        weight = (soup.find(class_='rdr-info-cont')
                 .find_all('span')[1]
                 .text
                ).split()[1]
        if float(weight) < 2.1:
            height = weight
            weight = None
    except:
        weight = None
    #Get height
    try:
        if weight == None:
            height = (soup.find(class_='rdr-info-cont')
                 .find_all('span')[1]
                 .text
                ).split()[1]
        else:
            height = (soup.find(class_='rdr-info-cont')
                .find_all('span')[1]
                .text
                ).split()[4]
            try:
                if float(height) > 1:
                    pass
                else:
                    height = None
            except:
                height = None
    except:
        height = None
    
    
    #Create cyclist dictionary and return
    cyclist_dict = dict(zip(headers, [name,
                                link_stub,
                                date_of_birth,
                                age,
                                nationality,
                                place_of_birth,
                                weight, 
                                height]))

    return cyclist_dict

## Individual rider tests for above function
Uncomment lines to test

In [12]:
# get_cyclist_dict('rider/primoz-roglic')

# get_cyclist_dict('rider/patrick-bevin')

# # No weight, height
# get_cyclist_dict('rider/martin-toft-madsen')

# # No place of birth, weight, height
# get_cyclist_dict('rider/moise-mugisha')

# # Height but no weight
# get_cyclist_dict('rider/brent-van-moer')

# # Weight but no height
# get_cyclist_dict('rider/nicolas-dalla-valle')

# Scraping cyclist profiles and making into dataframe

In [14]:
#this block takes 5min to run per 200(25min per category, 75min for all 3), do something while you wait!

# time trialists
df_cyclists_profiles_list_tt = [] # making blank list to insert scraped value dictionaries into
for link in df_cyclists_tt.link_stub: # iterates through link_stubs from first dataframe
    df_cyclists_profiles_list_tt.append(get_cyclist_dict(link)) # runs function, then appends data to above list

In [15]:
# sprinters
df_cyclists_profiles_list_sp = [] 
for link in df_cyclists_sp.link_stub:
    df_cyclists_profiles_list_sp.append(get_cyclist_dict(link))

In [16]:
# climbers
df_cyclists_profiles_list_cl = []
for link in df_cyclists_cl.link_stub:
    df_cyclists_profiles_list_cl.append(get_cyclist_dict(link))

In [17]:
#convert list of dict to df

# time trialists
df_cyclists_profiles_tt = pd.DataFrame(df_cyclists_profiles_list_tt) 
df_cyclists_profiles_tt.set_index('link_stub', inplace=True)

# sprinters
df_cyclists_profiles_sp = pd.DataFrame(df_cyclists_profiles_list_sp) 
df_cyclists_profiles_sp.set_index('link_stub', inplace=True)

# climbers
df_cyclists_profiles_cl = pd.DataFrame(df_cyclists_profiles_list_cl) 
df_cyclists_profiles_cl.set_index('link_stub', inplace=True)

In [18]:
#preview new dataframe
df_cyclists_profiles_cl.head()

Unnamed: 0_level_0,name,date_of_birth,age,nationality,place_of_birth,weight,height
link_stub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rider/primoz-roglic,Primož Roglič,1989 October 29,30,Slovenia,Trbovlje,65,1.77
rider/alejandro-valverde,Alejandro Valverde,1980 April 25,40,Spain,Murcia,61,1.77
rider/jakob-fuglsang,Jakob Fuglsang,1985 March 22,35,Denmark,Geneva,65,1.81
rider/nairo-quintana,Nairo Quintana,1990 February 4,30,Colombia,Cómbita,59,1.67
rider/tadej-pogacar,Tadej Pogačar,1998 September 21,21,Slovenia,Komenda,66,1.76


In [19]:
#preview original dataframe for reference
df_cyclists_cl.head()

Unnamed: 0,link_stub,rank,name,team,points
Roglič Primož,rider/primoz-roglic,1,Roglič Primož,Team Jumbo-Visma,1386
Valverde Alejandro,rider/alejandro-valverde,2,Valverde Alejandro,Movistar Team,1174
Fuglsang Jakob,rider/jakob-fuglsang,3,Fuglsang Jakob,Astana Pro Team,1138
Quintana Nairo,rider/nairo-quintana,4,Quintana Nairo,Movistar Team,958
Pogačar Tadej,rider/tadej-pogacar,5,Pogačar Tadej,UAE-Team Emirates,942


# Merging main ranking dataframe with cyclist detail dataframe

In [20]:
# df_cyclists.merge(df_cyclists_page_info, left_index=True, right_index=True)

# time trialists
df_cyclists_master_tt = pd.merge(df_cyclists_tt, df_cyclists_profiles_tt, on='link_stub')
df_cyclists_master_tt['name'] = df_cyclists_master_tt['name_y']
df_cyclists_master_tt.set_index('name', inplace=True) #Using into index instead of numbers
del df_cyclists_master_tt['name_x'], df_cyclists_master_tt['name_y']  #Removing extra name columns
df_cyclists_master_tt = df_cyclists_master_tt.dropna(subset=['age']) #Remove rows missing age, if cyclist passed away
df_cyclists_master_tt = df_cyclists_master_tt.dropna(subset=['height']) #Remove rows missing height
df_cyclists_master_tt = df_cyclists_master_tt.dropna(subset=['weight']) #Remove rows missing weight

# sprinters
df_cyclists_master_sp = pd.merge(df_cyclists_sp, df_cyclists_profiles_sp, on='link_stub')
df_cyclists_master_sp['name'] = df_cyclists_master_sp['name_y']
df_cyclists_master_sp.set_index('name', inplace=True)
del df_cyclists_master_sp['name_x'], df_cyclists_master_sp['name_y']
df_cyclists_master_sp = df_cyclists_master_sp.dropna(subset=['age']) 
df_cyclists_master_sp = df_cyclists_master_sp.dropna(subset=['height'])
df_cyclists_master_sp = df_cyclists_master_sp.dropna(subset=['weight'])

# climbers
df_cyclists_master_cl = pd.merge(df_cyclists_cl, df_cyclists_profiles_cl, on='link_stub')
df_cyclists_master_cl['name'] = df_cyclists_master_cl['name_y']
df_cyclists_master_cl.set_index('name', inplace=True)
del df_cyclists_master_cl['name_x'], df_cyclists_master_cl['name_y']
df_cyclists_master_cl = df_cyclists_master_cl.dropna(subset=['age'])
df_cyclists_master_cl = df_cyclists_master_cl.dropna(subset=['height'])
df_cyclists_master_cl = df_cyclists_master_cl.dropna(subset=['weight'])

## Formatting & adding columns to cyclists

In [21]:
# converting appropriate columns to numbers 

# time trialists
df_cyclists_master_tt['rank'] = df_cyclists_master_tt['rank'].astype(int)
df_cyclists_master_tt['points'] = df_cyclists_master_tt['points'].astype(float)
df_cyclists_master_tt['age'] = df_cyclists_master_tt['age'].astype(int)
df_cyclists_master_tt['weight'] = df_cyclists_master_tt['weight'].astype(float)
df_cyclists_master_tt['height'] = df_cyclists_master_tt['height'].astype(float)
df_cyclists_master_tt['bmi'] = (df_cyclists_master_tt['weight'] / (df_cyclists_master_tt['height'] ** 2)).round(2) #add bmi

# sprinters
df_cyclists_master_sp['rank'] = df_cyclists_master_sp['rank'].astype(int)
df_cyclists_master_sp['points'] = df_cyclists_master_sp['points'].astype(float)
df_cyclists_master_sp['age'] = df_cyclists_master_sp['age'].astype(int)
df_cyclists_master_sp['weight'] = df_cyclists_master_sp['weight'].astype(float)
df_cyclists_master_sp['height'] = df_cyclists_master_sp['height'].astype(float)
df_cyclists_master_sp['bmi'] = (df_cyclists_master_sp['weight'] / (df_cyclists_master_sp['height'] ** 2)).round(2) #add bmi

# climbers
df_cyclists_master_cl['rank'] = df_cyclists_master_cl['rank'].astype(int)
df_cyclists_master_cl['points'] = df_cyclists_master_cl['points'].astype(float)
df_cyclists_master_cl['age'] = df_cyclists_master_cl['age'].astype(int)
df_cyclists_master_cl['weight'] = df_cyclists_master_cl['weight'].astype(float)
df_cyclists_master_cl['height'] = df_cyclists_master_cl['height'].astype(float)
df_cyclists_master_cl['bmi'] = (df_cyclists_master_cl['weight'] / (df_cyclists_master_cl['height'] ** 2)).round(2) #add bmi

In [22]:
# converting metric units to imperial
df_cyclists_master_tt['height'] = (df_cyclists_master_tt['height'] * 3.281).round(2)
df_cyclists_master_tt['weight'] = df_cyclists_master_tt['weight'] * 2.2

df_cyclists_master_sp['height'] = (df_cyclists_master_sp['height'] * 3.281).round(2)
df_cyclists_master_sp['weight'] = df_cyclists_master_sp['weight'] * 2.2

df_cyclists_master_cl['height'] = (df_cyclists_master_cl['height'] * 3.281).round(2)
df_cyclists_master_cl['weight'] = df_cyclists_master_cl['weight'] * 2.2

In [23]:
# # add specialty columns, as num values
# df_cyclists_200_full['time_trialist'] = 1
# df_cyclists_200_full['sprinter'] =  0

# # add specialty columns, as strings
df_cyclists_master_tt['specialty'] = 'Time trialist'
df_cyclists_master_sp['specialty'] = 'Sprinter'
df_cyclists_master_cl['specialty'] = 'Climber'

In [24]:
# assigning continents to riders based off of list of unique nationalities
europe = ['Slovenia', 'Belgium', 'Switzerland', 'Italy', 'Denmark', 'Netherlands', 'Spain',
          'Austria', 'France', 'Portugal', 'Great Britain', 'Germany', 'Luxembourg', 'Estonia',
          'Poland', 'Sweden', 'Romania', 'Czech Republic', 'Ireland', 'Norway', 'Serbia', 'Ukraine', 
          'Greece', 'Hungary', 'Slovakia', 'Latvia', 'Belarus', 'Russia', 'Lithuania', 'Europe',
         'Finland', 'Moldova', 'North Macedonia', 'Kosovo', 'Bosnia and Herzegovina', 'Iceland',
         'Croatia', 'Albania']
asia = ['Japan', 'Taiwan', 'South Korea', 'Malaysia', 'China', 'Kazakhstan', 'Iran', 'Azerbaijan', 'Turkey',
        'Thailand', 'Hongkong', 'Uzbekistan', 'Israel', 'Cyprus', 'Qatar', 'United Arab Emirates', 'Indonesia',
       'Singapore', 'Mongolia', 'India', 'Georgia', 'Syria', 'Philippines', 'Laos']
north_america = ['United States', 'Canada', 'Guatemala', 'Panama', 'Mexico', 'Dominican Republic', 'Cuba',
                 'Trinidad & Tobago', 'Bermuda', 'Honduras', 'Costa Rica', 'Belize', 'Bahamas', 'Curacao',
                'El Salvador', 'Puerto Rico', 'Virgin Islands', 'Antigua and Barbuda', 'Dominica', 'Anguilla']
south_america = ['Colombia', 'Venezuela', 'Chile', 'Ecuador', 'Argentina', 'Brazil', 'Uruguay', 'Peru']
africa = ['South Africa', 'Eritrea', 'Namibia', 'Rwanda', 'Ethiopia', 'Algeria', 'Mauritius',
         'Morocco', 'Angola', 'Tunisia', 'Burkina Faso', 'Kenya', 'Seychelles', 'Ivory Coast', 'Senegal']
oceania = ['Australia', 'New Zealand']

# function to assign continents to nationalities
def get_continent(nationality):
    if nationality in europe:
        return 'Europe'
    elif nationality in asia:
        return 'Asia'
    elif nationality in north_america:
        return 'North America'
    elif nationality in south_america:
        return 'South America'
    elif nationality in africa:
        return 'Africa'
    elif nationality in oceania:
        return 'Oceania'
    else:
        return None


# create continent column
df_cyclists_master_tt['continent'] = df_cyclists_master_tt['nationality'].apply(lambda x: get_continent(x))
df_cyclists_master_sp['continent'] = df_cyclists_master_sp['nationality'].apply(lambda x: get_continent(x))
df_cyclists_master_cl['continent'] = df_cyclists_master_cl['nationality'].apply(lambda x: get_continent(x))

In [25]:
df_cyclists_master_tt.head() #preview

Unnamed: 0_level_0,link_stub,rank,team,points,date_of_birth,age,nationality,place_of_birth,weight,height,bmi,specialty,continent
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Primož Roglič,rider/primoz-roglic,1,Team Jumbo-Visma,901.0,1989 October 29,30,Slovenia,Trbovlje,143.0,5.81,20.75,Time trialist,Europe
Victor Campenaerts,rider/victor-campenaerts,2,Lotto Soudal,702.0,1991 October 28,28,Belgium,Hoboken,158.4,5.68,24.06,Time trialist,Europe
Stefan Küng,rider/stefan-kung,3,Groupama - FDJ,682.0,1993 November 16,26,Switzerland,Wil,182.6,6.33,22.28,Time trialist,Europe
Filippo Ganna,rider/filippo-ganna,4,Team INEOS,670.0,1996 July 25,23,Italy,Verbania,167.2,6.33,20.4,Time trialist,Europe
Patrick Bevin,rider/patrick-bevin,5,CCC Team,578.0,1991 February 15,29,New Zealand,Taupo,165.0,5.91,23.15,Time trialist,Oceania


In [26]:
df_cyclists_master_tt.info() # see info

<class 'pandas.core.frame.DataFrame'>
Index: 539 entries, Primož Roglič to Christoph Sauser
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   link_stub       539 non-null    object 
 1   rank            539 non-null    int64  
 2   team            539 non-null    object 
 3   points          539 non-null    float64
 4   date_of_birth   539 non-null    object 
 5   age             539 non-null    int64  
 6   nationality     539 non-null    object 
 7   place_of_birth  455 non-null    object 
 8   weight          539 non-null    float64
 9   height          539 non-null    float64
 10  bmi             539 non-null    float64
 11  specialty       539 non-null    object 
 12  continent       538 non-null    object 
dtypes: float64(4), int64(2), object(7)
memory usage: 59.0+ KB


## Merging cycling specialty dataframes into a single master

In [27]:
# # create master dataframe including all categories
frames = [df_cyclists_master_tt, df_cyclists_master_sp , df_cyclists_master_cl]
cyclists_master_overall = pd.concat(frames)

In [28]:
cyclists_master_overall.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1907 entries, Primož Roglič to Filippo Ganna
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   link_stub       1907 non-null   object 
 1   rank            1907 non-null   int64  
 2   team            1907 non-null   object 
 3   points          1907 non-null   float64
 4   date_of_birth   1907 non-null   object 
 5   age             1907 non-null   int64  
 6   nationality     1907 non-null   object 
 7   place_of_birth  1648 non-null   object 
 8   weight          1907 non-null   float64
 9   height          1907 non-null   float64
 10  bmi             1907 non-null   float64
 11  specialty       1907 non-null   object 
 12  continent       1906 non-null   object 
dtypes: float64(4), int64(2), object(7)
memory usage: 208.6+ KB


In [1]:
# #export to csv in case
# cyclists_master.to_csv('cyclists_by_specialty_male.csv', encoding='utf-8-sig')

## Pickling the dataframes

In [320]:
import pickle

In [328]:
with open('cyclists_tt.pkl', 'wb') as pickle_file:
    pickle.dump(df_cyclists_master_tt, pickle_file)

In [329]:
with open('cyclists_sp.pkl', 'wb') as pickle_file:
    pickle.dump(df_cyclists_master_sp, pickle_file)

In [330]:
with open('cyclists_cl.pkl', 'wb') as pickle_file:
    pickle.dump(df_cyclists_master_cl, pickle_file)

In [331]:
with open('cyclists_master.pkl', 'wb') as pickle_file:
    pickle.dump(cyclists_master_overall, pickle_file)

# Appendix: Sections to test scraping functions on individual cyclist profiles

In [None]:
# # Preliminary url assigning and soup setup
# base_url = 'https://www.procyclingstats.com/'
# link = 'rider/primoz-roglic'
# url = base_url + link

# response = requests.get(url)
# page = response.text
# soup = BeautifulSoup(page, 'lxml')
# soup.find('a')

In [None]:
# # Info bit 1
# (soup.find(class_='rdr-info-cont')
#      .find_all('span')[1]
#      .text
#         )

In [None]:
# # Info bit 2
# (soup.find(class_='rdr-info-cont')
#          .text
#         )

In [None]:
# # Get name
# name = soup.find('title').text
# name

In [None]:
# # Get date of birth
# info_cont = (soup.find(class_='rdr-info-cont').text)
# dob_as_list = info_cont.split(' ')[3:6]
# date_of_birth = str(dob_as_list[2][:4] + " " + str(dob_as_list[1]) + " " + str(dob_as_list[0][:-2]))
# date_of_birth

In [None]:
# # Get age
# age = info_cont.split(' ')[6][:4].strip('()')
# age

In [None]:
# # Get nationality v2
# (soup.find(class_='rdr-info-cont')
#     .find_all('a', class_='black')[0]
#     .text)

In [None]:
# # Get place of birth
# place_of_birth = get_cyclist_value(soup, 'Place of birth')
# place_of_birth

In [None]:
# # Get weight. Identifying null values, and cyclists with height but no weight, and vice versa
# try:
#     weight = (soup.find(class_='rdr-info-cont')
#              .find_all('span')[1]
#              .text
#             ).split()[1]
#     if float(weight) < 2.1:
#         height = weight
#         weight = None
# except:
#     weight = None
# weight

In [None]:
# # Get height. Identifying null values, and cyclists with height but no weight, and vice versa
# try:
#     if weight == None:
#         height = (soup.find(class_='rdr-info-cont')
#              .find_all('span')[1]
#              .text
#             ).split()[1]
#     else:
#         height = (soup.find(class_='rdr-info-cont')
#             .find_all('span')[1]
#             .text
#             ).split()[4]
#         try:
#             if float(height) > 1:
#                 pass
#             else:
#                 height = None
#         except:
#             height = None
# except:
#     height = None
# height

In [None]:
# #both height and weight together
# try:
#     weight = (soup.find(class_='rdr-info-cont')
#              .find_all('span')[1]
#              .text
#             ).split()[1]
#     if float(weight) < 2.1:
#         height = weight
#         weight = None
# except:
#     weight = None

# try:
#     if weight == None:
#         height = (soup.find(class_='rdr-info-cont')
#              .find_all('span')[1]
#              .text
#             ).split()[1]
#     else:
#         height = (soup.find(class_='rdr-info-cont')
#             .find_all('span')[1]
#             .text
#             ).split()[4]
#         try:
#             if float(height) > 1:
#                 pass
#             else:
#                 height = None
#         except:
#             height = None
# except:
#     height = None
# print("weight:", weight, ", height:", height)