In [9]:
from bs4 import BeautifulSoup
import requests
import time, os
import pickle
import re
import io
import pandas as pd
import numpy as np

## Scrape race URLs

In [5]:
# Open main results page, capture the html
URL = "https://www.crossresults.com/?n=results&map=0&region=all"
response = requests.get(URL)
print(response.status_code)
soup = BeautifulSoup(response.text,'lxml')

200


In [43]:
# race data get appeneded to each of 4 lists
race_path = []
race_name = []
race_date = []
race_region = []

# pull race path, name, date and region. 
# divs belong to 2 classes. Go through each for every month of races
for div in soup.find_all(class_="monthContent"):
    for tr in div.find_all(class_="resultsrow datarow1"):
        race_path.append(tr.find('a').get('href'))
        race_name.append(tr.find('a').text)
        race_date.append(tr.find('a').findNext().text)
        race_region.append(tr.find('a').findNext().findNext().text)
    for tr in div.find_all(class_="resultsrow datarow2"):
        race_path.append(tr.find('a').get('href'))
        race_name.append(tr.find('a').text)
        race_date.append(tr.find('a').findNext().text)
        race_region.append(tr.find('a').findNext().findNext().text)


In [51]:
# We only want races in the US
US_regions = ['California', 'Mid Atlantic', 'Mountain West', 
              'New England', 'New York/Ontario', 'North Central', 
              'Pacific Northwest', 'South Central', 'Southeast']

# These IDs are for US races
US_ids = []

# Find if region is in the US, save ID if so
for idx, region in enumerate(race_region):
    if region in US_regions:
        US_ids.append(idx)

# Update lists to only include US races
race_path = [race_path[i] for i in US_ids]
race_name = [race_name[i] for i in US_ids]
race_date = [race_date[i] for i in US_ids]
race_region = [race_region[i] for i in US_ids]

In [55]:
# Put the data into one object
races = [race_path, race_name, race_date, race_region]

In [56]:
# Pickle the race data
with open('races.pickle', 'wb') as to_write:
    pickle.dump(races, to_write)

## Scrape data from each race

In [None]:
# Read in the race data
with open('races.pickle','rb') as read_file:
    races = pickle.load(read_file)

In [2]:
# Open each race page, capture the html
path = '/race/10159'
URL = 'https://www.crossresults.com' + path
response = requests.get(URL)
soup = BeautifulSoup(response.text,'lxml')

In [3]:
# Race info
main = soup.find("div", {"id": "resultstitle"}).text.split(' • ')

# Race Name
name = main[0]

# Race date
date = ' '.join(main[1].split())

# Race location
location = main[2].split('\r')[0].strip()

# Beers
beers = soup.find("div", {"class": "beerrating rating"}).text.split()[0]

# Moisture
moisture = soup.find("div", {"class": "moisturerating rating"}).text.split()[0]

# Accel
accel = soup.find("div", {"class": "accelrating rating"}).text.split()[0]

# Tech
tech = soup.find("div", {"class": "techrating rating"}).text.split()[0]

# Elevation
elevation = soup.find("div", {"class": "elevationrating rating"}).text.split()[0]

# Conditions
conditions = soup.find("div", {"id": "resultstitle"}).text.strip().split('\n')[-1].strip()

# Weather
weather = conditions.split(',')[0]

# Temperature
temperature = conditions.split(',')[1].strip().split()[0]

# Wind
wind = conditions.split(',')[2].strip().split()[1]

# extract script tag to get lat and lon
script = soup.find('article', {'id': 'content'}).find('script', {'type': 'text/javascript'})

# pull out the lat and lon
pattern = re.compile('GetMap\(\"(.*?)"')
lat_lon = re.findall(pattern, script.string)[0]

In [4]:
# grab the result path
result_path = soup.find("span", {"class": "downloadoptions"}).find_all('a')[0]['href']

In [5]:
# capture results html
URL = 'https://www.crossresults.com/' + result_path
response = requests.get(URL)
result_soup = BeautifulSoup(response.text,'lxml')

In [64]:
# read the results into a pandas dataframe
race_df = pd.read_csv(io.StringIO(result_soup.text.strip()), index_col=False)

In [65]:
race_df

Unnamed: 0,Category Name,Place,RacerID,First Name,Last Name,Team Name,Time,License,Carried Points,Scored Points
0,Men Cat 1/2,1,74538,Ryan,Woodall,Team TGB,,153440.0,,
1,Men Cat 1/2,2,195292,Nick,Mackie,,,558841.0,,
2,Men Cat 1/2,3,136346,Matthew,Nalesnik,,,119092.0,,
3,Men Cat 3,1,195292,Nick,Mackie,,,558841.0,421.797987,366.539396
4,Men Cat 3,2,136346,Matthew,Nalesnik,,,119092.0,400.000000,377.692931
...,...,...,...,...,...,...,...,...,...,...
74,Women Junior 16-18,1,183163,Ashley,Davis,Velobrew Racing,,476708.0,,
75,Women Junior 9-12,1,197020,Isabella,Fountain,,,,,
76,Women Junior 9-12,2,197016,Eve,Munkittrick,,,590776.0,,
77,Women Junior 9-12,3,197021,Helen,Munkittrick,,,604405.0,,


In [66]:
# only keep the rows that have values in scored points
race_df = race_df.loc[~pd.isnull(race_df['Scored Points'])]

In [67]:
# calculate points delta
race_df['Points Delta'] = race_df['Scored Points'] - race_df['Carried Points']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_df['Points Delta'] = race_df['Scored Points'] - race_df['Carried Points']


In [68]:
# Add column with number of starters in each field

# get a list of the number of starters in each field
starter_counts = race_df['Category Name'].value_counts()

# Use a function to lookup value for each field name
def lookup_starter_count(row):
    """use string in column 'Category Name' to return value from starter_counts"""
    return starter_counts[row['Category Name']]

# run the function through the df
race_df['Starters'] = race_df.apply(lookup_starter_count, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_df['Starters'] = race_df.apply(lookup_starter_count, axis=1)


In [69]:
# Add column with number of finishers in each field

# First, find out how many finishers are in each field

# start with an empty dictionary
finisher_counts = {}
# iterate through the categories and placings
for cat, place in zip(race_df['Category Name'], race_df['Place']):
    try:
        place = int(place)
        if cat not in finisher_counts.keys():
            finisher_counts[cat] = place
        elif finisher_counts[cat] < place:
            finisher_counts[cat] = place
        else:
            pass    
    except ValueError:
        pass

# Use a function to lookup value for each field name
def lookup_finisher_counts(row):
    """use string in column 'Category Name' to return value from finisher_counts"""
    return finisher_counts[row['Category Name']]

# run the function through the df
race_df['Finishers'] = race_df.apply(lookup_finisher_counts, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_df['Finishers'] = race_df.apply(lookup_finisher_counts, axis=1)


In [70]:
race_df

Unnamed: 0,Category Name,Place,RacerID,First Name,Last Name,Team Name,Time,License,Carried Points,Scored Points,Points Delta,Starters,Finishers
3,Men Cat 3,1,195292,Nick,Mackie,,,558841.0,421.797987,366.539396,-55.258591,8,7
4,Men Cat 3,2,136346,Matthew,Nalesnik,,,119092.0,400.0,377.692931,-22.307069,8,7
5,Men Cat 3,3,12831,Paul,Carter,Mid South Racing,,133359.0,400.0,388.846465,-11.153535,8,7
6,Men Cat 3,4,195288,Christopher,Yake,,,563016.0,672.977264,400.0,-272.977264,8,7
7,Men Cat 3,5,176396,Tyler,Austhof,Team Florida,,556780.0,400.0,411.153535,11.153535,8,7
8,Men Cat 3,6,193199,Joshua,Benton,,,582091.0,469.407592,422.307069,-47.100523,8,7
9,Men Cat 3,7,143239,Eric,Kirby,Muddy Nuts,,214063.0,400.0,433.460604,33.460604,8,7
10,Men Cat 3,DNF,169763,Drew,Miller,Velobrew Racing,,152659.0,400.0,455.767673,55.767673,8,7
11,Men Cat 4/5,1,197028,Alonso,Montilla,Cycles and Coffee House,,535588.0,540.0,429.695751,-110.304249,12,12
12,Men Cat 4/5,2,183166,Randal,Thibodeaux,,,529122.0,436.573239,448.079793,11.506553,12,12


In [71]:
# add the race metadata as new columns to the race_df
race_df[['Race Name', 'Date', 'Location', 'Beers', 'Moisture', 'Accel', 
         'Tech', 'Elevation', 'Weather', 'Temperature', 'Wind', 'Coordinates']] = \
         name, date, location, beers, moisture, accel, \
         tech, elevation, weather, temperature, wind, lat_lon

race_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[k] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_df[['Race Name', 'Date', 'Location', 'Beers', 'Moisture', 'Accel',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)


Unnamed: 0,Category Name,Place,RacerID,First Name,Last Name,Team Name,Time,License,Carried Points,Scored Points,...,Location,Beers,Moisture,Accel,Tech,Elevation,Weather,Temperature,Wind,Coordinates
3,Men Cat 3,1,195292,Nick,Mackie,,,558841.0,421.797987,366.539396,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
4,Men Cat 3,2,136346,Matthew,Nalesnik,,,119092.0,400.0,377.692931,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
5,Men Cat 3,3,12831,Paul,Carter,Mid South Racing,,133359.0,400.0,388.846465,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
6,Men Cat 3,4,195288,Christopher,Yake,,,563016.0,672.977264,400.0,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
7,Men Cat 3,5,176396,Tyler,Austhof,Team Florida,,556780.0,400.0,411.153535,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
8,Men Cat 3,6,193199,Joshua,Benton,,,582091.0,469.407592,422.307069,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
9,Men Cat 3,7,143239,Eric,Kirby,Muddy Nuts,,214063.0,400.0,433.460604,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
10,Men Cat 3,DNF,169763,Drew,Miller,Velobrew Racing,,152659.0,400.0,455.767673,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
11,Men Cat 4/5,1,197028,Alonso,Montilla,Cycles and Coffee House,,535588.0,540.0,429.695751,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
12,Men Cat 4/5,2,183166,Randal,Thibodeaux,,,529122.0,436.573239,448.079793,...,"Green Cove Springs, FL",2.2,3.6,2.4,2.2,2.4,Partly Cloudy,49,4,29.9884166717529:-81.6820068359375
