In [4]:
from bs4 import BeautifulSoup
import requests
import time, os
import pickle
import re
import io
import pandas as pd
import numpy as np

## Scrape race URLs

In [None]:
# Open the main results page and capture its HTML.
URL = "https://www.crossresults.com/?n=results&map=0&region=all"
# requests applies no timeout by default, so set one explicitly; otherwise a
# stalled connection would block this cell indefinitely.
response = requests.get(URL, timeout=30)
print(response.status_code)
# Stop here on a 4xx/5xx response instead of parsing an error page below.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

In [None]:
# Race data gets appended to each of 4 parallel lists
race_path = []
race_name = []
race_date = []
race_region = []

# Pull the race path, name, date and region.
# Result rows carry one of two class strings. For every month of races,
# collect all rows of the first class, then all rows of the second.
for month_div in soup.find_all(class_="monthContent"):
    for row_class in ("resultsrow datarow1", "resultsrow datarow2"):
        for row in month_div.find_all(class_=row_class):
            # The first link in the row holds the race path and name; the
            # next two elements after it hold the date and the region.
            link = row.find('a')
            date_tag = link.findNext()
            race_path.append(link.get('href'))
            race_name.append(link.text)
            race_date.append(date_tag.text)
            race_region.append(date_tag.findNext().text)


In [None]:
# Keep only the races held in US regions
US_regions = ['California', 'Mid Atlantic', 'Mountain West',
              'New England', 'New York/Ontario', 'North Central',
              'Pacific Northwest', 'South Central', 'Southeast']

# Positions (within the parallel lists) of the races whose region is in the US
US_ids = [idx for idx, region in enumerate(race_region) if region in US_regions]

# Rebuild all four lists from the same positions so they stay aligned
race_path, race_name, race_date, race_region = [
    [values[i] for i in US_ids]
    for values in (race_path, race_name, race_date, race_region)
]

In [None]:
# Put the data into one object: a list of the four parallel lists,
# in the order [paths, names, dates, regions]
races = [race_path, race_name, race_date, race_region]

In [None]:
# Pickle the race data to 'races.pickle' in the working directory;
# the per-race scraping section reads this file back in.
with open('races.pickle', 'wb') as to_write:
    pickle.dump(races, to_write)

## Scrape data from each race

In [5]:
# Functions to do the bulk of the work

 
def lookup_starter_count(row, race_df):
    """Return the number of rows in race_df that share this row's 'Category Name'.

    Intended for use with DataFrame.apply(..., axis=1): `row` is one row of
    the results and `race_df` is the full results frame.
    """
    category = row['Category Name']
    rows_per_category = race_df['Category Name'].value_counts()
    return rows_per_category[category]

def lookup_finisher_counts(row, race_df):
    """Return the number of finishers in this row's category.

    The finisher count of a category is taken to be the highest integer
    value found in 'Place' among the rows of that category. Placings that
    cannot be converted to an integer (for example text codes, NaN or None)
    are ignored.

    Parameters
    ----------
    row : mapping with a 'Category Name' entry (one row of race_df)
    race_df : DataFrame with 'Category Name' and 'Place' columns

    Returns
    -------
    int
        Highest integer placing in the row's category, or 0 when the
        category has no integer placing at all.
    """
    # category -> highest integer placing seen so far
    finisher_counts = {}
    for cat, place in zip(race_df['Category Name'], race_df['Place']):
        try:
            place = int(place)
        except (ValueError, TypeError):
            # Not a numeric placing (text, NaN, None): not a finisher
            continue
        if cat not in finisher_counts or finisher_counts[cat] < place:
            finisher_counts[cat] = place
    # A category in which nobody has a numeric placing never enters the dict;
    # report 0 finishers for it instead of raising KeyError.
    return finisher_counts.get(row['Category Name'], 0)


def capture_race_data(race_path, timeout=30):
    """Capture metadata and results from an individual race.

    Downloads the race page, reads the race metadata from it (name, date,
    location, the five ratings, weather, temperature, wind and the map
    coordinates), downloads the results file linked from the page, and
    returns the results with the derived columns and metadata attached.

    Parameters
    ----------
    race_path : str
        Path of the race page; appended to 'https://www.crossresults.com'.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The rows of the results file whose 'Scored Points' is not null, with
        the added columns 'Points Delta', 'Starters', 'Finishers' and one
        column per metadata item ('Race Name', 'Date', 'Location', 'Beers',
        'Moisture', 'Accel', 'Tech', 'Elevation', 'Weather', 'Temperature',
        'Wind', 'Coordinates'). 'Coordinates' is NaN when no coordinates
        are found on the page.

    Raises
    ------
    requests.HTTPError
        If the race page or the results download returns a 4xx/5xx status.
    """

    # Get html from individual race page

    URL = 'https://www.crossresults.com' + race_path
    # requests has no default timeout; without one a stalled server would
    # block the whole scrape.
    response = requests.get(URL, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    def first_token(css_class):
        """Return the first whitespace-separated token of the rating div's text."""
        return soup.find("div", {"class": css_class}).text.split()[0]

    # Race metadata

    title_text = soup.find("div", {"id": "resultstitle"}).text
    main = title_text.split(' • ')
    # Race Name
    name = main[0]
    # Race date (collapse runs of whitespace)
    date = ' '.join(main[1].split())
    # Race location
    location = main[2].split('\r')[0].strip()
    # Ratings
    beers = first_token("beerrating rating")
    moisture = first_token("moisturerating rating")
    accel = first_token("accelrating rating")
    tech = first_token("techrating rating")
    elevation = first_token("elevationrating rating")
    # Conditions: the last line of the title block, comma separated
    conditions = title_text.strip().split('\n')[-1].strip()
    condition_parts = conditions.split(',')
    # Weather
    weather = condition_parts[0]
    # Temperature (first token of the second item)
    temperature = condition_parts[1].strip().split()[0]
    # Wind (second token of the third item)
    wind = condition_parts[2].strip().split()[1]

    # Coordinates: first quoted argument of the GetMap(...) call in the
    # inline script. Any missing piece (no article, no script tag, an empty
    # script, no GetMap call) yields NaN instead of an exception.
    article = soup.find('article', {'id': 'content'})
    script = None
    if article is not None:
        script = article.find('script', {'type': 'text/javascript'})
    script_text = script.string if script is not None else None
    # Raw string so that \( reaches the regex engine as an escaped parenthesis
    # without relying on Python passing an invalid escape sequence through.
    coords_match = re.search(r'GetMap\("(.*?)"', script_text) if script_text else None
    lat_lon = coords_match.group(1) if coords_match else np.nan

    # Capture results

    result_path = soup.find("span", {"class": "downloadoptions"}).find_all('a')[0]['href']
    URL = 'https://www.crossresults.com/' + result_path
    response = requests.get(URL, timeout=timeout)
    response.raise_for_status()
    result_soup = BeautifulSoup(response.text, 'lxml')
    # read the results into a pandas dataframe
    race_df = pd.read_csv(io.StringIO(result_soup.text.strip()), index_col=False)
    # only keep the rows that have values in scored points; take a copy so
    # the column assignments below act on an independent frame rather than
    # on a slice of the parsed CSV
    race_df = race_df.loc[race_df['Scored Points'].notnull()].copy()

    # Run some calculations on results

    # calculate points delta
    race_df['Points Delta'] = race_df['Scored Points'] - race_df['Carried Points']
    # Add column with number of starters in each field
    race_df['Starters'] = race_df.apply(lookup_starter_count, race_df=race_df, axis=1)
    # Add column with number of finishers in each field
    race_df['Finishers'] = race_df.apply(lookup_finisher_counts, race_df=race_df, axis=1)
    # Add the race metadata as new columns; each value is the same for every row
    metadata = {
        'Race Name': name,
        'Date': date,
        'Location': location,
        'Beers': beers,
        'Moisture': moisture,
        'Accel': accel,
        'Tech': tech,
        'Elevation': elevation,
        'Weather': weather,
        'Temperature': temperature,
        'Wind': wind,
        'Coordinates': lat_lon,
    }
    for column, value in metadata.items():
        race_df[column] = value

    return race_df

In [6]:
# Read in the race data
# NOTE: unpickling can execute arbitrary code, so only load a 'races.pickle'
# that comes from a trusted source.
with open('races.pickle','rb') as read_file:
    races = pickle.load(read_file)

In [7]:
# The first element of races is the list of race page paths
race_paths = races[0]

In [8]:
# Scrape the first two races and combine their results into one main df.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every pass, so collect the per-race frames in a list and
# concatenate them once.
race_frames = [capture_race_data(path) for path in race_paths[:2]]
# pd.concat raises on an empty list, so fall back to an empty frame
all_race_data = pd.concat(race_frames) if race_frames else pd.DataFrame()
