# Google Colab notebook

In [3]:
from bs4 import BeautifulSoup
import requests
import time, os
import pickle
import re
import io
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
from google.colab import drive
# Mount Google Drive at the relative path 'drive'; later cells read from and
# copy files into 'drive/My Drive/Colab'.
drive.mount('drive')

## Scrape data from each race

In [10]:
# Functions to do the bulk of the work

 
def lookup_starter_count(row, race_df):
    """Return how many rows of race_df share this row's 'Category Name'.

    Intended for use with ``race_df.apply(..., axis=1)``: `row` is a single
    result row and `race_df` is the full results table for the race.
    """
    category = row['Category Name']
    # Tally of rows per category across the whole race
    rows_per_category = race_df['Category Name'].value_counts()
    return rows_per_category[category]

def lookup_finisher_counts(row, race_df):
    """Return the number of finishers in this row's category.

    The finisher count of a category is taken to be the highest value in
    'Place' that can be converted to an integer among that category's rows.
    Placings that cannot be converted (non-numeric text, NaN, None) are
    ignored.

    Args:
        row: a single result row; its 'Category Name' selects the category.
        race_df: the full results table, with 'Category Name' and 'Place'
            columns.

    Returns:
        The highest integer placing in the row's category, or 0 when the
        category has no integer placing at all.
    """
    finisher_counts = {}
    for cat, place in zip(race_df['Category Name'], race_df['Place']):
        try:
            place = int(place)
        except (ValueError, TypeError):
            # Not a numeric placing, so this row is not counted as a finisher
            continue
        # Keep the largest placing seen so far for this category
        finisher_counts[cat] = max(finisher_counts.get(cat, place), place)
    # A category in which nobody has a numeric placing has zero finishers;
    # a plain dict lookup would raise KeyError here.
    return finisher_counts.get(row['Category Name'], 0)


# Column layout of the empty frame returned when a race's results cannot be loaded
_RESULT_COLUMNS = [
    'Category Name', 'Place', 'RacerID', 'First Name', 'Last Name', 'Team Name',
    'Time', 'License', 'Carried Points', 'Scored Points', 'Points Delta', 'Starters',
    'Finishers', 'Race Name', 'Date', 'Location', 'Beers', 'Moisture', 'Accel',
    'Tech', 'Elevation', 'Weather', 'Temperature', 'Wind', 'Coordinates',
]

# Captures the first double-quoted argument of a GetMap("...") call in the page script
_MAP_COORDS_RE = re.compile(r'GetMap\("(.*?)"')


def _or_nan(getter):
    """Return getter(), or NaN if the page lacks the element it looks for."""
    try:
        return getter()
    # Best-effort scraping: any parsing failure yields NaN, but (unlike a bare
    # `except:`) KeyboardInterrupt and SystemExit still propagate.
    except Exception:
        return np.nan


def _rating(soup, css_class):
    """Return the first whitespace-separated token of the div with class css_class, or NaN."""
    return _or_nan(lambda: soup.find("div", {"class": css_class}).text.split()[0])


def capture_race_data(race_path, timeout=None):
    """Capture metadata and results from an individual race.

    Downloads the race page at 'https://www.crossresults.com' + race_path,
    scrapes the race metadata from it, downloads the results file linked
    first under the page's "downloadoptions" span, and returns the results
    with the metadata repeated on every row.

    Args:
        race_path: path of the race page, appended to the site root.
        timeout: optional timeout passed to both ``requests.get`` calls.
            The default ``None`` waits indefinitely.

    Returns:
        A DataFrame of the rows that have a 'Scored Points' value, with
        'Points Delta', 'Starters', 'Finishers' and the race metadata added.
        Metadata that cannot be scraped is NaN. If the results cannot be
        downloaded or parsed, an empty DataFrame with the `_RESULT_COLUMNS`
        layout is returned instead.

    Raises:
        Whatever ``requests.get`` raises for the race page itself; only the
        results download is treated as best-effort.
    """
    # Get html from individual race page
    response = requests.get('https://www.crossresults.com' + race_path, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    # Race metadata: the title div holds ' • '-separated fields
    # (name, date, location) and ends with a line of conditions.
    title_text = _or_nan(lambda: soup.find("div", {"id": "resultstitle"}).text)
    main = _or_nan(lambda: title_text.split(' • '))
    name = _or_nan(lambda: main[0])
    # Collapse runs of whitespace inside the date field
    date = _or_nan(lambda: ' '.join(main[1].split()))
    location = _or_nan(lambda: main[2].split('\r')[0].strip())

    # Rating widgets: the first token of each widget's text is kept
    beers = _rating(soup, "beerrating rating")
    moisture = _rating(soup, "moisturerating rating")
    accel = _rating(soup, "accelrating rating")
    tech = _rating(soup, "techrating rating")
    elevation = _rating(soup, "elevationrating rating")

    # Conditions: last line of the title div, split on commas
    conditions = _or_nan(lambda: title_text.strip().split('\n')[-1].strip())
    condition_parts = _or_nan(lambda: conditions.split(','))
    weather = _or_nan(lambda: condition_parts[0])
    temperature = _or_nan(lambda: condition_parts[1].strip().split()[0])
    wind = _or_nan(lambda: condition_parts[2].strip().split()[1])

    # The coordinates are the first quoted argument of GetMap(...) in the
    # first javascript <script> tag inside the content article.
    script = _or_nan(
        lambda: soup.find('article', {'id': 'content'}).find('script', {'type': 'text/javascript'})
    )
    lat_lon = _or_nan(lambda: _MAP_COORDS_RE.findall(script.string)[0])

    # Capture results
    try:
        result_path = soup.find("span", {"class": "downloadoptions"}).find_all('a')[0]['href']
        result_response = requests.get('https://www.crossresults.com/' + result_path,
                                       timeout=timeout)
        result_soup = BeautifulSoup(result_response.text, 'lxml')
        # read the results into a pandas dataframe
        race_df = pd.read_csv(io.StringIO(result_soup.text.strip()), index_col=False)
        # only keep the rows that have values in scored points; copy so the
        # columns added below are written to an independent frame
        race_df = race_df.loc[race_df['Scored Points'].notna()].copy()
    except Exception:
        # Missing download link, failed request or unparsable file: no results
        race_df = pd.DataFrame(columns=_RESULT_COLUMNS)

    # As long as there are data in the df...
    if len(race_df) > 0:
        # calculate points delta
        race_df['Points Delta'] = race_df['Scored Points'] - race_df['Carried Points']
        # Add column with number of starters in each field
        race_df['Starters'] = race_df.apply(lookup_starter_count, race_df=race_df, axis=1)
        # Add column with number of finishers in each field
        race_df['Finishers'] = race_df.apply(lookup_finisher_counts, race_df=race_df, axis=1)

        # Add the race metadata as new columns, one scalar broadcast per
        # column (assigning a tuple to a list of new columns is not supported
        # by every pandas version).
        metadata = {
            'Race Name': name, 'Date': date, 'Location': location,
            'Beers': beers, 'Moisture': moisture, 'Accel': accel,
            'Tech': tech, 'Elevation': elevation, 'Weather': weather,
            'Temperature': temperature, 'Wind': wind, 'Coordinates': lat_lon,
        }
        for column, value in metadata.items():
            race_df[column] = value

    return race_df

In [4]:
# Read in the race data
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that you created yourself.
with open('drive/My Drive/Colab/races.pickle','rb') as read_file:
    races = pickle.load(read_file)
# Keep the first element of the unpickled object.
# NOTE(review): presumably a sequence of race page paths (each one is passed
# to capture_race_data below) -- confirm against the cell that wrote the pickle.
race_paths = races[0]

In [None]:
last_idx = len(race_paths)-1

for i in range(500, len(race_paths)):
 
    # Initialize a new main df for every 1000 races
    if (i%500==0):
        all_race_data = pd.DataFrame()
    
    # Get all the data for an individual race and append to main df
    path = race_paths[i]
    all_race_data = all_race_data.append(capture_race_data(path))

    # Write out the data every 500 races and at the end
    if (i in range(499, 8000, 500)) or (i==last_idx):
        file_name = 'all_race_data_' + str(i) + '.csv'
        all_race_data.to_csv(file_name)
        !cp $file_name "drive/My Drive/Colab"
        
    print(i,'\t',datetime.now().time())

