# Web scrape for results from the Hampton Ladies Triathlon
## From the Race Roster website

Lindsay Brin
mid-June 2018

See `raceroster_webscrape.ipynb` for more general code to get results from the Race Roster website.

In [61]:
# Import libraries
from lxml import html
from lxml import etree
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.plotly as py

## Read Hampton data from Race Roster website

In [3]:
# Get html
# Download the Hampton results page and parse it into an lxml element tree.
raceroster_hampton = "https://results.raceroster.com/results/wjvz7sruf3ngamgq"
# A timeout keeps this cell from hanging indefinitely if the server never answers.
page = requests.get(raceroster_hampton, timeout=30)
# Fail loudly on a 4xx/5xx response rather than silently parsing an error page,
# which would likely leave the xpath queries further down with nothing to match.
page.raise_for_status()
tree = html.fromstring(page.content)

## Process webscrape for Hampton

(Rockwood was farther down below, but it was moved to another notebook with more robust code overall.)

In [3]:
# Hampton processing
# Every results column sits at a fixed <td> position within each table row;
# the `[@*]` predicate keeps only cells that carry at least one attribute.
res_place, res_time_total = [
    tree.xpath(cell_query)
    for cell_query in (
        '//tr/td[1][@*]/text()',
        '//tr/td[2][@*]/text()',
    )
]

# The name is the exception: its text is nested inside a link (a href).
res_names = tree.xpath('//tr/td[3]/a[@*]/text()')

# The remaining columns follow the plain-cell pattern again.
(
    res_city,
    res_div,
    res_div_place,
    res_number,
    res_swim,
    res_t1,
    res_bike,
    res_t2,
    res_run,
) = [
    tree.xpath(cell_query)
    for cell_query in (
        '//tr/td[4][@*]/text()',
        '//tr/td[5][@*]/text()',
        '//tr/td[6][@*]/text()',
        '//tr/td[7][@*]/text()',
        '//tr/td[8][@*]/text()',
        '//tr/td[9][@*]/text()',
        '//tr/td[10][@*]/text()',
        '//tr/td[11][@*]/text()',
        '//tr/td[12][@*]/text()',
    )
]

Let me check the first few values of each new list.

In [4]:
# Names of the scraped lists, in table-column order.
varnames = [
    'res_place',
    'res_time_total',
    'res_names',
    'res_city',
    'res_div',
    'res_div_place',
    'res_number',
    'res_swim',
    'res_t1',
    'res_bike',
    'res_t2',
    'res_run',
]

# Look each list up by name and show its first three raw entries.
# NOTE(review): the length printed is that of the 3-item preview, not of the
# full list, so it cannot reveal columns of unequal length -- confirm intent.
for list_name in varnames:
    preview = globals()[list_name][0:3]
    print(list_name)
    print(len(preview))
    print(preview)
    print()

res_place
3
['\n1\n            ', '\n2\n            ', '\n3\n            ']

res_time_total
3
['\n01:15:21\n            ', '\n01:16:28\n            ', '\n01:17:26\n            ']

res_names
3
['\nEryn Weldon\n                ', '\nCarlie Lemoine\n                ', '\nMichaela Kearns\n                ']

res_city
3
['\nMoncton\n            ', '\nHampton\n            ', '\nLower Coverdale\n            ']

res_div
3
['\nf20-29\n            ', '\nf30-39\n            ', '\nf20-29\n            ']

res_div_place
3
['\n    1/28\n            ', '\n    1/77\n            ', '\n    2/28\n            ']

res_number
3
['\n245\n            ', '\n139\n            ', '\n116\n            ']

res_swim
3
['\n06:23\n            ', '\n08:03\n            ', '\n08:48\n            ']

res_t1
3
['\n02:26\n            ', '\n01:27\n            ', '\n01:44\n            ']

res_bike
3
['\n39:40\n            ', '\n39:47\n            ', '\n39:12\n            ']

res_t2
3
['\n01:44\n            ', '\n01:25\n            ', '\n01:32\n            ']

res_run
3
['\n25:11\n            ', '\n25:50\n            ', '\n26:12\n            ']

I need to clean up each list to remove the surrounding whitespace and `\n` characters.

In [5]:
# Spot check: trim the surrounding newlines/padding from the first three places.
[raw_text.strip() for raw_text in res_place[:3]]

['1', '2', '3']

But really, I want a dataframe, not a whole bunch of individual lists.

In [6]:
# Assemble the scraped columns into a single dataframe, trimming the newlines
# and padding that surround every cell's text. All values remain strings here.
#
# `pd.DataFrame.from_items` was deprecated in pandas 0.23 and removed in 1.0.
# The plain dict constructor keeps columns in insertion order on
# Python 3.6+ / pandas 0.23+, so the column order is unchanged.
scraped_columns = {
    'place': res_place,
    'time_total_hhmmss': res_time_total,
    'name': res_names,
    'city': res_city,
    'div': res_div,
    'div_place': res_div_place,
    'number': res_number,
    'swim_mmss': res_swim,
    't1_mmss': res_t1,
    'bike_mmss': res_bike,
    't2_mmss': res_t2,
    'run_mmss': res_run,
}

# pandas raises a ValueError here if the lists differ in length, which would
# mean some row was missing a cell and the columns are misaligned.
results = pd.DataFrame({
    column: [element.strip() for element in raw_values]
    for column, raw_values in scraped_columns.items()
})

In [7]:
# Preview the first five rows of the assembled table.
results.head(n=5)

Unnamed: 0,place,time_total_hhmmss,name,city,div,div_place,number,swim_mmss,t1_mmss,bike_mmss,t2_mmss,run_mmss
0,1,01:15:21,Eryn Weldon,Moncton,f20-29,1/28,245,06:23,02:26,39:40,01:44,25:11
1,2,01:16:28,Carlie Lemoine,Hampton,f30-39,1/77,139,08:03,01:27,39:47,01:25,25:50
2,3,01:17:26,Michaela Kearns,Lower Coverdale,f20-29,2/28,116,08:48,01:44,39:12,01:32,26:12
3,4,01:17:37,Lisanne Maurice,Moncton,f40-49,1/69,158,09:27,01:04,38:42,01:10,27:17
4,5,01:19:03,Jocelyn LeBlanc,Moncton,f20-29,3/28,135,09:00,01:27,38:32,01:07,28:59


Hooray!!

In [8]:
def time_split_sec(t):
    """Convert a clock-style time string into a total number of seconds.

    Two layouts are recognised:

    * ``hh:mm:ss`` -- two colons and exactly 8 characters
    * ``mm:ss``    -- one colon and exactly 5 characters (hours taken as 0)

    Returns an ``int`` for a recognised layout and ``None`` for anything else.
    A recognised layout with non-numeric fields raises ``ValueError``.
    """
    fields = t.split(':')
    # (number of fields, total string length) pairs that count as valid.
    if (len(fields), len(t)) not in ((3, 8), (2, 5)):
        return None
    if len(fields) == 2:
        # mm:ss has no hour field; pad with zero hours.
        fields = [0] + fields
    hours, minutes, seconds = (int(field) for field in fields)
    return hours * 3600 + minutes * 60 + seconds

In [9]:
# time_split_sec(results['swim_mmss'][0])

In [10]:
def time_split_min(t):
    """Convert a clock-style time string into total minutes as a float.

    Two layouts are recognised:

    * ``hh:mm:ss`` -- two colons and exactly 8 characters
    * ``mm:ss``    -- one colon and exactly 5 characters (hours taken as 0)

    Returns decimal minutes for a recognised layout and ``None`` for anything
    else. A recognised layout with non-numeric fields raises ``ValueError``.
    """
    colon_count = t.count(":")
    is_hhmmss = colon_count == 2 and len(t) == 8
    is_mmss = colon_count == 1 and len(t) == 5
    if not (is_hhmmss or is_mmss):
        return None

    # Peel the fields off from the right so both layouts share one code path.
    fields = t.split(':')
    seconds = fields.pop()
    minutes = fields.pop()
    hours = fields.pop() if fields else 0
    return int(hours) * 60 + int(minutes) + int(seconds) / 60

In [11]:
# print(time_split_min(results['swim_mmss'][0]))
# print(time_split_min(results['time_total_hhmmss'][0]))

Add columns that hold the result times in decimal minutes.

In [12]:
# Derive a decimal-minutes column from each clock-string column.
# Pairs are (new column, source column), in the order the columns are added.
for minutes_col, clock_col in (
    ("time_total", 'time_total_hhmmss'),
    ("swim", 'swim_mmss'),
    ("bike", 'bike_mmss'),
    ("run", 'run_mmss'),
    ("t1", 't1_mmss'),
    ("t2", 't2_mmss'),
):
    results[minutes_col] = [time_split_min(clock) for clock in results[clock_col]]

In [13]:
# Preview the first five rows, now including the decimal-minute columns.
results.head(n=5)

Unnamed: 0,place,time_total_hhmmss,name,city,div,div_place,number,swim_mmss,t1_mmss,bike_mmss,t2_mmss,run_mmss,time_total,swim,bike,run,t1,t2
0,1,01:15:21,Eryn Weldon,Moncton,f20-29,1/28,245,06:23,02:26,39:40,01:44,25:11,75.35,6.383333,39.666667,25.183333,2.433333,1.733333
1,2,01:16:28,Carlie Lemoine,Hampton,f30-39,1/77,139,08:03,01:27,39:47,01:25,25:50,76.466667,8.05,39.783333,25.833333,1.45,1.416667
2,3,01:17:26,Michaela Kearns,Lower Coverdale,f20-29,2/28,116,08:48,01:44,39:12,01:32,26:12,77.433333,8.8,39.2,26.2,1.733333,1.533333
3,4,01:17:37,Lisanne Maurice,Moncton,f40-49,1/69,158,09:27,01:04,38:42,01:10,27:17,77.616667,9.45,38.7,27.283333,1.066667,1.166667
4,5,01:19:03,Jocelyn LeBlanc,Moncton,f20-29,3/28,135,09:00,01:27,38:32,01:07,28:59,79.05,9.0,38.533333,28.983333,1.45,1.116667


In [14]:
# Show the first bike time at full float precision (row label 0).
results.loc[0, "bike"]

39.666666666666664

Write dataset to file so that I can do analysis in a different notebook.

In [15]:
# Save the processed table, without the row index, for the analysis notebook.
output_csv = '../Data_output/results_Hampton2018.csv'
results.to_csv(output_csv, index=False)