In [398]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPError
import csv

## Save working links for scores to CSV

In [399]:
# Map each season's two-digit start year to the two-digit year that follows
# it ('05' -> '06', ..., '17' -> '18'); both pieces are substituted into the
# result-page URL templates.
next_year_dict = {}
for year in range(5, 18):
    next_year = year + 1
    # zfill(2) left-pads single-digit years with a zero ('5' -> '05').
    year_str = str(year).zfill(2)
    next_year_str = str(next_year).zfill(2)
    next_year_dict[year_str] = next_year_str

In [400]:
def parse_link(template, verbose=False):
    """Try to open a URL and return the response, or None on an HTTP error.

    Parameters
    ----------
    template : str
        The fully formatted URL to request (callers pass a filled-in URL
        template).
    verbose : bool, default False
        When True, print whether the request succeeded or failed.

    Returns
    -------
    object or None
        The open response object returned by ``urlopen``; the caller is
        responsible for closing it. ``None`` if ``urlopen`` raised
        ``HTTPError``.
    """
    try:
        html = urlopen(template)
    except HTTPError:
        # Only HTTP-level failures (e.g. a 404) are treated as "this URL does
        # not exist"; any other error is left to propagate so that it is not
        # silently recorded as a missing link.
        if verbose:
            print('failure:', template)
        return None
    if verbose:
        print('success:', template)
    return html

In [401]:
# Candidate URL patterns for result pages. They are filled in with
# template.format(event, year_str, next_year_str, gender), so the
# placeholders are:
#   {0} event code, {1} two-digit season start year,
#   {2} two-digit following year, {3} category digit.
# The order matters: the link-saving loops try the patterns top to bottom
# and keep the first one that opens.

# Patterns used for the events listed in gp_events.
gp_templates = [    
    'http://www.isuresults.com/results/gp{0}{1}/CAT00{3}RS.HTM',
    'http://www.isuresults.com/results/gp{0}20{1}/CAT00{3}RS.HTM',
    'http://www.isuresults.com/results/gp{0}{1}{2}/CAT00{3}RS.HTM',
    'http://www.isuresults.com/results/season{1}{2}/gp{0}{1}{2}/CAT00{3}RS.HTM',
    'http://www.isuresults.com/results/season{1}{2}/gp{0}{1}{2}/data0{3}90.htm',
    'http://www.isuresults.com/results/season{1}{2}/gp{0}20{1}/CAT00{3}RS.HTM',
    'http://www.isuresults.com/results/season{1}{2}/gp{0}20{1}/data0{3}90.htm'      
]
# Patterns used for every other event; these embed the following year ({2})
# in the event folder name.
cp_templates = [
    'http://www.isuresults.com/results/{0}20{2}/CAT00{3}RS.HTM',
    'http://www.isuresults.com/results/season{1}{2}/{0}20{2}/CAT00{3}RS.HTM'
]

In [402]:
# Event codes as they appear in the result URLs, split by which set of URL
# templates applies to them.
gp_events = ['usa', 'can', 'fra', 'rus', 'chn', 'jpn', 'f']
cp_events = ['ec', 'fc', 'owg', 'wc']
events = [*gp_events, *cp_events]

# Two-character labels written to the CSV, listed in the same order as `events`.
event_new_names = ['US', 'CA', 'FR', 'RU', 'CN', 'JP', 'FN', 'EU', '4C', 'OL', 'WR']
event_name_dict = {code: label for code, label in zip(events, event_new_names)}
event_name_dict

{'usa': 'US',
 'can': 'CA',
 'fra': 'FR',
 'rus': 'RU',
 'chn': 'CN',
 'jpn': 'JP',
 'f': 'FN',
 'ec': 'EU',
 'fc': '4C',
 'owg': 'OL',
 'wc': 'WR'}

In [431]:
# Probe every event/season combination and save the first URL pattern that
# opens to links/male.csv (columns: event, year, url).
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, rows can end up separated by blank lines on some platforms.
with open('links/male.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['event', 'year', 'url'])
    for event in events:
        templates = gp_templates if event in gp_events else cp_templates
        for year_str, next_year_str in next_year_dict.items():
            # Category digit substituted for {3} in the URL. It is '1' except
            # for a few event/season pages that use a different number
            # (NOTE(review): values look empirically determined -- confirm).
            if next_year_str == '12' and event == 'wc':
                gender = '5'
            elif next_year_str in ['11', '12'] and event in ['ec', 'wc']:
                gender = '4'
            else:
                gender = '1'
            for template in templates:
                url = template.format(event, year_str, next_year_str, gender)
                response = parse_link(url)
                if response is None:
                    continue
                # Only the fact that the URL opens matters here; release the
                # connection instead of leaving it open.
                response.close()
                writer.writerow([event_name_dict[event], int('20'+next_year_str), url])
                break

5


In [432]:
# Probe every event/season combination and save the first URL pattern that
# opens to links/female.csv (columns: event, year, url).
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, rows can end up separated by blank lines on some platforms.
with open('links/female.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['event', 'year', 'url'])
    for event in events:
        templates = gp_templates if event in gp_events else cp_templates
        for year_str, next_year_str in next_year_dict.items():
            # Category digit substituted for {3} in the URL. It is '2' except
            # for a few event/season pages that use a different number
            # (NOTE(review): values look empirically determined -- confirm).
            if next_year_str == '12' and event == 'wc':
                gender = '6'
            elif next_year_str in ['11', '12'] and event in ['ec', 'wc']:
                gender = '5'
            else:
                gender = '2'
            for template in templates:
                url = template.format(event, year_str, next_year_str, gender)
                response = parse_link(url)
                if response is None:
                    continue
                # Only the fact that the URL opens matters here; release the
                # connection instead of leaving it open.
                response.close()
                writer.writerow([event_name_dict[event], int('20'+next_year_str), url])
                break

## Parse scores from saved links

In [433]:
# Load the saved links (columns: event, year, url) and reshape them into an
# event x year table of URLs, so a single link can be looked up with
# male_links.loc[event, year].
# DataFrame.pivot with column names is used instead of passing Series to
# pd.pivot: the array-style call is not accepted by current pandas versions.
male_links_long = pd.read_csv('links/male.csv')
male_links = male_links_long.pivot(index='event', columns='year', values='url')

In [500]:
# Spot-check one page: look up the link saved under event 'WR' / year 2018 and
# show the raw name and score columns without post-processing.
# NOTE: parse_score is defined in a later cell of this notebook; that cell has
# to be run before this one.
link = male_links.loc['WR', 2018]
print(link)
# 'XX' and 9999 are placeholders: event_name and year are not used when
# process=False.
raw_table = parse_score(link, 'XX', 9999, process=False)
raw_table[[1, 6]].dropna().iloc[2:, :].reset_index(drop=True)

http://www.isuresults.com/results/season1718/wc2018/CAT001RS.HTM


Unnamed: 0,1,6
0,Nathan CHEN,321.4
1,Shoma UNO,273.77
2,Mikhail KOLYADA,272.32
3,Alexei BYCHENKO,258.28
4,Kazuki TOMONO,256.11
5,Deniss VASILJEVS,254.86
6,Dmitri ALIEV,252.3
7,Keegan MESSING,252.3
8,Misha GE,249.57
9,Michal BREZINA,243.99


In [225]:
def parse_score(link, event_name, year, process=True):
    """Download a results page and return its first HTML table as a DataFrame.

    Parameters
    ----------
    link : str
        URL of the results page.
    event_name : str
        Event label written to the ``event`` column; together with ``year``
        it also selects which table layout rule is applied.
    year : int
        Year written to the ``year`` column.
    process : bool, default True
        When True, reduce the raw table to the columns ``name``, ``score``,
        ``event`` and ``year``. When False, return the table as parsed.

    Returns
    -------
    pandas.DataFrame or None
        The (optionally processed) table, or ``None`` if reading or
        processing the page raised; in that case the link and the error are
        printed.

    Raises
    ------
    Any exception raised by ``urlopen`` itself propagates to the caller,
    because the request is made outside the ``try`` block.
    """
    import io  # local import: only needed to hand the markup to read_html

    # Table layout rules: which raw columns to keep before dropping incomplete
    # rows, and which of the remaining rows hold skater results. Pages not
    # listed here use default_rule.
    # NOTE(review): the two 2017 entries mirror the branches that were
    # hard-coded here before; confirm them against the actual pages.
    default_rule = {'cols': [1, 6, 8], 'rows': slice(2, None)}
    process_rules = {
        (2017, 'JP'): {'cols': [1, 5], 'rows': slice(1, -1)},
        (2017, 'FN'): {'cols': [1, 5], 'rows': slice(1, None)},
    }

    html = urlopen(link)
    try:
        bs = BeautifulSoup(html.read(), 'html.parser')

        table_str = str(bs.find_all('table')[0])
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas versions.
        table = pd.read_html(io.StringIO(table_str))[0]
        if process:
            rule = process_rules.get((year, event_name), default_rule)
            # Keep the first two of the selected columns (name and score).
            # .copy() makes the slice an independent frame so the column
            # assignments below do not write into a view of the raw table.
            table = table[rule['cols']].dropna().iloc[rule['rows'], :2].copy()
            table.columns = ['name', 'score']
            table['event'] = event_name
            table['score'] = table['score'].astype(float)
            table['year'] = year
        return table
    except Exception as e:
        # Best effort: report the page that failed and let the caller carry on.
        print(link)
        print(e)
        return None
    finally:
        # Always release the HTTP connection, including on parse errors.
        html.close()

In [None]:
# Parse every page in `links` (iterated as event name -> URL; `links` is not
# defined in this part of the notebook -- TODO confirm where it comes from)
# and stack the per-event tables into one frame.
score_tables = [
    parse_score(link, event_name, 2017)
    for event_name, link in links.items()
]
pd.concat(score_tables, axis=0)

Unnamed: 0,name,score,event,year


In [214]:
# Smoke-test every saved link: parse_score prints the link and the error for
# any page it cannot parse. male_links is the event x year table of URLs, so
# walk it row by row and skip event/year combinations that have no link.
# (Previously the loop variable was never used and stale globals were passed
# to parse_score instead.)
for event_name, urls_by_year in male_links.iterrows():
    for year, url in urls_by_year.dropna().items():
        parse_score(url, event_name, year)

0      http://www.isuresults.com/results/gpusa05/CAT0...
1      http://www.isuresults.com/results/gpusa06/CAT0...
2      http://www.isuresults.com/results/gpusa07/CAT0...
3      http://www.isuresults.com/results/gpusa08/CAT0...
4      http://www.isuresults.com/results/gpusa09/CAT0...
5      http://www.isuresults.com/results/gpusa2010/CA...
6      http://www.isuresults.com/results/gpusa2011/CA...
7      http://www.isuresults.com/results/gpusa2012/CA...
8      http://www.isuresults.com/results/gpusa2013/CA...
9      http://www.isuresults.com/results/gpusa2014/CA...
10     http://www.isuresults.com/results/season1516/g...
11     http://www.isuresults.com/results/season1617/g...
12     http://www.isuresults.com/results/season1718/g...
13     http://www.isuresults.com/results/gpcan05/CAT0...
14     http://www.isuresults.com/results/gpcan06/CAT0...
15     http://www.isuresults.com/results/gpcan07/CAT0...
16     http://www.isuresults.com/results/gpcan08/CAT0...
17     http://www.isuresults.co

In [58]:
# Build the combined 2017 score table. `links` is iterated as
# event name -> URL (it is not defined in this part of the notebook --
# TODO confirm where it comes from).
score_tables = (
    parse_score(link, event_name, 2017)
    for event_name, link in links.items()
)
scores = (
    pd.concat(score_tables, axis=0)
    # Replace non-breaking spaces in the names with ordinary spaces.
    .assign(name=lambda frame: frame['name'].str.replace('\xa0', ' '))
    # Renumber the rows 0..n-1 across all events.
    .reset_index(drop=True)
)
# Temporarily lift the display limits so the whole table is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(scores)

                      name   score event  year
0              Nathan CHEN  275.88    US  2017
1              Adam RIPPON  266.45    US  2017
2           Sergei VORONOV  257.49    US  2017
3               Boyang JIN  246.03    US  2017
4                  Han YAN  228.33    US  2017
5               Ross MINER  219.62    US  2017
6            Takahito MURA  212.77    US  2017
7               Liam FIRUS  210.83    US  2017
8           Kevin REYNOLDS  204.05    US  2017
9           Roman SADOVSKY  200.10    US  2017
10               Shoma UNO  301.10    CA  2017
11             Jason BROWN  261.14    CA  2017
12       Alexander SAMARIN  250.06    CA  2017
13            Patrick CHAN  245.70    CA  2017
14         Jorik HENDRICKX  237.31    CA  2017
15          Michal BREZINA  237.04    CA  2017
16          Nicolas NADEAU  229.43    CA  2017
17          Keegan MESSING  217.75    CA  2017
18            Jun Hwan CHA  210.32    CA  2017
19              Paul FENTZ  201.60    CA  2017
20           

In [61]:
# Persist the combined score table; index=False keeps the row index out of
# the CSV.
scores.to_csv('scores/2017.csv', index=False)