In [4]:
from bs4 import BeautifulSoup

In [5]:
from urllib.request import urlopen

In [6]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import kendalltau

## Scrap scores from result pages

In [7]:
links = {
    'US': 'http://www.isuresults.com/results/season1718/gpusa2017/CAT001RS.HTM',
    'CA': 'http://www.isuresults.com/results/season1718/gpcan2017/CAT001RS.HTM',
    'FR': 'http://www.isuresults.com/results/season1718/gpfra2017/CAT001RS.HTM',
    'RU': 'http://www.isuresults.com/results/season1718/gprus2017/CAT001RS.HTM',
    'CN': 'http://www.isuresults.com/results/season1718/gpchn2017/CAT001RS.HTM',
    'JP': 'http://www.isuresults.com/results/season1718/gpjpn2017/data0190.htm',
    'EU': 'http://www.isuresults.com/results/season1718/ec2018/CAT001RS.HTM',
    '4C': 'http://www.isuresults.com/results/season1718/fc2018/CAT001RS.HTM',
    'WR': 'http://www.isuresults.com/results/season1718/wc2018/CAT001RS.HTM'
}

In [8]:
def parse_score(link, event_name, year):
    html = urlopen(link)
    bs = BeautifulSoup(html.read(), 'html.parser')

    table_str = str(bs.findAll('table')[0])
    table = pd.read_html(table_str)[0]
    if year == 2017 and event_name == 'JP':
        table = table[[1, 5]].dropna().iloc[1:-1]
    else:
        table = table[[1, 6, 8]].dropna().iloc[2:, :2]
    table.columns = ['name', 'score']
    table['event'] = event_name
    table['score'] = table['score'].astype(float)
    table['year'] = year
    return table

In [9]:
# scores = pd.concat((parse_score(link, event_name, 2017) for event_name, link in links.items()), axis=0)
# scores['name'] = scores['name'].str.replace('\xa0', ' ')
# scores.reset_index(drop=True, inplace=True)
scores = pd.read_csv('scores/2017.csv', names=['name', 'score', 'event', 'year'])

In [10]:
season_scores = scores.loc[scores['event'] != 'WR']
world_scores = scores.loc[scores['event'] == 'WR']

In [11]:
season_avg = season_scores.groupby('name')['score'].mean().sort_values(ascending=False)
season_avg.head()

name
Shoma UNO           290.786667
Yuzuru HANYU        290.770000
Nathan CHEN         284.835000
Javier FERNANDEZ    277.440000
Boyang JIN          270.486667
Name: score, dtype: float64

## Implement kendall tau metrics

In [12]:
season_ranking = list(season_avg.loc[season_avg.index.isin(world_scores['name'])].index)
world_ranking = list(world_scores.loc[world_scores['name'].isin(season_avg.index), 'name'])

In [13]:
season_pairs = set(combinations(season_ranking, 2))
world_pairs = set(combinations(world_ranking, 2))
concordant_pairs = set(season_pairs) & set(world_pairs)
kendall = (2 * len(concordant_pairs) - len(season_pairs)) / len(season_pairs)
kendall

0.5434782608695652

Result agrees with kendalltau from scipy

In [14]:
season_numeric_rank = list(range(len(season_ranking)))
world_numeric_rank = list(season_ranking.index(skater) for skater in world_ranking)

In [15]:
kendalltau(season_numeric_rank, world_numeric_rank)

KendalltauResult(correlation=0.5434782608695652, pvalue=0.00019870245032364205)

RMSE with mean model

In [20]:
score_comparison = pd.merge(season_scores, season_avg.to_frame(), left_on='name', right_index=True, suffixes=['', '_avg'])
score_comparison['sq_error'] = (score_comparison['score'] - score_comparison['score_avg'])**2
score_com

(116, 5)