In [1]:
from bs4 import BeautifulSoup

In [2]:
from urllib.request import urlopen

In [323]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import kendalltau

## Scrape scores from result pages

In [160]:
# Result-page URL for each event, keyed by a short event code.
# 'WR' is excluded when the season scores are assembled and is parsed
# separately as the ranking to compare against.
links = {
    'US': 'http://www.isuresults.com/results/season1718/gpusa2017/CAT001RS.HTM',
    'CA': 'http://www.isuresults.com/results/season1718/gpcan2017/CAT001RS.HTM',
    'FR': 'http://www.isuresults.com/results/season1718/gpfra2017/CAT001RS.HTM',
    'RU': 'http://www.isuresults.com/results/season1718/gprus2017/CAT001RS.HTM',
    'CN': 'http://www.isuresults.com/results/season1718/gpchn2017/CAT001RS.HTM',
    # This page has a different file name than the others; parse_score
    # special-cases the (2017, 'JP') combination with other column positions.
    'JP': 'http://www.isuresults.com/results/season1718/gpjpn2017/data0190.htm',
    'EU': 'http://www.isuresults.com/results/season1718/ec2018/CAT001RS.HTM',
    '4C': 'http://www.isuresults.com/results/season1718/fc2018/CAT001RS.HTM',
    'WR': 'http://www.isuresults.com/results/season1718/wc2018/CAT001RS.HTM'
}

In [249]:
def parse_score(link, event_name, year):
    """Download one result page and return a name/score table for it.

    Parameters
    ----------
    link : str
        URL of the result page to fetch.
    event_name : str
        Short event code; written into the ``event`` column of the result.
    year : int
        Used together with ``event_name`` to pick the column layout: the
        (2017, 'JP') page is read with different column positions.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``score`` (converted to float) and ``event``.

    Raises
    ------
    IndexError
        If the page contains no ``<table>`` element.
    """
    from io import StringIO  # local import: only needed to feed read_html

    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(link) as response:
        bs = BeautifulSoup(response.read(), 'html.parser')

    # Only the first <table> on the page is parsed.
    table_str = str(bs.find_all('table')[0])
    # Passing literal HTML to read_html is deprecated; hand it a file-like object.
    table = pd.read_html(StringIO(table_str))[0]
    if year == 2017 and event_name == 'JP':
        # Keep columns 1 and 5, drop incomplete rows, then discard the
        # first and last remaining rows.
        table = table[[1, 5]].dropna().iloc[1:-1]
    else:
        # Column 8 only takes part in the dropna() filter; after skipping the
        # first two remaining rows just the first two columns are kept.
        table = table[[1, 6, 8]].dropna().iloc[2:, :2]
    # Work on an independent copy so the assignments below never target a
    # slice of the parsed frame (avoids SettingWithCopyWarning).
    table = table.copy()
    table.columns = ['name', 'score']
    table['event'] = event_name
    table['score'] = table['score'].astype(float)
    return table

In [252]:
# Stack the per-event tables for every event except 'WR', then replace
# non-breaking spaces in the names and renumber the rows from zero.
scores = pd.concat(
    [parse_score(url, code, 2017) for code, url in links.items() if code != 'WR'],
    axis=0,
)
scores = (
    scores
    .assign(name=scores['name'].str.replace('\xa0', ' '))
    .reset_index(drop=True)
)

In [296]:
# Mean score per skater over all season events, highest average first.
season_avg = (
    scores
    .groupby('name')['score']
    .mean()
    .sort_values(ascending=False)
)

In [294]:
# Scores from the 'WR' page, highest first. The names receive the same
# non-breaking-space cleanup as the season `scores`; otherwise a skater whose
# name contains '\xa0' on this page would never match the season names.
world_score = parse_score(links['WR'], 'WR', 2017)
world_score = (
    world_score
    .assign(name=world_score['name'].str.replace('\xa0', ' '))
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)

## Implement the Kendall tau metric

In [301]:
# Restrict both rankings to skaters present in both tables, keeping each
# table's own ordering (season average vs. 'WR' score, best first).
season_ranking = season_avg.index[season_avg.index.isin(world_score['name'])].tolist()
world_ranking = world_score['name'][world_score['name'].isin(season_avg.index)].tolist()

In [320]:
# combinations() keeps input order, so a pair (a, b) means "a is ranked
# ahead of b" in that ranking.
season_pairs = set(combinations(season_ranking, 2))
world_pairs = set(combinations(world_ranking, 2))
# A pair ordered the same way in both rankings is concordant.
concordant_pairs = season_pairs & world_pairs
# tau = (concordant - discordant) / total, where the discordant pairs are the
# season pairs that are not concordant.
kendall = (len(concordant_pairs) - len(season_pairs - concordant_pairs)) / len(season_pairs)
kendall

0.5434782608695652

The result agrees with `kendalltau` from scipy:

In [334]:
# Positions 0..n-1, one per skater in the season ranking.
season_numeric_rank = [position for position, _ in enumerate(season_ranking)]
# For each skater in 'WR' order, that skater's position in the season ranking.
world_numeric_rank = [season_ranking.index(skater) for skater in world_ranking]

In [335]:
# Cross-check the hand-computed `kendall` value against scipy's implementation.
kendalltau(season_numeric_rank, world_numeric_rank)

KendalltauResult(correlation=0.5434782608695652, pvalue=0.00019870245032364205)