In [4]:
from bs4 import BeautifulSoup

In [5]:
from urllib.request import urlopen

In [182]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from itertools import combinations
from scipy.stats import kendalltau

## Scrape scores from result pages

In [7]:
# Result-page URL for each event, keyed by a short event code.
# All pages live under the same season directory, so only the
# event-specific tail is spelled out for each entry.
links = {
    event: 'http://www.isuresults.com/results/season1718/' + page
    for event, page in (
        ('US', 'gpusa2017/CAT001RS.HTM'),
        ('CA', 'gpcan2017/CAT001RS.HTM'),
        ('FR', 'gpfra2017/CAT001RS.HTM'),
        ('RU', 'gprus2017/CAT001RS.HTM'),
        ('CN', 'gpchn2017/CAT001RS.HTM'),
        ('JP', 'gpjpn2017/data0190.htm'),  # differently named page; parse_score special-cases it
        ('EU', 'ec2018/CAT001RS.HTM'),
        ('4C', 'fc2018/CAT001RS.HTM'),
        ('WR', 'wc2018/CAT001RS.HTM'),
    )
}

In [8]:
def parse_score(link, event_name, year):
    """Download one result page and return its skater scores as a DataFrame.

    Parameters
    ----------
    link : str
        URL of the result page to download.
    event_name : str
        Short event code; stored verbatim in the ``event`` column.
    year : int
        Stored verbatim in the ``year`` column. Together with ``event_name``
        it also selects which column layout is parsed.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``score`` (float), ``event`` and ``year``.
    """
    from io import StringIO  # local import: pd.read_html expects a file-like object

    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(link) as response:
        bs = BeautifulSoup(response.read(), 'html.parser')

    # Only the first <table> on the page is parsed.
    table_str = str(bs.find_all('table')[0])
    table = pd.read_html(StringIO(table_str))[0]
    if year == 2017 and event_name == 'JP':
        # This page has name in column 1 and score in column 5; the first and
        # last remaining rows are discarded (presumably header/footer rows).
        table = table[[1, 5]].dropna().iloc[1:-1]
    else:
        # Name in column 1, score in column 6. Column 8 only takes part in the
        # dropna() filter and is then cut off by keeping the first two columns.
        table = table[[1, 6, 8]].dropna().iloc[2:, :2]
    # Take an independent copy so the assignments below do not write into a
    # slice of the parsed table (avoids SettingWithCopyWarning).
    table = table.copy()
    table.columns = ['name', 'score']
    table['event'] = event_name
    table['score'] = table['score'].astype(float)
    table['year'] = year
    return table

In [9]:
# The three commented-out lines below build the score table by scraping every
# page in `links` with parse_score. The active line loads it from a local CSV
# instead (presumably a saved copy of that scrape -- TODO confirm).
# scores = pd.concat((parse_score(link, event_name, 2017) for event_name, link in links.items()), axis=0)
# scores['name'] = scores['name'].str.replace('\xa0', ' ')
# scores.reset_index(drop=True, inplace=True)
# Column labels are supplied explicitly through `names=`.
scores = pd.read_csv('scores/2017.csv', names=['name', 'score', 'event', 'year'])

In [327]:
# Rows for every event except 'WR' feed the models below; the copy keeps later
# column additions from touching `scores`.
season_scores = scores[scores['event'].ne('WR')].copy()
# 'WR' rows become a name-indexed Series of scores used as the reference ranking.
world_scores = (
    scores.loc[scores['event'].eq('WR'), ['name', 'score']]
    .set_index('name')
    .squeeze()
)

## Implement the Kendall tau metric

In [328]:
def return_ranking(skater_scores, world_scores):
    """Rank the skaters common to both score Series, once per Series.

    Both inputs are Series indexed by skater name. Each is ordered from
    highest to lowest value, and only names present in both are kept.

    Returns
    -------
    tuple of (list, list)
        The shared names in ``skater_scores`` order, then the same names in
        ``world_scores`` order.
    """
    season_order = skater_scores.sort_values(ascending=False).index
    world_order = world_scores.sort_values(ascending=False).index
    return (
        list(season_order.intersection(world_order)),
        list(world_order.intersection(season_order)),
    )

In [78]:
def calculate_kendall_tau(skater_ranking, world_ranking, verbose=True):
    """Compute Kendall's tau between two orderings of the same items.

    Parameters
    ----------
    skater_ranking, world_ranking : sequence
        The same items listed in two (possibly different) orders.
    verbose : bool, default True
        When true, print how many ordered pairs the two rankings share.

    Returns
    -------
    float
        ``(2 * concordant - total) / total`` where ``total`` is the number of
        ordered pairs in ``skater_ranking``. Returns ``nan`` when there are
        fewer than two items, since no pair exists to compare.
    """
    # combinations() yields (earlier, later) tuples, so a tuple found in both
    # sets is a pair that both rankings order the same way.
    skater_pairs = set(combinations(skater_ranking, 2))
    world_pairs = set(combinations(world_ranking, 2))
    n_pairs = len(skater_pairs)
    if n_pairs == 0:
        # Tau is undefined without at least one pair; avoid dividing by zero.
        return float('nan')
    n_concordant_pairs = len(skater_pairs & world_pairs)
    if verbose:
        print(f'There are {n_concordant_pairs} concordant_pairs out of {n_pairs} pairs')
    tau = (2 * n_concordant_pairs - n_pairs) / n_pairs
    return tau

## Skater mean model

In [34]:
# Mean season score per skater, highest first.
season_avg = (
    season_scores
    .groupby('name')['score']
    .mean()
    .sort_values(ascending=False)
)
season_avg.head()

name
Shoma UNO           290.786667
Yuzuru HANYU        290.770000
Nathan CHEN         284.835000
Javier FERNANDEZ    277.440000
Boyang JIN          270.486667
Name: score, dtype: float64

In [79]:
# Rank the shared skaters by season mean and by world score, then compare.
skater_ranking_avg, world_ranking = return_ranking(
    skater_scores=season_avg,
    world_scores=world_scores,
)
calculate_kendall_tau(
    skater_ranking=skater_ranking_avg,
    world_ranking=world_ranking,
)

There are 213 concordant_pairs out of 276 pairs


0.5434782608695652

Result agrees with kendalltau from scipy

In [106]:
# Cross-check against scipy. Each skater is encoded by their position in the
# mean-model ranking; the world ranking is then expressed in those positions.
# The ranking used here is `skater_ranking_avg` (the name returned by the
# mean-model cell); a bare `skater_ranking` is never defined in this notebook
# and only worked through leftover kernel state.
season_numeric_rank = list(range(len(skater_ranking_avg)))
world_numeric_rank = [skater_ranking_avg.index(skater) for skater in world_ranking]
kendalltau(season_numeric_rank, world_numeric_rank)

KendalltauResult(correlation=0.5434782608695652, pvalue=0.00019870245032364205)

RMSE with mean model

In [68]:
# Attach each skater's season mean to every one of their scores; the mean
# column arrives as 'score_avg' because of the suffixes.
score_comparison = season_scores.merge(
    season_avg.to_frame(),
    left_on='name',
    right_index=True,
    suffixes=['', '_avg'],
)
score_comparison['sq_error'] = (
    score_comparison['score'].sub(score_comparison['score_avg']).pow(2)
)
# Root of the mean squared error.
np.sqrt(score_comparison['sq_error'].mean())

10.099364225465914

## Normalized mean model

In [100]:
# Standardize scores within each event: subtract the event mean and divide by
# the event standard deviation.
season_scores['score_normed'] = (
    season_scores
    .groupby('event')['score']
    .transform(lambda event_scores: (event_scores - event_scores.mean()) / event_scores.std())
)

In [102]:
# Mean standardized score per skater, highest first.
season_normed_avg = (
    season_scores
    .groupby('name')['score_normed']
    .mean()
    .sort_values(ascending=False)
)
season_normed_avg.head()

name
Shoma UNO           1.813727
Nathan CHEN         1.587136
Javier FERNANDEZ    1.574291
Yuzuru HANYU        1.487765
Sergei VORONOV      1.300975
Name: score_normed, dtype: float64

In [105]:
# Rank the shared skaters by normalized season mean and by world score, then compare.
skater_ranking_normed_avg, world_ranking = return_ranking(
    skater_scores=season_normed_avg,
    world_scores=world_scores,
)
calculate_kendall_tau(
    skater_ranking=skater_ranking_normed_avg,
    world_ranking=world_ranking,
)

There are 202 concordant_pairs out of 276 pairs


0.463768115942029

## Linear model

In [301]:
# One-hot encode skater and event. drop_first removes one baseline level from
# each, and the empty prefix keeps the raw names as column labels.
# dtype=float is required: recent pandas returns boolean dummies (older
# versions uint8), and `@` on such arrays performs logical or overflow-prone
# integer arithmetic rather than the floating-point products needed below.
dummies = pd.get_dummies(
    season_scores[['name', 'event']],
    prefix=['', ''],
    prefix_sep='',
    drop_first=True,
    dtype=float,
)
X = dummies.values
X = np.insert(X, 0, 1, axis=1)  # prepend the intercept column of ones
y = season_scores['score'].values
# Ordinary least squares via the normal equations (X'X) b = X'y, solved
# directly rather than by forming the explicit inverse of X'X.
coefs = np.linalg.solve(X.T @ X, X.T @ y)
coefs

array([198.91109205,  86.01409275,  16.96695109,  -3.51386001,
        55.48285325,  59.90717173,  22.03744112, -21.10109205,
        82.47834931,  16.72206901, -21.21223782,  22.73776218,
        -3.70109205, -12.33223782,  -6.31255888,  25.25397219,
        48.82779627,  59.2511296 , -13.00109205,  39.28890795,
        -6.71223782, -18.65109205,  23.45749635,  48.94834931,
       -18.02247603,  18.3681855 , -22.78223782,   2.78776218,
        69.13744117,  88.46315656,  38.94649612,  -1.23109205,
        21.16523006,  12.94890795,  51.0981855 ,  41.78170778,
        59.51752397,  31.46556751,  36.00834931, -48.68109205,
        35.25      ,  30.99776218,  58.31020515,  41.42372591,
        80.4237811 ,  61.55063886,  44.5769236 ,  40.21151152,
       101.99372056,  40.27523006,  56.54523006,   9.49149612,
        19.78776218,  10.72166485,  24.52      ,  44.04      ,
        86.09909275,  98.44323517, -21.84109205,   8.19776218,
       -17.79223782,  -3.02223782,  20.398046  ,  -6.66

Double check with sklearn's LinearRegression

In [302]:
# X already carries an intercept column of ones, so the estimator must not add
# its own; fit() returns the fitted estimator, which allows the chained call.
lin = LinearRegression(fit_intercept=False).fit(X, y)
lin.coef_, lin.intercept_

(array([198.91109205,  86.01409275,  16.96695109,  -3.51386001,
         55.48285325,  59.90717173,  22.03744112, -21.10109205,
         82.47834931,  16.72206901, -21.21223782,  22.73776218,
         -3.70109205, -12.33223782,  -6.31255888,  25.25397219,
         48.82779627,  59.2511296 , -13.00109205,  39.28890795,
         -6.71223782, -18.65109205,  23.45749635,  48.94834931,
        -18.02247603,  18.3681855 , -22.78223782,   2.78776218,
         69.13744117,  88.46315656,  38.94649612,  -1.23109205,
         21.16523006,  12.94890795,  51.0981855 ,  41.78170778,
         59.51752397,  31.46556751,  36.00834931, -48.68109205,
         35.25      ,  30.99776218,  58.31020515,  41.42372591,
         80.4237811 ,  61.55063886,  44.5769236 ,  40.21151152,
        101.99372056,  40.27523006,  56.54523006,   9.49149612,
         19.78776218,  10.72166485,  24.52      ,  44.04      ,
         86.09909275,  98.44323517, -21.84109205,   8.19776218,
        -17.79223782,  -3.02223782,  20.

Add removed baseline skater back to skater scores

In [311]:
# get_dummies(drop_first=True) emits the skater dummies first, followed by one
# dummy per event except the dropped baseline. Derive that count from the data
# instead of hard-coding it, so the cell still works if the event set changes.
n_event_dummies = season_scores['event'].nunique() - 1
skater_columns = dummies.columns[:len(dummies.columns) - n_event_dummies]
# The baseline skater is the one name that has no dummy column.
removed_skater = list(set(season_scores['name']) - set(skater_columns))[0]
removed_skater

'Abzal RAKIMGALIEV'

In [317]:
# coefs is laid out as [intercept, skater dummies..., event dummies...].
# Derive the number of skater dummies from the data instead of assuming a
# fixed number of event columns.
n_event_dummies = season_scores['event'].nunique() - 1
n_skater_dummies = len(dummies.columns) - n_event_dummies
linear_scores = pd.Series(
    coefs[1:1 + n_skater_dummies],
    index=dummies.columns[:n_skater_dummies],
)
# The baseline skater has no coefficient of their own: score 0 by construction.
linear_scores[removed_skater] = 0

In [329]:
# Rank the shared skaters by linear-model coefficient and by world score.
linear_ranking, world_ranking = return_ranking(
    skater_scores=linear_scores,
    world_scores=world_scores,
)

In [333]:
# Kendall tau of the linear-model ranking against the world ranking.
calculate_kendall_tau(
    skater_ranking=linear_ranking,
    world_ranking=world_ranking,
)

There are 220 concordant_pairs out of 276 pairs


0.5942028985507246

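Alternative encoding: keep a dummy for every skater and drop only the first event, so that only the event baseline is absorbed into the intercept. This doesn't work: the skater dummies sum to the intercept column, so `X.T @ X` is singular and cannot be inverted reliably.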

In [300]:
# skater_dummies = pd.get_dummies(season_scores['name'])
# event_dummies = pd.get_dummies(season_scores['event'], drop_first=True)
# X = pd.concat([skater_dummies, event_dummies], axis=1).values
# X = np.insert(X, 0, 1, axis=1)
# y = season_scores['score'].values
# coefs = np.linalg.inv(X.T @ X) @ (X.T @ y)
# coefs

## 