In [26]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from sklearn.preprocessing import StandardScaler

In [27]:
data = pd.read_excel('figure_skating.xlsx', sheet_name='Sheet1')
data

Unnamed: 0,discipline,skaters,home_grand_prix_total_score,other_grand_prix_total_score,season
0,men,Kevin Aymoz,250.03,279.09,2023-2024
1,pairs,Chelsea Liu/Balazs Nagy,177.66,172.60,2023-2024
2,dance,Madison Chock/Evan Bates,212.96,209.46,2023-2024
3,dance,Caroline Green/Michael Parsons,185.07,189.33,2023-2024
4,men,Ilia Malinin,310.47,304.68,2023-2024
...,...,...,...,...,...
85,women,Mana Kawabe,205.44,186.52,2021-2022
86,women,Rino Matsuike,186.17,184.36,2021-2022
87,pairs,Anastasia Mishina/Aleksandr Galliamov,226.98,227.28,2021-2022
88,dance,Victoria Sinitsina/Nikita Katsalapov,211.72,215.44,2021-2022


In [28]:
# must standardize by discipline
scalers_by_discipline = {}
data_discipline = data.groupby(['discipline'])
discipline_groups = [group for name, group in data_discipline]
for group in discipline_groups:
    home_scores = group['home_grand_prix_total_score'].values
    away_scores = group['other_grand_prix_total_score'].values
    all_scores = np.append(home_scores, away_scores)
    scaler = StandardScaler()
    scaler.fit(all_scores.reshape(-1,1))
    scalers_by_discipline[group.iloc[0,0]] = scaler
scalers_by_discipline
    

  discipline_groups = [group for name, group in data_discipline]


{'dance': StandardScaler(),
 'men': StandardScaler(),
 'pairs': StandardScaler(),
 'women': StandardScaler()}

In [29]:
# scale the data
data['scaled_home_grand_prix_total_score'] = data.apply(lambda x: scalers_by_discipline[x['discipline']].transform(np.array(x['home_grand_prix_total_score']).reshape(-1,1))[0][0], axis=1)
data['scaled_other_grand_prix_total_score'] = data.apply(lambda x: scalers_by_discipline[x['discipline']].transform(np.array(x['other_grand_prix_total_score']).reshape(-1,1))[0][0], axis=1)

In [30]:
# get values in array
home_grand_prix_total_score = data['scaled_home_grand_prix_total_score'].values
other_grand_prix_total_score = data['scaled_other_grand_prix_total_score'].values
home_grand_prix_total_score

array([-0.09764827, -0.27263687,  1.14764519, -0.50570079,  1.7255372 ,
        0.34256325, -0.43733554, -0.97164914, -0.95705519, -0.42554441,
        1.3470479 ,  1.506295  ,  1.61422756,  1.05948929, -0.47923822,
       -1.44547686, -0.4819958 , -1.17401904, -0.15890717,  1.0039853 ,
        0.13038391,  0.81756429,  0.07810513,  0.27981762, -0.10128844,
        0.76671196, -1.69303647,  0.54535063, -1.76661869,  0.50207553,
       -0.26857695, -0.82048269, -1.73696634,  0.79916351, -0.07683428,
       -1.28585601, -0.50302896, -0.81802048, -1.19617875,  1.41362229,
       -0.36768057, -1.39257575,  1.31007502,  0.70896608,  0.11200681,
       -0.87857803,  0.47398251,  0.13824369, -0.04335089, -0.29036757,
       -0.17671719, -0.38239639, -0.10733274, -1.74497055, -1.21114422,
        1.27577387,  1.11258007,  0.48574694, -0.30096181,  0.98223187,
        0.04231064,  1.11841744,  0.83591428,  0.94490431,  0.86724625,
       -1.26864675, -0.19930504, -0.45027961, -0.4333871 , -1.07

In [32]:
# t test
t_stat, p_val = ttest_rel(home_grand_prix_total_score, other_grand_prix_total_score, alternative='greater')
print(f'Test Statistic: {t_stat}')
print(f'P-Value: {p_val}')

Test Statistic: 1.6456478062103965
P-Value: 0.0516817816582294


In [39]:
# manual t test
differences = home_grand_prix_total_score - other_grand_prix_total_score
mean_differences = np.mean(differences)
var_differences = np.var(differences)
test_stat = mean_differences/(np.sqrt(var_differences)/np.sqrt(len(differences)))
test_stat

1.6548671937877102

In [38]:
from scipy.stats import t
p_value = 1-t.cdf(test_stat, len(differences)-1)
print(f'P-Value: {p_value}')

P-Value: 0.05073696111822423
