In [4]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from sklearn.preprocessing import StandardScaler

In [5]:
# !conda install scipy -y

In [6]:
# !conda install scikit-learn -y

In [8]:
data = pd.read_csv('grand_prix_dataset.csv')
data

Unnamed: 0,Skater,Country Represented,Score,Country of Competition,Discipline,Season,Home Competition?
0,Adam Siao Him Fa,FR,217.52,US,Men,2021-2022,False
1,Adam Siao Him Fa,FR,243.29,FR,Men,2021-2022,True
2,Adam Siao Him Fa,FR,268.98,FR,Men,2022-2023,True
3,Adam Siao Him Fa,FR,250.45,JP,Men,2022-2023,False
4,Adam Siao Him Fa,FR,306.78,FR,Men,2023-2024,True
...,...,...,...,...,...,...,...
361,Yuhana Yokoi,JP,189.54,JP,Women,2019-2020,True
362,Yuma Kagiyama,JP,273.14,FR,Men,2023-2024,False
363,Yuma Kagiyama,JP,288.39,JP,Men,2023-2024,True
364,Yuzuru Hanyu,JP,322.59,CA,Men,2019-2020,False


In [37]:
if True in data['Score'].isna().values:
    skaters_to_drop = list(data[data['Score'].isna()]['Skater'])
    seasons_competed = list(data[data['Score'].isna()]['Season'])
    data = data[~data['Skater'].isin(skaters_to_drop) & ~data['Season'].isin(seasons_competed)]

    

In [39]:
# must standardize by discipline
scalers_by_discipline = {}
discipline_groups = data.groupby('Discipline')
# discipline_groups = [group for name, group in discipline_groups]
for name, group in discipline_groups:
    scaler = StandardScaler()
    scaler.fit(group['Score'].values.reshape(-1,1))
    scalers_by_discipline[name] = scaler
scalers_by_discipline

{'Dance': StandardScaler(),
 'Men': StandardScaler(),
 'Pairs': StandardScaler(),
 'Women': StandardScaler()}

In [40]:
data['Score'] = data.apply(lambda x: scalers_by_discipline[x['Discipline']].transform(np.array(x['Score']).reshape(-1,1))[0][0], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Score'] = data.apply(lambda x: scalers_by_discipline[x['Discipline']].transform(np.array(x['Score']).reshape(-1,1))[0][0], axis=1)


In [46]:
home_grand_prix_total_scores = data[data['Home Competition?'] == True]['Score'].values
other_grand_prix_total_scores = data[data['Home Competition?'] == False]['Score'].values

In [47]:
other_grand_prix_total_scores

array([ 0.08253069,  1.57095553, -1.14892313,  0.53747381,  2.19229405,
       -0.97858922,  1.18715529, -0.25590051, -0.60709357,  0.01911608,
       -0.91834419,  1.44559885, -0.70841821, -0.65221022,  0.50312965,
        1.03167716, -0.01179948, -0.54160344,  0.54960264, -0.06000013,
       -0.30314017, -1.51633501,  1.15736812,  1.17530514, -1.40061496,
       -0.92115206,  0.70457189,  0.17879851, -0.16959164,  0.96565755,
        1.16008585, -0.82546612,  0.04907768,  1.07637976,  1.72048185,
        1.08670713, -0.76189306,  0.0811469 ,  0.79482289, -0.35668784,
        0.70242006,  1.00408813,  0.95018331,  1.7665966 ,  0.33341627,
        1.20473448,  0.71564247,  1.66392569, -2.30407094, -0.51308769,
        1.27778576, -0.75007095,  0.03035973, -0.83977722, -1.32825881,
        0.98069112,  0.32314206, -2.42188277, -1.18368481, -0.98045247,
       -0.89606348, -1.30146645, -0.97496312,  0.60403822, -0.40858811,
       -1.12498184, -0.51838442, -1.11310604, -0.94829068, -0.48

In [30]:
# get values in array
home_grand_prix_total_score = data['scaled_home_grand_prix_total_score'].values
other_grand_prix_total_score = data['scaled_other_grand_prix_total_score'].values
home_grand_prix_total_score

array([-0.09764827, -0.27263687,  1.14764519, -0.50570079,  1.7255372 ,
        0.34256325, -0.43733554, -0.97164914, -0.95705519, -0.42554441,
        1.3470479 ,  1.506295  ,  1.61422756,  1.05948929, -0.47923822,
       -1.44547686, -0.4819958 , -1.17401904, -0.15890717,  1.0039853 ,
        0.13038391,  0.81756429,  0.07810513,  0.27981762, -0.10128844,
        0.76671196, -1.69303647,  0.54535063, -1.76661869,  0.50207553,
       -0.26857695, -0.82048269, -1.73696634,  0.79916351, -0.07683428,
       -1.28585601, -0.50302896, -0.81802048, -1.19617875,  1.41362229,
       -0.36768057, -1.39257575,  1.31007502,  0.70896608,  0.11200681,
       -0.87857803,  0.47398251,  0.13824369, -0.04335089, -0.29036757,
       -0.17671719, -0.38239639, -0.10733274, -1.74497055, -1.21114422,
        1.27577387,  1.11258007,  0.48574694, -0.30096181,  0.98223187,
        0.04231064,  1.11841744,  0.83591428,  0.94490431,  0.86724625,
       -1.26864675, -0.19930504, -0.45027961, -0.4333871 , -1.07

In [49]:
# t test
t_stat, p_val = ttest_rel(home_grand_prix_total_scores, other_grand_prix_total_scores, alternative='greater')
print(f'Test Statistic: {t_stat}')
print(f'P-Value: {p_val}')

Test Statistic: 2.5971093478367178
P-Value: 0.005188471067447645


In [51]:
# manual t test
differences = home_grand_prix_total_scores - other_grand_prix_total_scores
mean_differences = np.mean(differences)
var_differences = np.var(differences)
test_stat = mean_differences/(np.sqrt(var_differences)/np.sqrt(len(differences)))
test_stat

2.606111486953451

In [52]:
from scipy.stats import t
p_value = 1-t.cdf(test_stat, len(differences)-1)
print(f'P-Value: {p_value}')

P-Value: 0.005060150402822261
