## Implications of Site Popularity

### Node count vs. rank

In [1]:
import pandas as pd
from scipy.stats import kruskal
import os

# Load the table that carries a node_count and a rank_bucket column.
path = os.getcwd() + "/datasets/stat_test/nodeCount_rank.csv"
df = pd.read_csv(path)

# Kruskal-Wallis takes one sample per group, so collect the node_count values
# of every distinct rank_bucket (buckets in order of first appearance).
bucket_column = df['rank_bucket']
data_groups = [
    df.loc[bucket_column == bucket, 'node_count']
    for bucket in bucket_column.unique()
]

# H-test for a difference in node_count between the rank buckets.
result = kruskal(*data_groups)

print(result)


KruskalResult(statistic=395.3066132009497, pvalue=2.8731308925763685e-84)


In [3]:
import pandas as pd
from scipy.stats import kruskal
import os

# Load the table that carries a sim_all and a rank_bucket column.
path = os.getcwd() + "/datasets/stat_test/childrenSim_rank.csv"
df = pd.read_csv(path)

# One sample of sim_all values per distinct rank_bucket, as required by the
# Kruskal-Wallis test.
unique_groups = df['rank_bucket'].unique()
groups = [
    df.loc[df['rank_bucket'] == bucket, 'sim_all']
    for bucket in unique_groups
]

result = kruskal(*groups)

# Effect size: epsilon-squared = H / (n - 1), where n is the number of rows
# in the loaded table.
n_rows = len(df)
epsilon_squared = result.statistic / (n_rows - 1)

print(f"Kruskal-Wallis H-statistic: {result.statistic}")
print(f"Kruskal-Wallis p-value: {result.pvalue}")
print(f"Epsilon-squared: {epsilon_squared}")


Kruskal-Wallis H-statistic: 9277.995204838822
Kruskal-Wallis p-value: 0.0
Epsilon-squared: 0.001574842966253196


In [4]:
import pandas as pd
from scipy.stats import kruskal
import os

# Load the tracker table; it is expected to expose node_count and rank_bucket.
path = os.getcwd() + "/datasets/stat_test/tracker_rank.csv"
df = pd.read_csv(path)

# Split node_count into one sample per rank_bucket value so the samples can be
# passed to the Kruskal-Wallis test as separate arguments.
node_counts = df['node_count']
data_groups = [
    node_counts[df['rank_bucket'] == label]
    for label in df['rank_bucket'].unique()
]

# Kruskal-Wallis H-test across all rank buckets.
result = kruskal(*data_groups)

print(result)


KruskalResult(statistic=216.7278518858997, pvalue=9.484599954085243e-46)


### Similarity of children vs. rank

In [5]:
import pandas as pd
from scipy.stats import kruskal
import os

# Load the table that carries a sim_all and a rank_bucket column.
path = os.getcwd() + "/datasets/stat_test/childrenSim_rank.csv"
df = pd.read_csv(path)

# Kruskal-Wallis needs each group as its own sample: gather the sim_all values
# belonging to every distinct rank_bucket.
sim_values = df['sim_all']
rank_buckets = df['rank_bucket']
data_groups = [
    sim_values[rank_buckets == level]
    for level in rank_buckets.unique()
]

# H-test for a difference in sim_all between the rank buckets.
result = kruskal(*data_groups)

print(result)


KruskalResult(statistic=9277.995204838822, pvalue=0.0)


In [7]:
import pandas as pd
from scipy.stats import kruskal
import os

# The dataset is a zipped CSV; read_csv is called without an explicit
# compression argument, so pandas infers it from the ".zip" extension.
path = os.getcwd() + "/datasets/stat_test/parentSim_rank.zip"
df = pd.read_csv(path)

# Collect the eval_all values of each distinct rank_bucket as a separate
# sample for the Kruskal-Wallis test.
data_groups = [
    df.loc[df['rank_bucket'] == bucket, 'eval_all']
    for bucket in df['rank_bucket'].unique()
]

# Kruskal-Wallis H-test across all rank buckets.
result = kruskal(*data_groups)

print(result)


KruskalResult(statistic=162782.07086873578, pvalue=0.0)


## S.4 thirdParty_sim



In [12]:
import pandas as pd
from scipy.stats import ttest_rel
import os

# Zipped CSV with (at least) the is_third_party and eval_all columns.
path = os.getcwd() + "/datasets/stat_test/s4.thirdParty_sim.zip"

df = pd.read_csv(path)

# Paired (related-samples) t-test: the two columns are compared row by row.
# NOTE(review): is_third_party looks like a 0/1 indicator by its name; confirm
# that pairing it against eval_all is the intended comparison rather than
# comparing eval_all between the two is_third_party groups.
third_party_flag = df['is_third_party']
eval_scores = df['eval_all']
t, p = ttest_rel(third_party_flag, eval_scores)

# Report the statistic and p-value rounded to three decimals.
print(f"t = {t:.3f}, p = {p:.3f}")


t = 1794.811, p = 0.000


## S.5 simVSchildrenCount

In [15]:
# Wilcoxon signed-rank test
# `os` is imported here so the cell does not rely on an earlier cell having
# imported it (the path below is built with os.getcwd()).
import os

import pandas as pd
from scipy.stats import wilcoxon

# Zipped CSV with (at least) the ct_avg_all and sim_all columns.
path = os.getcwd() + "/datasets/stat_test/s5.simVSchildrenCount.zip"

df = pd.read_csv(path)

# Perform the Wilcoxon signed-rank test, since our data are not normally distributed.
# The two columns are paired row by row; only the p-value is kept.
# NOTE(review): confirm ct_avg_all and sim_all are on comparable scales, since
# the test works on their row-wise differences.
_, p = wilcoxon(df['ct_avg_all'], df['sim_all'])

# Print the results
print(f"Wilcoxon signed-rank test: p = {p:.3f}")


Wilcoxon signed-rank test: p = 0.000


## S.6 interaction-desktopVStreeDepth


In [18]:
import os
import pandas as pd
from scipy.stats import mannwhitneyu

# Location of the zipped CSV, relative to the current working directory.
path = os.getcwd() + "/datasets/stat_test/s6.interaction-desktopVStreeDepth.zip"

# Read the table, stating the zip compression explicitly.
df = pd.read_csv(path, compression='zip')

# Two independent samples of tree_depth, selected by the value of the f0_
# column (0 vs. 1).
flag = df['f0_']
group0 = df.loc[flag == 0, 'tree_depth']
group1 = df.loc[flag == 1, 'tree_depth']

# Mann-Whitney U test between the two samples.
stat, p = mannwhitneyu(group0, group1)

print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=128595468010247.500, p=0.000


## S.7 resource_type_sim_parent

In [17]:
# `os` is imported here so the cell does not rely on an earlier cell having
# imported it (the path below is built with os.getcwd()).
import os

import pandas as pd
from scipy.stats import kruskal

# CSV with (at least) the resource_type and sim columns.
path = os.getcwd() + "/datasets/stat_test/s7_resource_type_sim_parent.csv"

df = pd.read_csv(path)

# Perform the Kruskal-Wallis test: one sample of sim values per resource_type.
# Only the p-value is kept.
_, p = kruskal(*[group['sim'] for name, group in df.groupby('resource_type')])

# Print the results
print(f"Kruskal-Wallis test: p = {p:.3f}")


Kruskal-Wallis test: p = 0.000


## S.8 resource_type_sim_children

In [6]:
import pandas as pd
from scipy.stats import f_oneway
import os
import statsmodels.api as sm
from statsmodels.formula.api import ols

# CSV columns (per the original note): resource_type, sim, f0_, pct
path = os.getcwd() + "/datasets/stat_test/s8_resource_type_sim_children.csv"

df = pd.read_csv(path)

# Ordinary least squares fit of sim on resource_type and pct.
formula = 'sim~resource_type+pct'
model = ols(formula, data=df).fit()

# ANOVA table for the fitted model (typ=2); left as the last expression so the
# notebook displays it.
sm.stats.anova_lm(model, typ=2)


Unnamed: 0,sum_sq,df,F,PR(>F)
resource_type,39.544608,12.0,1347.049692,0.0
pct,58.728961,1.0,24006.558522,0.0
Residual,1290.179636,527385.0,,
