# Mann-Whitney U test: Compare Group Performance

In [20]:
from scipy.stats import mannwhitneyu, shapiro
import pandas as pd
from scripts.utils import rank_biserial

## Compare: All

In [21]:
# Load one results table per group, in the order: 6hum, 6bots, 1bot_5hum, 3bots_3hum.
df_6hum, df_6bots, df_1bot_5hum, df_3bots_3hum = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../csv_data/nonsem_6hum.csv',
        '../csv_data/6bots.csv',
        '../csv_data/nonsem_1bot_5hum.csv',
        '../csv_data/nonsem_3bots_3hum.csv',
    )
)

# Pull the 'ItemsFound' column out of every table.
items_found_6hum, items_found_6bots, items_found_1bot_5hum, items_found_3bots_3hum = (
    table['ItemsFound']
    for table in (df_6hum, df_6bots, df_1bot_5hum, df_3bots_3hum)
)

# Pull the 'Score' column out of every table.
score_6hum, score_6bots, score_1bot_5hum, score_3bots_3hum = (
    table['Score']
    for table in (df_6hum, df_6bots, df_1bot_5hum, df_3bots_3hum)
)

In [12]:
# Shapiro-Wilk normality check on the 'ItemsFound' samples of each group.
(
    shapiro_items_found_6hum,
    shapiro_items_found_6bots,
    shapiro_items_found_1bot_5hum,
    shapiro_items_found_3bot_3hum,
) = (
    shapiro(sample)
    for sample in (
        items_found_6hum,
        items_found_6bots,
        items_found_1bot_5hum,
        items_found_3bots_3hum,
    )
)

# Element [1] of each Shapiro-Wilk result is reported as the p-value.
print(
    f"P-value of Shapiro-Wilk test for 6hum (ItemsFound): {shapiro_items_found_6hum[1]}",
    f"P-value of Shapiro-Wilk test for 6bots (ItemsFound): {shapiro_items_found_6bots[1]}",
    f"P-value of Shapiro-Wilk test for 1b5h (ItemsFound): {shapiro_items_found_1bot_5hum[1]}",
    f"P-value of Shapiro-Wilk test for 3b3h (ItemsFound): {shapiro_items_found_3bot_3hum[1]}",
    sep="\n",
)
print("\n")

# Same normality check on the 'Score' samples.
(
    shapiro_score_6hum,
    shapiro_score_6bots,
    shapiro_score_1bot_5hum,
    shapiro_score_3bot_3hum,
) = (
    shapiro(sample)
    for sample in (
        score_6hum,
        score_6bots,
        score_1bot_5hum,
        score_3bots_3hum,
    )
)

print(
    f"P-value of Shapiro-Wilk test for 6hum (Scores): {shapiro_score_6hum[1]}",
    f"P-value of Shapiro-Wilk test for 6bots (Scores): {shapiro_score_6bots[1]}",
    f"P-value of Shapiro-Wilk test for 1b5h (Scores): {shapiro_score_1bot_5hum[1]}",
    f"P-value of Shapiro-Wilk test for 3b3h (Scores): {shapiro_score_3bot_3hum[1]}",
    sep="\n",
)

P-value of Shapiro-Wilk test for 6hum (ItemsFound): 2.064731201263366e-12
P-value of Shapiro-Wilk test for 6bots (ItemsFound): 1.607155298539195e-13
P-value of Shapiro-Wilk test for 1b5h (ItemsFound): 0.00011544959914790341
P-value of Shapiro-Wilk test for 3b3h (ItemsFound): 1.3964101444826274e-10


P-value of Shapiro-Wilk test for 6hum (Scores): 3.884644752840973e-36
P-value of Shapiro-Wilk test for 6bots (Scores): 1.2070404188865794e-33
P-value of Shapiro-Wilk test for 1b5h (Scores): 1.707065247688503e-20
P-value of Shapiro-Wilk test for 3b3h (Scores): 2.340710740333886e-21


#### Compare: 6 bots vs 6 humans (Preliminary Study)

In [13]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 6-bots sample first and
# the 6-humans sample second (alternative='greater').
stat_items_6bots_vs_6hum, p_items_6bots_vs_6hum = mannwhitneyu(
    items_found_6bots,
    items_found_6hum,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_6bots_vs_6hum = rank_biserial(
    items_found_6bots,
    items_found_6hum,
    alternative='greater',
)
print(
    "Items 6 bots vs 6 humans: "
    f"U-statistic = {stat_items_6bots_vs_6hum}, "
    f"p-value = {p_items_6bots_vs_6hum}, "
    f"Rank-Biserial = {r_items_6bots_vs_6hum}"
)

# 'Score': same one-sided test and effect size.
stat_score_6bots_vs_6hum, p_score_6bots_vs_6hum = mannwhitneyu(
    score_6bots,
    score_6hum,
    alternative='greater',
)
r_score_6bots_vs_6hum = rank_biserial(
    score_6bots,
    score_6hum,
    alternative='greater',
)
print(
    "Score 6 bots vs 6 humans: "
    f"U-statistic = {stat_score_6bots_vs_6hum}, "
    f"p-value = {p_score_6bots_vs_6hum}, "
    f"Rank-Biserial = {r_score_6bots_vs_6hum}"
)

Items 6 bots vs 6 humans: U-statistic = 108218.5, p-value = 3.7544720164112874e-14, Rank-Biserial = 0.3002030469050365
Score 6 bots vs 6 humans: U-statistic = 109807.0, p-value = 9.668901554336394e-16, Rank-Biserial = 0.3192882545174933


#### Compare: 3 bots, 3 humans vs 6 humans

In [14]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 3-bots/3-humans sample
# first and the 6-humans sample second (alternative='greater').
stat_items_3bots_3hum_vs_6hum, p_items_3bots_3hum_vs_6hum = mannwhitneyu(
    items_found_3bots_3hum,
    items_found_6hum,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_3bots_3hum_vs_6hum = rank_biserial(
    items_found_3bots_3hum,
    items_found_6hum,
    alternative='greater',
)
print(
    "Items 3 bots, 3 humans vs 6 humans: "
    f"U-statistic = {stat_items_3bots_3hum_vs_6hum}, "
    f"p-value = {p_items_3bots_3hum_vs_6hum}, "
    f"Rank-Biserial = {r_items_3bots_3hum_vs_6hum}"
)

# 'Score': same one-sided test and effect size.
stat_score_3bots_3hum_vs_6hum, p_score_3bots_3hum_vs_6hum = mannwhitneyu(
    score_3bots_3hum,
    score_6hum,
    alternative='greater',
)
r_score_3bots_3hum_vs_6hum = rank_biserial(
    score_3bots_3hum,
    score_6hum,
    alternative='greater',
)
print(
    "Score 3 bots, 3 humans vs 6 humans: "
    f"U-statistic = {stat_score_3bots_3hum_vs_6hum}, "
    f"p-value = {p_score_3bots_3hum_vs_6hum}, "
    f"Rank-Biserial = {r_score_3bots_3hum_vs_6hum}"
)

Items 3 bots, 3 humans vs 6 humans: U-statistic = 58383.5, p-value = 2.303272931313881e-13, Rank-Biserial = 0.34996994080651134
Score 3 bots, 3 humans vs 6 humans: U-statistic = 59418.0, p-value = 5.7460015459563286e-15, Rank-Biserial = 0.37389012208657046


#### Compare: 6 bots vs 3 bots, 3 humans

In [15]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 6-bots sample first and
# the 3-bots/3-humans sample second (alternative='greater').
stat_items_6bots_vs_3bots_3hum, p_items_6bots_vs_3bots_3hum = mannwhitneyu(
    items_found_6bots,
    items_found_3bots_3hum,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_6bots_vs_3bots_3hum = rank_biserial(
    items_found_6bots,
    items_found_3bots_3hum,
    alternative='greater',
)
print(
    "Items 6 bots vs 3 bots, 3 humans: "
    f"U-statistic = {stat_items_6bots_vs_3bots_3hum}, "
    f"p-value = {p_items_6bots_vs_3bots_3hum}, "
    f"Rank-Biserial = {r_items_6bots_vs_3bots_3hum}"
)

# 'Score': same one-sided test and effect size.
stat_score_6bots_vs_3bots_3hum, p_score_6bots_vs_3bots_3hum = mannwhitneyu(
    score_6bots,
    score_3bots_3hum,
    alternative='greater',
)
r_score_6bots_vs_3bots_3hum = rank_biserial(
    score_6bots,
    score_3bots_3hum,
    alternative='greater',
)
print(
    "Score 6 bots vs 3 bots, 3 humans: "
    f"U-statistic = {stat_score_6bots_vs_3bots_3hum}, "
    f"p-value = {p_score_6bots_vs_3bots_3hum}, "
    f"Rank-Biserial = {r_score_6bots_vs_3bots_3hum}"
)


Items 6 bots vs 3 bots, 3 humans: U-statistic = 36764.5, p-value = 0.9990481863023576, Rank-Biserial = -0.14991444691083977
Score 6 bots vs 3 bots, 3 humans: U-statistic = 36889.5, p-value = 0.9988061706309225, Rank-Biserial = -0.14702413984461704


##### Compare: 3 bots, 3 humans vs 6 bots (secondary analysis)


In [16]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 3-bots/3-humans sample
# first and the 6-bots sample second (alternative='greater').
stat_items_3bots_3hum_vs_6bots, p_items_3bots_3hum_vs_6bots = mannwhitneyu(
    items_found_3bots_3hum,
    items_found_6bots,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_3bots_3hum_vs_6bots = rank_biserial(
    items_found_3bots_3hum,
    items_found_6bots,
    alternative='greater',
)
print(
    "Items 3 bots, 3 humans vs 6 bots: "
    f"U-statistic = {stat_items_3bots_3hum_vs_6bots}, "
    f"p-value = {p_items_3bots_3hum_vs_6bots}, "
    f"Rank-Biserial = {r_items_3bots_3hum_vs_6bots}"
)

# 'Score': same one-sided test and effect size.
stat_score_3bots_3hum_vs_6bots, p_score_3bots_3hum_vs_6bots = mannwhitneyu(
    score_3bots_3hum,
    score_6bots,
    alternative='greater',
)
r_score_3bots_3hum_vs_6bots = rank_biserial(
    score_3bots_3hum,
    score_6bots,
    alternative='greater',
)
print(
    "Score 3 bots, 3 humans vs 6 bots: "
    f"U-statistic = {stat_score_3bots_3hum_vs_6bots}, "
    f"p-value = {p_score_3bots_3hum_vs_6bots}, "
    f"Rank-Biserial = {r_score_3bots_3hum_vs_6bots}"
)

Items 3 bots, 3 humans vs 6 bots: U-statistic = 49731.5, p-value = 0.0009533557062600396, Rank-Biserial = 0.14991444691083977
Score 3 bots, 3 humans vs 6 bots: U-statistic = 49606.5, p-value = 0.0011957225208056297, Rank-Biserial = 0.14702413984461704


#### Compare: 1 bot, 5 humans vs 6 humans

In [17]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 1-bot/5-humans sample
# first and the 6-humans sample second (alternative='greater').
stat_items_1bot_5hum_vs_6hum, p_items_1bot_5hum_vs_6hum = mannwhitneyu(
    items_found_1bot_5hum,
    items_found_6hum,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_1bot_5hum_vs_6hum = rank_biserial(
    items_found_1bot_5hum,
    items_found_6hum,
    alternative='greater',
)
print(
    "Items 1 bot, 5 humans vs 6 humans: "
    f"U-statistic = {stat_items_1bot_5hum_vs_6hum}, "
    f"p-value = {p_items_1bot_5hum_vs_6hum}, "
    f"Rank-Biserial = {r_items_1bot_5hum_vs_6hum}"
)

# 'Score': same one-sided test and effect size.
stat_score_1bot_5hum_vs_6hum, p_score_1bot_5hum_vs_6hum = mannwhitneyu(
    score_1bot_5hum,
    score_6hum,
    alternative='greater',
)
r_score_1bot_5hum_vs_6hum = rank_biserial(
    score_1bot_5hum,
    score_6hum,
    alternative='greater',
)
print(
    "Score 1 bot, 5 humans vs 6 humans: "
    f"U-statistic = {stat_score_1bot_5hum_vs_6hum}, "
    f"p-value = {p_score_1bot_5hum_vs_6hum}, "
    f"Rank-Biserial = {r_score_1bot_5hum_vs_6hum}"
)

Items 1 bot, 5 humans vs 6 humans: U-statistic = 27380.0, p-value = 0.10418394988284796, Rank-Biserial = 0.07372549019607844
Score 1 bot, 5 humans vs 6 humans: U-statistic = 27507.5, p-value = 0.08967589441768631, Rank-Biserial = 0.07872549019607833


#### Compare: 6 bots vs 1 bot, 5 humans

In [18]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 6-bots sample first and
# the 1-bot/5-humans sample second (alternative='greater').
stat_items_6bots_vs_1bot_5hum, p_items_6bots_vs_1bot_5hum = mannwhitneyu(
    items_found_6bots,
    items_found_1bot_5hum,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_6bots_vs_1bot_5hum = rank_biserial(
    items_found_6bots,
    items_found_1bot_5hum,
    alternative='greater',
)
print(
    "Items 6 bots vs 1 bot, 5 humans: "
    f"U-statistic = {stat_items_6bots_vs_1bot_5hum}, "
    f"p-value = {p_items_6bots_vs_1bot_5hum}, "
    f"Rank-Biserial = {r_items_6bots_vs_1bot_5hum}"
)

# 'Score': same one-sided test and effect size.
stat_score_6bots_vs_1bot_5hum, p_score_6bots_vs_1bot_5hum = mannwhitneyu(
    score_6bots,
    score_1bot_5hum,
    alternative='greater',
)
r_score_6bots_vs_1bot_5hum = rank_biserial(
    score_6bots,
    score_1bot_5hum,
    alternative='greater',
)
print(
    "Score 6 bots vs 1 bot, 5 humans: "
    f"U-statistic = {stat_score_6bots_vs_1bot_5hum}, "
    f"p-value = {p_score_6bots_vs_1bot_5hum}, "
    f"Rank-Biserial = {r_score_6bots_vs_1bot_5hum}"
)


Items 6 bots vs 1 bot, 5 humans: U-statistic = 31548.0, p-value = 2.464962449035513e-05, Rank-Biserial = 0.23717647058823532
Score 6 bots vs 1 bot, 5 humans: U-statistic = 31749.5, p-value = 1.410956335957599e-05, Rank-Biserial = 0.24507843137254892


#### Compare: 3 bots, 3 humans vs 1 bot, 5 humans

In [19]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 3-bots/3-humans sample
# first and the 1-bot/5-humans sample second (alternative='greater').
stat_items_3bots_3hum_vs_1bot_5hum, p_items_3bots_3hum_vs_1bot_5hum = mannwhitneyu(
    items_found_3bots_3hum,
    items_found_1bot_5hum,
    alternative='greater',
)
# Effect size from the project helper `rank_biserial`, called with the same arguments.
r_items_3bots_3hum_vs_1bot_5hum = rank_biserial(
    items_found_3bots_3hum,
    items_found_1bot_5hum,
    alternative='greater',
)
print(
    "Items 3 bots, 3 humans vs 1 bot, 5 humans: "
    f"U-statistic = {stat_items_3bots_3hum_vs_1bot_5hum}, "
    f"p-value = {p_items_3bots_3hum_vs_1bot_5hum}, "
    f"Rank-Biserial = {r_items_3bots_3hum_vs_1bot_5hum}"
)

# 'Score': same one-sided test and effect size.
stat_score_3bots_3hum_vs_1bot_5hum, p_score_3bots_3hum_vs_1bot_5hum = mannwhitneyu(
    score_3bots_3hum,
    score_1bot_5hum,
    alternative='greater',
)
r_score_3bots_3hum_vs_1bot_5hum = rank_biserial(
    score_3bots_3hum,
    score_1bot_5hum,
    alternative='greater',
)
print(
    "Score 3 bots, 3 humans vs 1 bot, 5 humans: "
    f"U-statistic = {stat_score_3bots_3hum_vs_1bot_5hum}, "
    f"p-value = {p_score_3bots_3hum_vs_1bot_5hum}, "
    f"Rank-Biserial = {r_score_3bots_3hum_vs_1bot_5hum}"
)

Items 3 bots, 3 humans vs 1 bot, 5 humans: U-statistic = 16886.5, p-value = 9.302451612574043e-06, Rank-Biserial = 0.2744528301886793
Score 3 bots, 3 humans vs 1 bot, 5 humans: U-statistic = 17034.5, p-value = 4.3489311178726575e-06, Rank-Biserial = 0.28562264150943406
