# Mann-Whitney U test: Compare Human Performance

In [1]:
from scipy.stats import mannwhitneyu, shapiro
import pandas as pd
from scripts.utils import rank_biserial

## Compare: Human performance across team compositions (6 humans, 1 bot + 5 humans, 3 bots + 3 humans)

In [2]:
# Load the per-player performance tables, one CSV per team composition.
df_6hum, df_1bot_5hum, df_3bots_3hum = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/performance/nonsem_6hum.csv',
        '../../data/performance/nonsem_1bot_5hum.csv',
        '../../data/performance/nonsem_3bots_3hum.csv',
    )
)

# Keep only the human rows (isRobot == 0) of the mixed-team tables.
# The 6-human table is used as-is, without an isRobot filter.
humans_1bot_5hum = df_1bot_5hum[df_1bot_5hum['isRobot'] == 0]
humans_3bots_3hum = df_3bots_3hum[df_3bots_3hum['isRobot'] == 0]

# 'ItemsFound' per human group.
items_found_6hum = df_6hum['ItemsFound']
items_found_5hum = humans_1bot_5hum['ItemsFound']
items_found_3hum = humans_3bots_3hum['ItemsFound']

# 'Score' per human group.
score_6hum = df_6hum['Score']
score_5hum = humans_1bot_5hum['Score']
score_3hum = humans_3bots_3hum['Score']

In [3]:
# Shapiro-Wilk normality check on ItemsFound, one result per human group.
# Index [1] of each result is the p-value reported below.
(
    shapiro_items_found_6hum,
    shapiro_items_found_5hum,
    shapiro_items_found_3hum,
) = (
    shapiro(sample)
    for sample in (items_found_6hum, items_found_5hum, items_found_3hum)
)

print(f"P-value of Shapiro-Wilk test for 6hum (ItemsFound): {shapiro_items_found_6hum[1]}")
print(f"P-value of Shapiro-Wilk test for 5hum (ItemsFound): {shapiro_items_found_5hum[1]}")
print(f"P-value of Shapiro-Wilk test for 3hum (ItemsFound): {shapiro_items_found_3hum[1]}")
print("\n")

# Same normality check on Score.
(
    shapiro_score_6hum,
    shapiro_score_5hum,
    shapiro_score_3hum,
) = (
    shapiro(sample)
    for sample in (score_6hum, score_5hum, score_3hum)
)

print(f"P-value of Shapiro-Wilk test for 6hum (Scores): {shapiro_score_6hum[1]}")
print(f"P-value of Shapiro-Wilk test for 5hum (Scores): {shapiro_score_5hum[1]}")
print(f"P-value of Shapiro-Wilk test for 3hum (Scores): {shapiro_score_3hum[1]}")

P-value of Shapiro-Wilk test for 6hum (ItemsFound): 2.064731201263366e-12
P-value of Shapiro-Wilk test for 5hum (ItemsFound): 0.0002073894558634914
P-value of Shapiro-Wilk test for 3hum (ItemsFound): 0.00010280139774210438


P-value of Shapiro-Wilk test for 6hum (Scores): 3.884644752840973e-36
P-value of Shapiro-Wilk test for 5hum (Scores): 5.841863070132123e-19
P-value of Shapiro-Wilk test for 3hum (Scores): 2.7451437063780367e-15


#### Compare: 3 bots, 3 humans vs 6 humans

In [4]:
# One-sided comparison: humans from the 3-bots/3-humans teams (first sample)
# against humans from the all-human teams (second sample). With
# alternative='greater', SciPy tests whether the distribution underlying the
# first sample is stochastically greater than that of the second.
one_sided = {'alternative': 'greater'}

# ItemsFound: U statistic, p-value and effect size.
# NOTE(review): rank_biserial comes from scripts.utils; presumably it returns
# the rank-biserial correlation for the same pair of samples -- confirm there.
mwu_items = mannwhitneyu(items_found_3hum, items_found_6hum, **one_sided)
stat_items_3hum_vs_6hum = mwu_items[0]
p_items_3hum_vs_6hum = mwu_items[1]
r_items_3hum_vs_6hum = rank_biserial(items_found_3hum, items_found_6hum, **one_sided)
print(f"Items 3 humans vs 6 humans: U-statistic = {stat_items_3hum_vs_6hum}, p-value = {p_items_3hum_vs_6hum}, Rank-Biserial = {r_items_3hum_vs_6hum}")

# Score: same procedure on the 'Score' samples.
mwu_score = mannwhitneyu(score_3hum, score_6hum, **one_sided)
stat_score_3hum_vs_6hum = mwu_score[0]
p_score_3hum_vs_6hum = mwu_score[1]
r_score_3hum_vs_6hum = rank_biserial(score_3hum, score_6hum, **one_sided)
print(f"Score 3 humans vs 6 humans: U-statistic = {stat_score_3hum_vs_6hum}, p-value = {p_score_3hum_vs_6hum}, Rank-Biserial = {r_score_3hum_vs_6hum}")

Items 3 humans vs 6 humans: U-statistic = 24852.5, p-value = 0.003222376146221037, Rank-Biserial = 0.1714036576168929
Score 3 humans vs 6 humans: U-statistic = 25482.0, p-value = 0.0007013835340530483, Rank-Biserial = 0.20107466063348411


#### Compare: 1 bot, 5 humans vs 6 humans

In [5]:
# One-sided comparison: humans from the 1-bot/5-humans teams (first sample)
# against humans from the all-human teams (second sample). With
# alternative='greater', SciPy tests whether the distribution underlying the
# first sample is stochastically greater than that of the second.
one_sided = {'alternative': 'greater'}

# ItemsFound: U statistic, p-value and effect size.
# NOTE(review): rank_biserial comes from scripts.utils; presumably it returns
# the rank-biserial correlation for the same pair of samples -- confirm there.
mwu_items = mannwhitneyu(items_found_5hum, items_found_6hum, **one_sided)
stat_items_5hum_vs_6hum = mwu_items[0]
p_items_5hum_vs_6hum = mwu_items[1]
r_items_5hum_vs_6hum = rank_biserial(items_found_5hum, items_found_6hum, **one_sided)
print(f"Items 5 humans vs 6 humans: U-statistic = {stat_items_5hum_vs_6hum}, p-value = {p_items_5hum_vs_6hum}, Rank-Biserial = {r_items_5hum_vs_6hum}")

# Score: same procedure on the 'Score' samples.
mwu_score = mannwhitneyu(score_5hum, score_6hum, **one_sided)
stat_score_5hum_vs_6hum = mwu_score[0]
p_score_5hum_vs_6hum = mwu_score[1]
r_score_5hum_vs_6hum = rank_biserial(score_5hum, score_6hum, **one_sided)
print(f"Score 5 humans vs 6 humans: U-statistic = {stat_score_5hum_vs_6hum}, p-value = {p_score_5hum_vs_6hum}, Rank-Biserial = {r_score_5hum_vs_6hum}")

Items 5 humans vs 6 humans: U-statistic = 21269.5, p-value = 0.4841789174358023, Rank-Biserial = 0.0025216817496229726
Score 5 humans vs 6 humans: U-statistic = 21406.5, p-value = 0.44348487126613423, Rank-Biserial = 0.008979072398189958


#### Compare: 3 bots, 3 humans vs 1 bot, 5 humans

In [6]:
# One-sided comparison: humans from the 3-bots/3-humans teams (first sample)
# against humans from the 1-bot/5-humans teams (second sample). With
# alternative='greater', SciPy tests whether the distribution underlying the
# first sample is stochastically greater than that of the second.
one_sided = {'alternative': 'greater'}

# ItemsFound: U statistic, p-value and effect size.
# NOTE(review): rank_biserial comes from scripts.utils; presumably it returns
# the rank-biserial correlation for the same pair of samples -- confirm there.
mwu_items = mannwhitneyu(items_found_3hum, items_found_5hum, **one_sided)
stat_items_3hum_vs_5hum = mwu_items[0]
p_items_3hum_vs_5hum = mwu_items[1]
r_items_3hum_vs_5hum = rank_biserial(items_found_3hum, items_found_5hum, **one_sided)
print(f"Items 3 humans vs 5 humans: U-statistic = {stat_items_3hum_vs_5hum}, p-value = {p_items_3hum_vs_5hum}, Rank-Biserial = {r_items_3hum_vs_5hum}")

# Score: same procedure on the 'Score' samples.
mwu_score = mannwhitneyu(score_3hum, score_5hum, **one_sided)
stat_score_3hum_vs_5hum = mwu_score[0]
p_score_3hum_vs_5hum = mwu_score[1]
r_score_3hum_vs_5hum = rank_biserial(score_3hum, score_5hum, **one_sided)
print(f"Score 3 humans vs 5 humans: U-statistic = {stat_score_3hum_vs_5hum}, p-value = {p_score_3hum_vs_5hum}, Rank-Biserial = {r_score_3hum_vs_5hum}")

Items 3 humans vs 5 humans: U-statistic = 6301.5, p-value = 0.01892902210684083, Rank-Biserial = 0.16521819526627213
Score 3 humans vs 5 humans: U-statistic = 6382.0, p-value = 0.011818739015513058, Rank-Biserial = 0.1801035502958579
