# Mann-Whitney U test: Compare Human Performance

In [1]:
from scipy.stats import mannwhitneyu
import pandas as pd
from scripts.utils import rank_biserial

## Compare: Human performance across team compositions

In [2]:
# Load one results table per team composition:
# 6 humans, 1 bot + 5 humans, and 3 bots + 3 humans.
df_6hum = pd.read_csv('../csv_data/nonsem_6hum.csv')
df_1bot_5hum = pd.read_csv('../csv_data/nonsem_1bot_5hum.csv')
df_3bots_3hum = pd.read_csv('../csv_data/nonsem_3bots_3hum.csv')

# 6-human condition: the columns are taken as-is (no isRobot filter is applied here).
items_found_6hum = df_6hum['ItemsFound']
score_6hum = df_6hum['Score']

# 1 bot + 5 humans: keep only the rows where isRobot == 0.
items_found_5hum = df_1bot_5hum.loc[df_1bot_5hum['isRobot'] == 0, 'ItemsFound']
score_5hum = df_1bot_5hum.loc[df_1bot_5hum['isRobot'] == 0, 'Score']

# 3 bots + 3 humans: keep only the rows where isRobot == 0.
items_found_3hum = df_3bots_3hum.loc[df_3bots_3hum['isRobot'] == 0, 'ItemsFound']
score_3hum = df_3bots_3hum.loc[df_3bots_3hum['isRobot'] == 0, 'Score']

#### Compare: 3 bots, 3 humans vs 6 humans

In [3]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 3-human sample as x and the
# 6-human sample as y; alternative='greater' tests whether x tends to be larger than y.
stat_items_3hum_vs_6hum, p_items_3hum_vs_6hum = mannwhitneyu(
    x=items_found_3hum,
    y=items_found_6hum,
    alternative='greater',
)
# Rank-biserial value from the scripts.utils helper, called with the same sample order.
r_items_3hum_vs_6hum = rank_biserial(
    items_found_3hum,
    items_found_6hum,
    alternative='greater',
)
print(f"Items 3 humans vs 6 humans: U-statistic = {stat_items_3hum_vs_6hum}, p-value = {p_items_3hum_vs_6hum}, Rank-Biserial = {r_items_3hum_vs_6hum}")

# 'Score': same one-sided test and rank-biserial call, 3-human sample first.
stat_score_3hum_vs_6hum, p_score_3hum_vs_6hum = mannwhitneyu(
    x=score_3hum,
    y=score_6hum,
    alternative='greater',
)
r_score_3hum_vs_6hum = rank_biserial(
    score_3hum,
    score_6hum,
    alternative='greater',
)
print(f"Score 3 humans vs 6 humans: U-statistic = {stat_score_3hum_vs_6hum}, p-value = {p_score_3hum_vs_6hum}, Rank-Biserial = {r_score_3hum_vs_6hum}")

Items 3 humans vs 6 humans: U-statistic = 24852.5, p-value = 0.003222376146221037, Rank-Biserial = 0.1714036576168929
Score 3 humans vs 6 humans: U-statistic = 25482.0, p-value = 0.0007013835340530483, Rank-Biserial = 0.20107466063348411


In [5]:
# Reverse direction (6hum > 3hum) for 'ItemsFound': the 6-human sample is now x and the
# 3-human sample is y in the one-sided (alternative='greater') Mann-Whitney U test.
stat_items_6hum_vs_3hum, p_items_6hum_vs_3hum = mannwhitneyu(
    x=items_found_6hum,
    y=items_found_3hum,
    alternative='greater',
)
# Rank-biserial value from the scripts.utils helper, called with the same sample order.
r_items_6hum_vs_3hum = rank_biserial(
    items_found_6hum,
    items_found_3hum,
    alternative='greater',
)
print(f"Items 6 humans vs 3 humans (6hum > 3hum): U-statistic = {stat_items_6hum_vs_3hum}, p-value = {p_items_6hum_vs_3hum}, Rank-Biserial = {r_items_6hum_vs_3hum}")

# Reverse direction (6hum > 3hum) for 'Score'.
stat_score_6hum_vs_3hum, p_score_6hum_vs_3hum = mannwhitneyu(
    x=score_6hum,
    y=score_3hum,
    alternative='greater',
)
r_score_6hum_vs_3hum = rank_biserial(
    score_6hum,
    score_3hum,
    alternative='greater',
)
print(f"Score 6 humans vs 3 humans (6hum > 3hum): U-statistic = {stat_score_6hum_vs_3hum}, p-value = {p_score_6hum_vs_3hum}, Rank-Biserial = {r_score_6hum_vs_3hum}")


Items 6 humans vs 3 humans (6hum > 3hum): U-statistic = 17579.5, p-value = 0.9967849272104046, Rank-Biserial = -0.1714036576168929
Score 6 humans vs 3 humans (6hum > 3hum): U-statistic = 16950.0, p-value = 0.9993004336426138, Rank-Biserial = -0.20107466063348411


#### Compare: 1 bot, 5 humans vs 6 humans

In [6]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 5-human sample as x and the
# 6-human sample as y; alternative='greater' tests whether x tends to be larger than y.
stat_items_5hum_vs_6hum, p_items_5hum_vs_6hum = mannwhitneyu(
    x=items_found_5hum,
    y=items_found_6hum,
    alternative='greater',
)
# Rank-biserial value from the scripts.utils helper, called with the same sample order.
r_items_5hum_vs_6hum = rank_biserial(
    items_found_5hum,
    items_found_6hum,
    alternative='greater',
)
print(f"Items 5 humans vs 6 humans: U-statistic = {stat_items_5hum_vs_6hum}, p-value = {p_items_5hum_vs_6hum}, Rank-Biserial = {r_items_5hum_vs_6hum}")

# 'Score': same one-sided test and rank-biserial call, 5-human sample first.
stat_score_5hum_vs_6hum, p_score_5hum_vs_6hum = mannwhitneyu(
    x=score_5hum,
    y=score_6hum,
    alternative='greater',
)
r_score_5hum_vs_6hum = rank_biserial(
    score_5hum,
    score_6hum,
    alternative='greater',
)
print(f"Score 5 humans vs 6 humans: U-statistic = {stat_score_5hum_vs_6hum}, p-value = {p_score_5hum_vs_6hum}, Rank-Biserial = {r_score_5hum_vs_6hum}")

Items 5 humans vs 6 humans: U-statistic = 21269.5, p-value = 0.4841789174358023, Rank-Biserial = 0.0025216817496229726
Score 5 humans vs 6 humans: U-statistic = 21406.5, p-value = 0.44348487126613423, Rank-Biserial = 0.008979072398189958


In [8]:
# Reverse direction (6hum > 5hum) for 'ItemsFound': the 6-human sample is now x and the
# 5-human sample is y in the one-sided (alternative='greater') Mann-Whitney U test.
stat_items_6hum_vs_5hum, p_items_6hum_vs_5hum = mannwhitneyu(
    x=items_found_6hum,
    y=items_found_5hum,
    alternative='greater',
)
# Rank-biserial value from the scripts.utils helper, called with the same sample order.
r_items_6hum_vs_5hum = rank_biserial(
    items_found_6hum,
    items_found_5hum,
    alternative='greater',
)
print(f"Items 6 humans vs 5 humans (6hum > 5hum): U-statistic = {stat_items_6hum_vs_5hum}, p-value = {p_items_6hum_vs_5hum}, Rank-Biserial = {r_items_6hum_vs_5hum}")

# Reverse direction (6hum > 5hum) for 'Score'.
stat_score_6hum_vs_5hum, p_score_6hum_vs_5hum = mannwhitneyu(
    x=score_6hum,
    y=score_5hum,
    alternative='greater',
)
r_score_6hum_vs_5hum = rank_biserial(
    score_6hum,
    score_5hum,
    alternative='greater',
)
print(f"Score 6 humans vs 5 humans (6hum > 5hum): U-statistic = {stat_score_6hum_vs_5hum}, p-value = {p_score_6hum_vs_5hum}, Rank-Biserial = {r_score_6hum_vs_5hum}")


Items 6 humans vs 5 humans (6hum > 5hum): U-statistic = 21162.5, p-value = 0.5161194325581966, Rank-Biserial = -0.0025216817496229726
Score 6 humans vs 5 humans (6hum > 5hum): U-statistic = 21025.5, p-value = 0.5568105632552806, Rank-Biserial = -0.008979072398190069


#### Compare: 3 bots, 3 humans vs 1 bot, 5 humans

In [10]:
# 'ItemsFound': one-sided Mann-Whitney U test with the 3-human sample as x and the
# 5-human sample as y; alternative='greater' tests whether x tends to be larger than y.
stat_items_3hum_vs_5hum, p_items_3hum_vs_5hum = mannwhitneyu(
    x=items_found_3hum,
    y=items_found_5hum,
    alternative='greater',
)
# Rank-biserial value from the scripts.utils helper, called with the same sample order.
r_items_3hum_vs_5hum = rank_biserial(
    items_found_3hum,
    items_found_5hum,
    alternative='greater',
)
print(f"Items 3 humans vs 5 humans: U-statistic = {stat_items_3hum_vs_5hum}, p-value = {p_items_3hum_vs_5hum}, Rank-Biserial = {r_items_3hum_vs_5hum}")

# 'Score': same one-sided test and rank-biserial call, 3-human sample first.
stat_score_3hum_vs_5hum, p_score_3hum_vs_5hum = mannwhitneyu(
    x=score_3hum,
    y=score_5hum,
    alternative='greater',
)
r_score_3hum_vs_5hum = rank_biserial(
    score_3hum,
    score_5hum,
    alternative='greater',
)
print(f"Score 3 humans vs 5 humans: U-statistic = {stat_score_3hum_vs_5hum}, p-value = {p_score_3hum_vs_5hum}, Rank-Biserial = {r_score_3hum_vs_5hum}")

Items 3 humans vs 5 humans: U-statistic = 6301.5, p-value = 0.01892902210684083, Rank-Biserial = 0.16521819526627213
Score 3 humans vs 5 humans: U-statistic = 6382.0, p-value = 0.011818739015513058, Rank-Biserial = 0.1801035502958579


In [12]:
# Reverse direction (5hum > 3hum) for 'ItemsFound': the 5-human sample is now x and the
# 3-human sample is y in the one-sided (alternative='greater') Mann-Whitney U test.
stat_items_5hum_vs_3hum, p_items_5hum_vs_3hum = mannwhitneyu(
    x=items_found_5hum,
    y=items_found_3hum,
    alternative='greater',
)
# Rank-biserial value from the scripts.utils helper, called with the same sample order.
r_items_5hum_vs_3hum = rank_biserial(
    items_found_5hum,
    items_found_3hum,
    alternative='greater',
)
print(f"Items 5 humans vs 3 humans (5hum > 3hum): U-statistic = {stat_items_5hum_vs_3hum}, p-value = {p_items_5hum_vs_3hum}, Rank-Biserial = {r_items_5hum_vs_3hum}")

# Reverse direction (5hum > 3hum) for 'Score'.
stat_score_5hum_vs_3hum, p_score_5hum_vs_3hum = mannwhitneyu(
    x=score_5hum,
    y=score_3hum,
    alternative='greater',
)
r_score_5hum_vs_3hum = rank_biserial(
    score_5hum,
    score_3hum,
    alternative='greater',
)
print(f"Score 5 humans vs 3 humans (5hum > 3hum): U-statistic = {stat_score_5hum_vs_3hum}, p-value = {p_score_5hum_vs_3hum}, Rank-Biserial = {r_score_5hum_vs_3hum}")


Items 5 humans vs 3 humans (5hum > 3hum): U-statistic = 4514.5, p-value = 0.981178157627451, Rank-Biserial = -0.16521819526627224
Score 5 humans vs 3 humans (5hum > 3hum): U-statistic = 4434.0, p-value = 0.988252725707378, Rank-Biserial = -0.180103550295858
