In [6]:
# Copy matchData.csv from the Drive path into the current working directory so
# that later cells can read it by its bare filename.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; the
# mount step is not visible in this part of the notebook — confirm.
!cp "/content/drive/MyDrive/Authorship Bias Paper/Data/matchData.csv" .

In [7]:
import pandas as pd
import numpy as np
from scipy import stats

In [8]:
# Load the match table and keep only the rows whose h-index is below the cutoff.
# NOTE(review): the rationale for the cutoff value 15 is not stated in this
# notebook — confirm it against the paper's methodology.
MAX_H_INDEX = 15

df = pd.read_csv("matchData.csv")
# .copy() detaches the filtered rows from the frame returned by read_csv.
# Later cells add and fill "Recall ..." columns on `df`; without the copy those
# writes go to a boolean-mask slice and pandas emits SettingWithCopyWarning.
df = df[df["h-index"] < MAX_H_INDEX].copy()
df.columns

Index(['Name', 'Google Scholar ID', 'Gender', 'Ethnicity', 'Affiliation',
       'Country', 'h-index', 'Citation count',
       'Co-authors’ names (Google Scholar)',
       'Co-authors’ genders (Google Scholar)',
       'Co-authors’ ethnicity (Google Scholar)', 'Co-authors’ names (OpenAI)',
       'Co-authors’ genders (OpenAI)', 'Co-authors’ ethnicity (OpenAI)',
       'Match Count 50%', 'Match Count 60%', 'Match Count 70%',
       'Match Count 80%'],
      dtype='object')

In [9]:
# Create one recall column per match threshold.
# The columns are initialised with the float 0.0 (not the integer 0) because the
# values assigned to them afterwards are fractions; starting from an integer
# column makes those element-wise writes a dtype mismatch, which pandas handles
# differently across versions (recent releases warn about it).
for threshold in (50, 60, 70, 80):
  df[f"Recall {threshold}%"] = 0.0

In [10]:
# Recall per threshold = "Match Count N%" / number of Google Scholar co-authors.
# The co-author count is the number of ", "-separated entries in the names
# column. It is computed once for the whole column instead of re-splitting the
# same string four times per row inside an iterrows() loop.
# A missing names value yields NaN recall for that row rather than raising.
coauthor_counts = df["Co-authors’ names (Google Scholar)"].str.split(", ").str.len()

for threshold in (50, 60, 70, 80):
  # Whole-column assignment also guarantees the recall column ends up float.
  df[f"Recall {threshold}%"] = df[f"Match Count {threshold}%"] / coauthor_counts

# Gender Conditional Demographic Parity

In [11]:
# Ratio of the mean 50%-threshold recall of Female rows to that of Male rows.
# A value below 1 means the Female group has the lower mean recall.
print("Gender CDP 50% : ")
female_recall_50 = df.loc[df["Gender"] == "Female", "Recall 50%"]
male_recall_50 = df.loc[df["Gender"] == "Male", "Recall 50%"]
female_recall_50.mean() / male_recall_50.mean()

Gender CDP 50% : 


0.6775546929781464

In [12]:
# Independent two-sample t-test (scipy defaults) on the 50%-threshold recall:
# Female rows versus Male rows.
female_recall_50 = df.loc[df["Gender"] == "Female", "Recall 50%"].to_list()
male_recall_50 = df.loc[df["Gender"] == "Male", "Recall 50%"].to_list()
stats.ttest_ind(female_recall_50, male_recall_50)

TtestResult(statistic=-1.895864724712629, pvalue=0.06108286351196174, df=93.0)

In [13]:
# Ratio of the mean 60%-threshold recall of Female rows to that of Male rows.
# A value below 1 means the Female group has the lower mean recall.
print("Gender CDP 60% : ")
female_recall_60 = df.loc[df["Gender"] == "Female", "Recall 60%"]
male_recall_60 = df.loc[df["Gender"] == "Male", "Recall 60%"]
female_recall_60.mean() / male_recall_60.mean()

Gender CDP 60% : 


0.4116920557968297

In [14]:
# Independent two-sample t-test (scipy defaults) on the 60%-threshold recall:
# Female rows versus Male rows.
female_recall_60 = df.loc[df["Gender"] == "Female", "Recall 60%"].to_list()
male_recall_60 = df.loc[df["Gender"] == "Male", "Recall 60%"].to_list()
stats.ttest_ind(female_recall_60, male_recall_60)

TtestResult(statistic=-2.4202189896433155, pvalue=0.017454439875374768, df=93.0)

In [15]:
# Ratio of the mean 70%-threshold recall of Female rows to that of Male rows.
# A value below 1 means the Female group has the lower mean recall.
print("Gender CDP 70% : ")
female_recall_70 = df.loc[df["Gender"] == "Female", "Recall 70%"]
male_recall_70 = df.loc[df["Gender"] == "Male", "Recall 70%"]
female_recall_70.mean() / male_recall_70.mean()

Gender CDP 70% : 


0.045808954205442294

In [16]:
# Independent two-sample t-test (scipy defaults) on the 70%-threshold recall:
# Female rows versus Male rows.
female_recall_70 = df.loc[df["Gender"] == "Female", "Recall 70%"].to_list()
male_recall_70 = df.loc[df["Gender"] == "Male", "Recall 70%"].to_list()
stats.ttest_ind(female_recall_70, male_recall_70)

TtestResult(statistic=-2.599486104614053, pvalue=0.010858405555783985, df=93.0)

In [17]:
# Ratio of the mean 80%-threshold recall of Female rows to that of Male rows.
# A value below 1 means the Female group has the lower mean recall.
print("Gender CDP 80% : ")
female_recall_80 = df.loc[df["Gender"] == "Female", "Recall 80%"]
male_recall_80 = df.loc[df["Gender"] == "Male", "Recall 80%"]
female_recall_80.mean() / male_recall_80.mean()

Gender CDP 80% : 


0.05758760354452312

In [18]:
# Independent two-sample t-test (scipy defaults) on the 80%-threshold recall:
# Female rows versus Male rows.
female_recall_80 = df.loc[df["Gender"] == "Female", "Recall 80%"].to_list()
male_recall_80 = df.loc[df["Gender"] == "Male", "Recall 80%"].to_list()
stats.ttest_ind(female_recall_80, male_recall_80)

TtestResult(statistic=-2.248878045380056, pvalue=0.026880980061748765, df=93.0)

# Ethnicity Conditional Demographic Parity

In [19]:
# Ratio of the mean 50%-threshold recall of non-White rows to that of White rows.
# The non-White group is "everything that is not exactly 'White'", so rows with
# a missing Ethnicity value also land in it.
print("Ethnicity CDP 50% : ")
non_white_recall_50 = df.loc[df["Ethnicity"] != "White", "Recall 50%"]
white_recall_50 = df.loc[df["Ethnicity"] == "White", "Recall 50%"]
non_white_recall_50.mean() / white_recall_50.mean()

Ethnicity CDP 50% : 


1.7239185941011077

In [20]:
# Independent two-sample t-test (scipy defaults) on the 50%-threshold recall:
# non-White rows (anything not exactly "White") versus White rows.
non_white_recall_50 = df.loc[df["Ethnicity"] != "White", "Recall 50%"].to_list()
white_recall_50 = df.loc[df["Ethnicity"] == "White", "Recall 50%"].to_list()
stats.ttest_ind(non_white_recall_50, white_recall_50)

TtestResult(statistic=2.5025854325575723, pvalue=0.014074753623473485, df=93.0)

In [21]:
# Ratio of the mean 60%-threshold recall of non-White rows to that of White rows.
# The non-White group is "everything that is not exactly 'White'", so rows with
# a missing Ethnicity value also land in it.
print("Ethnicity CDP 60% : ")
non_white_recall_60 = df.loc[df["Ethnicity"] != "White", "Recall 60%"]
white_recall_60 = df.loc[df["Ethnicity"] == "White", "Recall 60%"]
non_white_recall_60.mean() / white_recall_60.mean()

Ethnicity CDP 60% : 


2.1552579107140586

In [22]:
# Independent two-sample t-test (scipy defaults) on the 60%-threshold recall:
# non-White rows (anything not exactly "White") versus White rows.
non_white_recall_60 = df.loc[df["Ethnicity"] != "White", "Recall 60%"].to_list()
white_recall_60 = df.loc[df["Ethnicity"] == "White", "Recall 60%"].to_list()
stats.ttest_ind(non_white_recall_60, white_recall_60)

TtestResult(statistic=2.072333535007129, pvalue=0.04100196969384453, df=93.0)

In [23]:
# Ratio of the mean 70%-threshold recall of non-White rows to that of White rows.
# The non-White group is "everything that is not exactly 'White'", so rows with
# a missing Ethnicity value also land in it.
print("Ethnicity CDP 70% : ")
non_white_recall_70 = df.loc[df["Ethnicity"] != "White", "Recall 70%"]
white_recall_70 = df.loc[df["Ethnicity"] == "White", "Recall 70%"]
non_white_recall_70.mean() / white_recall_70.mean()

Ethnicity CDP 70% : 


3.928302696524398

In [24]:
# Independent two-sample t-test (scipy defaults) on the 70%-threshold recall:
# non-White rows (anything not exactly "White") versus White rows.
non_white_recall_70 = df.loc[df["Ethnicity"] != "White", "Recall 70%"].to_list()
white_recall_70 = df.loc[df["Ethnicity"] == "White", "Recall 70%"].to_list()
stats.ttest_ind(non_white_recall_70, white_recall_70)

TtestResult(statistic=1.7600449241071645, pvalue=0.08168817422886794, df=93.0)

In [25]:
# Ratio of the mean 80%-threshold recall of non-White rows to that of White rows.
# The non-White group is "everything that is not exactly 'White'", so rows with
# a missing Ethnicity value also land in it.
print("Ethnicity CDP 80% : ")
non_white_recall_80 = df.loc[df["Ethnicity"] != "White", "Recall 80%"]
white_recall_80 = df.loc[df["Ethnicity"] == "White", "Recall 80%"]
non_white_recall_80.mean() / white_recall_80.mean()

Ethnicity CDP 80% : 


8.484048175072388

In [26]:
# Independent two-sample t-test (scipy defaults) on the 80%-threshold recall:
# non-White rows (anything not exactly "White") versus White rows.
non_white_recall_80 = df.loc[df["Ethnicity"] != "White", "Recall 80%"].to_list()
white_recall_80 = df.loc[df["Ethnicity"] == "White", "Recall 80%"].to_list()
stats.ttest_ind(non_white_recall_80, white_recall_80)

TtestResult(statistic=1.9472103870197461, pvalue=0.05452626903135477, df=93.0)

# Language Conditional Demographic Parity

In [27]:
# Values of the "Country" column that are treated as English-speaking in the
# language comparison; every other value is treated as non-English.
# NOTE(review): matching is by exact string, so the spellings here must match
# the ones used in the data (e.g. "USA" vs "United States") — confirm.
englishCountries = [
    "USA",
    "UK",
    "Ireland",
    "Canada",
    "Australia",
]

In [28]:
# Language CDP at the 50% threshold: mean recall of rows whose Country is not in
# englishCountries divided by mean recall of rows whose Country is.
# Rows with a missing Country are not "in" the list, so they count as non-English.
is_english = df["Country"].isin(englishCountries)

non_english_recall_mean = df.loc[~is_english, "Recall 50%"].mean()
english_recall_mean = df.loc[is_english, "Recall 50%"].mean()

language_demographic_parity = non_english_recall_mean / english_recall_mean
print("Language CDP 50%:", language_demographic_parity)

Language CDP 50%: 1.1233339312735078


In [29]:
# Independent two-sample t-test (scipy defaults) on the 50%-threshold recall:
# rows outside englishCountries versus rows inside it.
is_english = df["Country"].isin(englishCountries)
non_english_recall_50 = df.loc[~is_english, "Recall 50%"].to_list()
english_recall_50 = df.loc[is_english, "Recall 50%"].to_list()
stats.ttest_ind(non_english_recall_50, english_recall_50)

TtestResult(statistic=0.665177651417477, pvalue=0.5075831418290142, df=93.0)

In [30]:
# Language CDP at the 60% threshold: mean recall of rows whose Country is not in
# englishCountries divided by mean recall of rows whose Country is.
# Rows with a missing Country are not "in" the list, so they count as non-English.
is_english = df["Country"].isin(englishCountries)

non_english_recall_mean = df.loc[~is_english, "Recall 60%"].mean()
english_recall_mean = df.loc[is_english, "Recall 60%"].mean()

language_demographic_parity = non_english_recall_mean / english_recall_mean
print("Language CDP 60%:", language_demographic_parity)

Language CDP 60%: 1.1271563470505859


In [31]:
# Independent two-sample t-test (scipy defaults) on the 60%-threshold recall:
# rows outside englishCountries versus rows inside it.
is_english = df["Country"].isin(englishCountries)
non_english_recall_60 = df.loc[~is_english, "Recall 60%"].to_list()
english_recall_60 = df.loc[is_english, "Recall 60%"].to_list()
stats.ttest_ind(non_english_recall_60, english_recall_60)

TtestResult(statistic=0.4318464073986611, pvalue=0.666852601520896, df=93.0)

In [32]:
# Language CDP at the 70% threshold: mean recall of rows whose Country is not in
# englishCountries divided by mean recall of rows whose Country is.
# Rows with a missing Country are not "in" the list, so they count as non-English.
is_english = df["Country"].isin(englishCountries)

non_english_recall_mean = df.loc[~is_english, "Recall 70%"].mean()
english_recall_mean = df.loc[is_english, "Recall 70%"].mean()

language_demographic_parity = non_english_recall_mean / english_recall_mean
print("Language CDP 70%:", language_demographic_parity)

Language CDP 70%: 1.5124472015340424


In [33]:
# Independent two-sample t-test (scipy defaults) on the 70%-threshold recall:
# rows outside englishCountries versus rows inside it.
is_english = df["Country"].isin(englishCountries)
non_english_recall_70 = df.loc[~is_english, "Recall 70%"].to_list()
english_recall_70 = df.loc[is_english, "Recall 70%"].to_list()
stats.ttest_ind(non_english_recall_70, english_recall_70)

TtestResult(statistic=0.8431718296006397, pvalue=0.40129576326841554, df=93.0)

In [34]:
# Language CDP at the 80% threshold: mean recall of rows whose Country is not in
# englishCountries divided by mean recall of rows whose Country is.
# Rows with a missing Country are not "in" the list, so they count as non-English.
is_english = df["Country"].isin(englishCountries)

non_english_recall_mean = df.loc[~is_english, "Recall 80%"].mean()
english_recall_mean = df.loc[is_english, "Recall 80%"].mean()

language_demographic_parity = non_english_recall_mean / english_recall_mean
print("Language CDP 80%:", language_demographic_parity)

Language CDP 80%: 1.8903101775884361


In [35]:
# Independent two-sample t-test (scipy defaults) on the 80%-threshold recall:
# rows outside englishCountries versus rows inside it.
is_english = df["Country"].isin(englishCountries)
non_english_recall_80 = df.loc[~is_english, "Recall 80%"].to_list()
english_recall_80 = df.loc[is_english, "Recall 80%"].to_list()
stats.ttest_ind(non_english_recall_80, english_recall_80)

TtestResult(statistic=1.1301884909674416, pvalue=0.26130323017559093, df=93.0)