In [1]:
import pandas as pd

### Get data in

In [2]:
# read data
df = pd.read_csv("../dataset_final.csv")

# select subset of data with labelled genders.
data_20 = df[(df['year'] == 2020) & (df['genders'] != "-1;-1")][['ratings', 'genders']]

# calculate average score
def calculate_average_score(row):
    ratings = [float(rating) for rating in row['ratings'].split(";")]
    return round(sum(ratings) / len(ratings), 1)

# parse author gender
def get_author_gender_at(pos):
    def parse_gender(row):
        return row['genders'].split(";")[pos]
    return parse_gender

data_20['average_score'] = data_20.apply(calculate_average_score, axis = 1)
data_20['first_author_gender'] = data_20.apply(get_author_gender_at(0), axis = 1)
data_20['last_author_gender'] = data_20.apply(get_author_gender_at(-1), axis = 1)

data_20.head(3)

Unnamed: 0,ratings,genders,average_score,first_author_gender,last_author_gender
3008,6;8;6,m;m,6.7,m,m
3009,8;8,m;m,8.0,m,m
3010,8;3;8;8,m;m,6.8,m,m


In [3]:
print("Total number of papers with first author is female", len(data_20[data_20['first_author_gender'] == "f"]))
print("Total number of papers with first author is male", len(data_20[data_20['first_author_gender'] == "m"]))

Total number of papers with first author is female 272
Total number of papers with first author is male 2145


In [4]:
print("Total number of papers with last author is female", len(data_20[data_20['last_author_gender'] == "f"]))
print("Total number of papers with last author is male", len(data_20[data_20['last_author_gender'] == "m"]))

Total number of papers with last author is female 247
Total number of papers with last author is male 2249


### Mann-Whitney U test

In [5]:
import scipy.stats as stats
import numpy as np

In [6]:
male_first = data_20[data_20['first_author_gender'] == "m"]['average_score']
female_first = data_20[data_20['first_author_gender'] == "f"]['average_score']

In [7]:
print("Average score of papers with first author is female", female_first.mean())
print("Average score of papers with first author is male", male_first.mean())

Average score of papers with first author is female 4.056985294117647
Average score of papers with first author is male 4.2129603729603735


In [8]:
stats.mannwhitneyu(female_first, male_first)

MannwhitneyuResult(statistic=276796.5, pvalue=0.08349912595964876)

In [9]:
male_last = data_20[data_20['last_author_gender'] == "m"]['average_score']
female_last = data_20[data_20['last_author_gender'] == "f"]['average_score']

In [10]:
print("Average score of papers with last author is female", female_last.mean())
print("Average score of papers with last author is male", male_last.mean())

Average score of papers with last author is female 4.178137651821863
Average score of papers with last author is male 4.198932859048466


In [11]:
stats.mannwhitneyu(female_last, male_last)

MannwhitneyuResult(statistic=275505.5, pvalue=0.41695158116074954)