In [None]:
import json
import jsonlines
import os, sys

sys.path.append("../..")
from src.config_utils import get_benchmark_config
from src.wiki_helpers import get_label_from_url

In [None]:
# Root directory that each benchmark's "dataset_path" entry is joined onto below.
dataset_path = "/Datasets"
# Benchmark descriptions from the project config; the cells below iterate over
# it and read the "name" and "dataset_path" keys of every entry.
benchmark_config = get_benchmark_config()

In [None]:
# Display the loaded configuration for a quick visual check.
benchmark_config

# Get human entities with properties

In [None]:
# Location-type properties that are extracted.
# Currently disabled alternatives: countryOfOrigin, country, locatedIn.
location_identifiers = [
    "location",
    "countryOfCitizenship",
    "placeOfBirth",
]

# Time-type properties that are extracted.
# Currently disabled alternatives: inception, startTime, pointInTime.
time_identifiers = [
    "dateOfBirth",
    "dateOfDeath",
]

# Every property that is looked up for an entity: the general ones first,
# followed by the location and time properties defined above.
property_labels = [
    "itemLabel",
    "gender",
    "instanceOf",
    "coordinates",
    "occupation",
    "ethnicity",
    "religion",
    *location_identifiers,
    *time_identifiers,
]

In [None]:
# Benchmark names that are filtered out when benchmarks are selected further down.
exclude = ["SIQA", "GSM8K", "COPA", "MMLU"]

In [None]:
def create_property_dict(results_dict, qid_to_label_map=None):
    """Build a flat ``{property: label}`` record for one human entity.

    Args:
        results_dict: Single-item mapping ``{qid: properties}``; every property
            is itself a mapping that stores its raw value under ``"value"``.
        qid_to_label_map: Object handed to ``get_label_from_url`` on every
            lookup and returned in whatever form that helper hands back
            (presumably a label cache -- confirm against the helper).

    Returns:
        ``(record, qid_to_label_map)``. ``record`` is ``{}`` when the entry is
        empty, has no ``instanceOf`` or is neither a human nor a fictional
        human. Otherwise it holds ``"qid"`` plus one label for each name in
        ``property_labels`` that the entity provides.
    """
    human_classes = (
        "http://www.wikidata.org/entity/Q5",  # human
        "http://www.wikidata.org/entity/Q15632617",  # fictional human
    )
    if not results_dict:
        # An empty entry has no qid to report; skip it instead of raising.
        return {}, qid_to_label_map

    qid, properties = next(iter(results_dict.items()))

    # The human check does not depend on the property being copied, so it is
    # evaluated once up front rather than once per property.
    instance_of = properties.get("instanceOf")
    if instance_of is None or instance_of["value"] not in human_classes:
        return {}, qid_to_label_map

    new_dict = {"qid": qid}
    for key in property_labels:
        if key in properties:
            label, qid_to_label_map = get_label_from_url(properties[key]["value"], qid_to_label_map)
            new_dict[key] = label
    return new_dict, qid_to_label_map


In [None]:
def _read_json_lines(path):
    """Parse every non-blank line of ``path`` as one JSON document."""
    with open(path, 'r') as f:
        return [json.loads(line) for line in f if line.strip()]


def get_attribute_dicts(wiki_metadata_path, qid_to_label_map):
    """Load a metadata file and keep the records of human entities.

    The file is read line by line, one JSON document per line. If it cannot be
    read or parsed, ``metadata_lists/wiki_metadata.txt`` located in the same
    directory is tried instead. Entries from a file that failed half-way are
    discarded, so the result never mixes two sources.

    Args:
        wiki_metadata_path: Path of the primary metadata file.
        qid_to_label_map: Passed through ``create_property_dict`` for every
            entry and returned in its final state.

    Returns:
        ``(property_dict, qid_to_label_map)`` where ``property_dict`` is the
        list of non-empty records produced by ``create_property_dict``. When
        neither file is usable a message is printed and the list is empty.
    """
    # The fallback lives next to the primary file, so the directory part
    # (dirname) is needed here, not the file name (basename).
    fallback_path = os.path.join(os.path.dirname(wiki_metadata_path), "metadata_lists", "wiki_metadata.txt")

    metadata_list = []
    for candidate in (wiki_metadata_path, fallback_path):
        try:
            metadata_list = _read_json_lines(candidate)
            break
        except (OSError, ValueError):
            # Missing/unreadable file or malformed JSON: try the next candidate.
            continue
    else:
        # Only reached when no candidate could be read.
        print("Could not read metadata file")

    property_dict = []
    for item in metadata_list:
        out_dict, qid_to_label_map = create_property_dict(item, qid_to_label_map)
        if out_dict:
            property_dict.append(out_dict)
    return property_dict, qid_to_label_map


In [None]:
# Handed to every get_attribute_dicts call and replaced by its return value,
# so whatever it accumulates is shared across all benchmarks.
qid_to_label_map = {}

for b in benchmark_config:
    # NOTE(review): this filter used to read `b["name"] in benchs`, but `benchs`
    # is only assigned in the plotting cell at the end of the notebook, so a
    # fresh kernel failed here with a NameError. Filtering on `exclude`
    # (defined above) is the closest in-scope equivalent -- confirm the intent.
    if b["name"] in exclude:
        continue
    print("Processing", b["name"])
    benchmark_dir = os.path.join(dataset_path, b["dataset_path"])
    input_path = os.path.join(benchmark_dir, "wikidata_metadata.txt")
    output_path = os.path.join(benchmark_dir, "wikidata_metadata_humans.json")

    property_dict, qid_to_label_map = get_attribute_dicts(input_path, qid_to_label_map)

    # The whole list is written as ONE JSON line; the loading cell further
    # down iterates over the list found on each line.
    with jsonlines.open(output_path, 'w') as f:
        f.write(property_dict)

# Gather & plot occupation by gender

In [None]:
# Filled by the next cell: benchmark name -> list of entity records read from disk.
dict_of_dicts = {}

In [None]:
# Load the human-entity records of every benchmark into dict_of_dicts.
for b in benchmark_config:
    print("Processing", b["name"])
    input_path = os.path.join(dataset_path, b["dataset_path"], "wikidata_metadata_humans.json")
    entities = []
    if os.path.isfile(input_path):
        with open(input_path, 'r') as f:
            for line in f:
                if line.strip():
                    # Every line is one JSON document whose items are the records.
                    entities.extend(json.loads(line))
    else:
        # Keep the benchmark with an empty list so later cells that iterate
        # over dict_of_dicts still see every configured benchmark.
        print("No human-entity file found for", b["name"])
    dict_of_dicts[b["name"]] = entities

In [None]:
import pandas as pd

In [None]:
# Collect one record per entity first and build the frame in a single step;
# enlarging a DataFrame cell by cell with .loc gets very slow as it grows.
lead_columns = ["benchmark", "qid", "gender", "occupation"]
records = [
    {"benchmark": benchmark_name, **entity}
    for benchmark_name, entities in dict_of_dicts.items()
    for entity in entities
]
gender_occupation_df = pd.DataFrame(records)

# Keep the four lead columns in front (and present even when there are no
# records); any further property columns follow in order of first appearance.
extra_columns = [col for col in gender_occupation_df.columns if col not in lead_columns]
gender_occupation_df = gender_occupation_df.reindex(columns=lead_columns + extra_columns)

In [None]:
# Display the assembled table.
gender_occupation_df

# Plots

In [None]:
# Distinct benchmark names that occur in the table.
all_benchmarks = gender_occupation_df["benchmark"].unique()


In [None]:
# Select the benchmarks whose summed (benchmark, gender, occupation) value
# counts exceed 300; the sum is printed for every benchmark.
counted_columns = ["benchmark", "gender", "occupation"]
occ_benchmarks = []
for b in all_benchmarks:
    rows_of_b = gender_occupation_df["benchmark"] == b
    combo_counts = gender_occupation_df.loc[rows_of_b, counted_columns].value_counts()
    occ_sum = combo_counts.sum()
    print(b, occ_sum)
    if occ_sum > 300:
        occ_benchmarks.append(b)

In [None]:
# Narrow the table down to the selected benchmarks and the three columns
# used for the occupation plots, then preview the first rows.
in_selected = gender_occupation_df["benchmark"].isin(occ_benchmarks)
occ_benchmarks_df = gender_occupation_df.loc[in_selected, ["benchmark", "gender", "occupation"]]
occ_benchmarks_df.head()

In [None]:
# Number of benchmarks that passed the occupation-count threshold.
num_occ_benchmarks = len(occ_benchmarks)
num_occ_benchmarks

In [None]:
# %pip (unlike "! pip") installs into the environment of the running kernel.
%pip install colorcet

In [None]:
import colorcet as cc

def map_list_to_color(lst, hex=False, offset=0):
    """Assign each item of ``lst`` a colour from the Glasbey palette.

    Args:
        lst: Sized iterable of hashable items, e.g. benchmark names.
        hex: When true the colours are taken from ``cc.glasbey``, otherwise
            from ``cc.cm.glasbey.colors`` (presumably hex strings vs. RGB
            values -- see the colorcet documentation).
        offset: Number of leading palette entries to skip before colours are
            handed out. Defaults to 0, i.e. start at the first colour.

    Returns:
        dict mapping every item to its colour, in iteration order of ``lst``.
        ``zip`` stops at the shorter input, so items beyond the end of the
        palette get no entry.
    """
    palette = cc.glasbey if hex else cc.cm.glasbey.colors
    colors = palette[offset : len(lst) + offset]
    return dict(zip(lst, colors))

In [None]:
# One colour per loaded benchmark, in both palette variants offered by
# map_list_to_color (default and hex=True); the default variant is displayed.
benchmarks_color_list = map_list_to_color(dict_of_dicts.keys())
benchmarks_color_list_hex = map_list_to_color(dict_of_dicts.keys(), hex=True)
benchmarks_color_list

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show which benchmarks were selected for the occupation plots.
occ_benchmarks

In [None]:
# Fixed benchmark -> hex colour assignment. It replaces the mapping computed
# from the palette above (presumably to keep the colours stable regardless of
# which benchmarks were loaded -- confirm).
benchmarks_color_list_hex = {
    "HotpotQA": "#d60000",
    "BoolQ": "#8c3bff",
    "StrategyQA": "#018700",
    "PIQA": "#00acc6",
    "SQuAD": "#97ff00",
    "WinoGrande": "#ff7ed1",
    "DROP": "#6b004f",
    "TriviaQA": "#ffa52f",
    "WebQuestions": "#573b00",
    "NaturalQuestions": "#005659",
    "COPA": "#0000dd",
    "CommonsenseQA": "#00fdcf",
    "SIQA": "#a17569",
    "HellaSwag": "#bcb6ff",
    "TruthfulQA": "#95b577",
    "COQA": "#bf03b8",
    "OpenBookQA": "#645474",
    "RACE": "#790000",
    "ScienceQA": "#0774d8",
    "MMLU": "#fdf490",
    "GPQA": "#004b00",
    "ARC": "#8e7900",
    "GSM8K": "#ff7266",
}

In [None]:
# Number of rows per benchmark in the filtered occupation table.
occ_benchmarks_df["benchmark"].value_counts()

In [None]:
# Benchmarks that are present in the filtered table.
occ_benchmarks_df["benchmark"].unique()

In [None]:
# Grid of bar charts: one row per benchmark, one column per gender, each
# showing the most frequent occupations as a percentage of that group's rows.
top_n = 10

benchs = [b for b in occ_benchmarks_df["benchmark"].unique() if b not in exclude]
genders = ["female", "male"]
ncols = len(genders)
nrows = len(benchs)
if nrows == 0:
    raise ValueError("No benchmark left to plot after filtering")

# Shortened labels for long occupation names so they fit next to the bars.
name_updates = {"association football player": "assoc. football pl.",
                "American football player": "American football pl.",
                "racing automobile driver": "racing autom. driver",
                "beach volleyball player": "beach volleyb. pl.",
                "beauty pageant contestant": "beauty pageant cont."}

# squeeze=False keeps `axs` two-dimensional even when only one benchmark is
# plotted, so axs[r, c] is always valid.
fig, axs = plt.subplots(ncols=ncols, nrows=nrows, sharex=True, squeeze=False, figsize=(ncols*3.5, nrows*2.3))
fig.suptitle(f'Top-{top_n} occupations by gender [%]', y=.995, fontsize=15)

for r, bench in enumerate(benchs):
    for c, gender in enumerate(genders):
        ax = axs[r, c]
        selected = (occ_benchmarks_df["benchmark"] == bench) & (occ_benchmarks_df["gender"] == gender)
        group = occ_benchmarks_df.loc[selected, ["occupation"]]
        # Count once and reuse for both the bar lengths and the labels.
        counts = group.value_counts()
        # Percentages are relative to all rows of the group, including rows
        # for which no occupation was counted.
        percentages = counts.values[:top_n] / len(group) * 100
        # The index of a DataFrame value_counts holds 1-tuples; unpack them.
        labels = [name_updates.get(key[0], key[0]) for key in counts.index[:top_n]]
        if labels:
            sns.barplot(y=labels, x=percentages, ax=ax, color=benchmarks_color_list_hex[bench])
        ax.set_title(gender, fontsize=14)
        ax.set_xticks([0, 5, 10, 15])
        ax.tick_params(axis='y', labelsize=14)
        ax.tick_params(axis='x', labelsize=14)
        if c == ncols - 1:
            # Name the benchmark on the right-hand side of each row.
            ax.yaxis.set_label_position("right")
            ax.set_ylabel(bench, fontsize=14)
        ax.set_xlabel("")
fig.tight_layout()

# Create the target folder on demand so savefig cannot fail on a missing directory.
output_dir = os.path.join(dataset_path, 'images', 'occ_by_gender')
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, 'occupations_combined_appendix.pdf'), format='pdf', dpi=400, bbox_inches='tight')