In [1]:
# %% Imports
# In-built modules
import os
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pykeen
import torch
from tqdm import tqdm

# Internal Imports
from src.utils import set_base_path_based_on_host, HumanWikidata5M_pykeen
from src.bias_measurement.link_prediction_bias.utils import get_sensitive_and_target_relations

# Pandas display limits (column count, line width, row counts) used for
# every table rendered in this notebook.
for display_option, display_value in (
    ('display.max_columns', 1000),
    ('display.width', 750),
    ('display.min_rows', 20),
    ('display.max_rows', 20),
):
    pd.set_option(display_option, display_value)

# Base directory returned by the project helper; the result-file paths
# further down are joined onto it.
BASE_PATH_HOST = set_base_path_based_on_host()

# Make analysis for final subset of HumanWikidata5M

## Target relation: occupation

In [18]:
# Tab-separated occupation predictions on the HumanW5M-subset test set,
# including the sensitive-attribute columns used in the cells below.
occupation_preds_path_HW5M = os.path.join(
    BASE_PATH_HOST,
    'results/bias_measurement/link_prediction_bias/',
    'preds_df_occupation_HumanW5M_subset_testset_with_sensitive_attribute_info.tsv',
)
occupation_facts_HW5M_subset = pd.read_csv(occupation_preds_path_HW5M, sep='\t')

### Sensitive attribute: gender

In [4]:
# Frequency of each value in the gender column (P21); '-1' shows up as its
# own entry in the result.
tail_value_counts_gender = occupation_facts_HW5M_subset.loc[:, 'P21'].value_counts()
tail_value_counts_gender

-1          2743
Q6581097     871
Q6581072     149
Name: P21, dtype: int64

How many facts exist per occupation class in the testset?

In [7]:
# Group all test-set facts by occupation class label and count the
# entries per column within each class.
groupy_occupation_labels_all = occupation_facts_HW5M_subset.groupby(by='true_tail_class_label')
groupy_occupation_labels_all.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1534,1534,1534,1534,1534,1534,1534,1534,1534
1,1070,1070,1070,1070,1070,1070,1070,1070,1070
2,262,262,262,262,262,262,262,262,262
3,253,253,253,253,253,253,253,253,253
4,158,158,158,158,158,158,158,158,158
5,142,142,142,142,142,142,142,142,142
6,129,129,129,129,129,129,129,129,129
7,109,109,109,109,109,109,109,109,109
8,106,106,106,106,106,106,106,106,106


How many of each of these have gender information, i.e. a value in column P21 that is not -1?

In [10]:
# Keep only facts whose gender column (P21) is not the '-1' placeholder,
# then count them per occupation class.
valid_gender_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P21'].ne('-1')
]
groupy_occupation_labels_only_valid_gender = valid_gender_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_only_valid_gender.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,399,399,399,399,399,399,399,399,399
1,308,308,308,308,308,308,308,308,308
2,69,69,69,69,69,69,69,69,69
3,78,78,78,78,78,78,78,78,78
4,47,47,47,47,47,47,47,47,47
5,32,32,32,32,32,32,32,32,32
6,28,28,28,28,28,28,28,28,28
7,33,33,33,33,33,33,33,33,33
8,26,26,26,26,26,26,26,26,26


Of these, how many have gender male? - Q6581097

In [12]:
# Restrict to facts with gender (P21) equal to Q6581097 (male) and count
# them per occupation class.
male_gender_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P21'].eq('Q6581097')
]
groupy_occupation_labels_only_male_gender = male_gender_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_only_male_gender.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,336,336,336,336,336,336,336,336,336
1,274,274,274,274,274,274,274,274,274
2,58,58,58,58,58,58,58,58,58
3,72,72,72,72,72,72,72,72,72
4,34,34,34,34,34,34,34,34,34
5,31,31,31,31,31,31,31,31,31
6,21,21,21,21,21,21,21,21,21
7,19,19,19,19,19,19,19,19,19
8,26,26,26,26,26,26,26,26,26


Of these, how many have gender female? - Q6581072


In [13]:
# Restrict to facts with gender (P21) equal to Q6581072 (female) and count
# them per occupation class.
female_gender_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P21'].eq('Q6581072')
]
groupy_occupation_labels_only_female_gender = female_gender_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_only_female_gender.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,63,63,63,63,63,63,63,63,63
1,34,34,34,34,34,34,34,34,34
2,11,11,11,11,11,11,11,11,11
3,6,6,6,6,6,6,6,6,6
4,13,13,13,13,13,13,13,13,13
5,1,1,1,1,1,1,1,1,1
6,7,7,7,7,7,7,7,7,7
7,14,14,14,14,14,14,14,14,14


### Sensitive attribute: country of citizenship

In [16]:
# Frequency of each value in the country-of-citizenship column (P27);
# '-1' shows up as its own entry in the result.
tail_value_counts_citizenship = occupation_facts_HW5M_subset.loc[:, 'P27'].value_counts()
tail_value_counts_citizenship

-1         2721
Q30         357
Q142         80
Q145         74
Q183         44
Q16          41
Q17          27
Q668         22
Q159         19
Q15180       19
           ... 
Q227          1
Q844930       1
Q971          1
Q736          1
Q217          1
Q179876       1
Q974          1
Q211          1
Q954          1
Q42620        1
Name: P27, Length: 114, dtype: int64

How many of each of these have citizenship information, i.e. a value in column P27 that is not -1?

In [19]:
# Keep only facts whose citizenship column (P27) is not the '-1'
# placeholder, then count them per occupation class.
valid_citizenship_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P27'].ne('-1')
]
groupy_occupation_labels_only_valid_citizenship = valid_citizenship_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_only_valid_citizenship.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,437,437,437,437,437,437,437,437,437
1,278,278,278,278,278,278,278,278,278
2,68,68,68,68,68,68,68,68,68
3,76,76,76,76,76,76,76,76,76
4,44,44,44,44,44,44,44,44,44
5,34,34,34,34,34,34,34,34,34
6,38,38,38,38,38,38,38,38,38
7,29,29,29,29,29,29,29,29,29
8,38,38,38,38,38,38,38,38,38


Of these, how many have citizenship in USA? - Q30

In [20]:
# Restrict to facts with citizenship (P27) equal to Q30 (USA) and count
# them per occupation class.
US_citizenship_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P27'].eq('Q30')
]
groupy_occupation_labels_US_citizenship_only = US_citizenship_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_US_citizenship_only.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,146,146,146,146,146,146,146,146,146
1,104,104,104,104,104,104,104,104,104
2,9,9,9,9,9,9,9,9,9
3,50,50,50,50,50,50,50,50,50
4,18,18,18,18,18,18,18,18,18
5,2,2,2,2,2,2,2,2,2
6,2,2,2,2,2,2,2,2,2
7,17,17,17,17,17,17,17,17,17
8,9,9,9,9,9,9,9,9,9


Of these, how many have citizenship in France? - Q142

In [21]:
# Restrict to facts with citizenship (P27) equal to Q142 (France) and
# count them per occupation class.
France_citizenship_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P27'].eq('Q142')
]
groupy_occupation_labels_France_citizenship_only = France_citizenship_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_France_citizenship_only.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,33,33,33,33,33,33,33,33,33
1,11,11,11,11,11,11,11,11,11
2,13,13,13,13,13,13,13,13,13
3,1,1,1,1,1,1,1,1,1
4,5,5,5,5,5,5,5,5,5
5,1,1,1,1,1,1,1,1,1
6,8,8,8,8,8,8,8,8,8
7,3,3,3,3,3,3,3,3,3
8,5,5,5,5,5,5,5,5,5


Of these, how many have citizenship in United Kingdom? - Q145

In [23]:
# Restrict to facts with citizenship (P27) equal to Q145 (United Kingdom)
# and count them per occupation class.
UK_citizenship_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P27'].eq('Q145')
]
groupy_occupation_labels_UK_citizenship_only = UK_citizenship_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_UK_citizenship_only.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,41,41,41,41,41,41,41,41,41
1,9,9,9,9,9,9,9,9,9
2,4,4,4,4,4,4,4,4,4
3,1,1,1,1,1,1,1,1,1
4,2,2,2,2,2,2,2,2,2
5,10,10,10,10,10,10,10,10,10
6,2,2,2,2,2,2,2,2,2
7,3,3,3,3,3,3,3,3,3
8,2,2,2,2,2,2,2,2,2


Of these, how many have citizenship in Germany? - Q183

In [24]:
# Restrict to facts with citizenship (P27) equal to Q183 (Germany) and
# count them per occupation class.
Germany_citizenship_only = occupation_facts_HW5M_subset.loc[
    occupation_facts_HW5M_subset['P27'].eq('Q183')
]
groupy_occupation_labels_Germany_citizenship_only = Germany_citizenship_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_Germany_citizenship_only.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,P21,P27,P172,P140
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,19,19,19,19,19,19,19,19,19
1,12,12,12,12,12,12,12,12,12
2,4,4,4,4,4,4,4,4,4
3,2,2,2,2,2,2,2,2,2
4,4,4,4,4,4,4,4,4,4
5,1,1,1,1,1,1,1,1,1
8,2,2,2,2,2,2,2,2,2


# Make the same analysis for FB15k-237



## Target relation: occupation

In [34]:
# Tab-separated occupation (profession) predictions on the FB15k-237 test
# set, including the gender column used in the cells below.
occupation_preds_path_FB15k237 = os.path.join(
    BASE_PATH_HOST,
    'results/bias_measurement/link_prediction_bias/',
    'preds_df_occupation_FB15k-237_testset_with_sensitive_attribute_info.tsv',
)
occupation_facts_FB15k237 = pd.read_csv(occupation_preds_path_FB15k237, sep='\t')
occupation_facts_FB15k237

Unnamed: 0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,true_tail_class_label,/people/person/gender
0,/m/02ghq,/people/person/profession,profession,/m/01d_h8,Film Producer-GB,3,/m/05zppz
1,/m/01wkmgb,/people/person/profession,profession,/m/09jwl,Musician-GB,5,/m/05zppz
2,/m/0mm1q,/people/person/profession,profession,/m/01d_h8,Film Producer-GB,3,/m/05zppz
3,/m/03j90,/people/person/profession,profession,/m/05z96,Poet,0,/m/05zppz
4,/m/02vwckw,/people/person/profession,profession,/m/0n1h,Artist-GB,0,/m/05zppz
5,/m/01w58n3,/people/person/profession,profession,/m/016z4k,Singer-songwriter-GB,0,/m/02zsn
6,/m/01n8gr,/people/person/profession,profession,/m/0lgw7,Photographer,0,/m/05zppz
7,/m/0b7xl8,/people/person/profession,profession,/m/05sxg2,Theatrical producer,0,/m/05zppz
8,/m/01309x,/people/person/profession,profession,/m/02hrh1q,Actor-GB,1,/m/05zppz
9,/m/01trf3,/people/person/profession,profession,/m/0np9r,Voice Actor,0,/m/05zppz


In [35]:
# Frequency of each value in the FB15k-237 gender column.
tail_value_counts_gender = occupation_facts_FB15k237.loc[:, '/people/person/gender'].value_counts()
tail_value_counts_gender

/m/05zppz    1108
/m/02zsn      203
Name: /people/person/gender, dtype: int64

In [36]:
# Group all FB15k-237 test-set facts by occupation class label and count
# the entries per column within each class.
groupy_occupation_labels_all = occupation_facts_FB15k237.groupby(by='true_tail_class_label')
groupy_occupation_labels_all.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,/people/person/gender
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,511,511,511,511,511,511
1,261,261,261,261,261,261
2,116,116,116,116,116,116
3,111,111,111,111,111,111
4,79,79,79,79,79,79
5,71,71,71,71,71,71
6,61,61,61,61,61,61
7,51,51,51,51,51,51
8,50,50,50,50,50,50


In [38]:
# Same '-1' placeholder filter as for HumanWikidata5M, applied to the
# FB15k-237 gender column, followed by the per-class count.
# NOTE(review): the recorded value counts for this column contain no '-1',
# so this filter appears to drop nothing here.
valid_gender_only = occupation_facts_FB15k237.loc[
    occupation_facts_FB15k237['/people/person/gender'].ne('-1')
]
groupy_occupation_labels_only_valid_gender = valid_gender_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_only_valid_gender.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,/people/person/gender
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,511,511,511,511,511,511
1,261,261,261,261,261,261
2,116,116,116,116,116,116
3,111,111,111,111,111,111
4,79,79,79,79,79,79
5,71,71,71,71,71,71
6,61,61,61,61,61,61
7,51,51,51,51,51,51
8,50,50,50,50,50,50


Of the gender facts, what is the per class count for male facts? - /m/05zppz

In [39]:
# Restrict to facts whose gender is /m/05zppz (male) and count them per
# occupation class.
male_gender_only = occupation_facts_FB15k237.loc[
    occupation_facts_FB15k237['/people/person/gender'].eq('/m/05zppz')
]
groupy_occupation_labels_male_gender_only = male_gender_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_male_gender_only.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,/people/person/gender
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,429,429,429,429,429,429
1,202,202,202,202,202,202
2,104,104,104,104,104,104
3,104,104,104,104,104,104
4,76,76,76,76,76,76
5,61,61,61,61,61,61
6,50,50,50,50,50,50
7,42,42,42,42,42,42
8,40,40,40,40,40,40


Of the gender facts, what is the per class count for female facts? - /m/02zsn


In [40]:
# Restrict to facts whose gender is /m/02zsn (female) and count them per
# occupation class.
female_gender_only = occupation_facts_FB15k237.loc[
    occupation_facts_FB15k237['/people/person/gender'].eq('/m/02zsn')
]
groupy_occupation_labels_female_gender_only = female_gender_only.groupby(by='true_tail_class_label')
groupy_occupation_labels_female_gender_only.count()

Unnamed: 0_level_0,head_id,relation_id,relation_label,true_tail_id,true_tail_label,/people/person/gender
true_tail_class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,82,82,82,82,82,82
1,59,59,59,59,59,59
2,12,12,12,12,12,12
3,7,7,7,7,7,7
4,3,3,3,3,3,3
5,10,10,10,10,10,10
6,11,11,11,11,11,11
7,9,9,9,9,9,9
8,10,10,10,10,10,10
