In [56]:
!pip install jsonlines
!pip install sparqlwrapper
!pip install tqdm

import os
import json

from SPARQLWrapper import SPARQLWrapper, JSON
import jsonlines



# Get dict of all predicate labels 

In [57]:
trex_dir = "../TREx"
pid_list = []

for f in os.listdir(trex_dir):
    with open(os.path.join(trex_dir, f)) as json_file:
        f_content = list(json_file)
    pid = json.loads(f_content[0])["predicate_id"]
    pid_list += [pid]

assert len(pid_list) == 41

In [58]:
user_agent = 'AGENT NAME' ## Customize
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)

In [59]:
pid_name_labels = {}
for pid in pid_list:
    search_item = f"{{(wdt:{pid})}}"
    service = """{ bd:serviceParam wikibase:language "en". }"""
    sparql.setQuery(f""" 
                    SELECT ?wdLabel WHERE 
    {{
                    VALUES (?wdt) {search_item}
                    ?wd wikibase:directClaim ?wdt .
                    SERVICE wikibase:label {service}
    }}
    """)
    sparql.setReturnFormat(JSON)
    query_result = sparql.query().convert()
    try:
        pid_name_labels[pid] = query_result["results"]["bindings"][0]["wdLabel"]["value"]
    except:
        pass
  

KeyboardInterrupt: 

In [None]:
pid_name_labels

{'P740': 'location of formation',
 'P108': 'employer',
 'P190': 'twinned administrative body',
 'P27': 'country of citizenship',
 'P1376': 'capital of',
 'P131': 'located in the administrative territorial entity',
 'P937': 'work location',
 'P176': 'manufacturer',
 'P463': 'member of',
 'P20': 'place of death',
 'P136': 'genre',
 'P39': 'position held',
 'P407': 'language of work or name',
 'P527': 'has part(s)',
 'P276': 'location',
 'P19': 'place of birth',
 'P47': 'shares border with',
 'P101': 'field of work',
 'P1303': 'instrument',
 'P17': 'country',
 'P127': 'owned by',
 'P103': 'native language',
 'P31': 'instance of',
 'P159': 'headquarters location',
 'P530': 'diplomatic relation',
 'P495': 'country of origin',
 'P37': 'official language',
 'P138': 'named after',
 'P361': 'part of',
 'P140': 'religion or worldview',
 'P1001': 'applies to jurisdiction',
 'P30': 'continent',
 'P178': 'developer',
 'P279': 'subclass of',
 'P449': 'original broadcaster',
 'P364': 'original langua

In [None]:
with open("predicate_labels.json", "w") as f:
    json.dump(pid_name_labels, f)

# Analyze distributions

In [60]:
!pip install pandas

import pandas as pd



### Create tables for locations and languages including essential information for later analysis

In [61]:
location_predicates_obj = ["P36", "P740", "P190", "P27", "P47", "P1376", "P937", "P131", "P20", "P276", "P19", "P17", "P159", "P495", "P1001", "P30"]
location_predicates_sub = ["P36", "P190", "P1376", "P131", "P47", "P37", "P30"]
person_predicates_sub = ["P108", "P27", "P937", "P20", "P19", "P101", "P103", "P1412", "P106", "P413"] 
language_predicates_obj = ["P407", "P103", "P37", "P364", "P1412"]

In [62]:
with open("predicate_labels.json", "r") as json_file:
    pid_name_labels = json.load(json_file)

In [63]:
trex_dir = "../TREx"

In [64]:
trex_locations = []

for p in set(location_predicates_obj + location_predicates_sub):
    with open(os.path.join(trex_dir, f"{p}.jsonl")) as json_file:
        f_content = list(json_file)
    for l in f_content:
        list_item = json.loads(l)
        pid = list_item["predicate_id"]
        trex_locations += [{
            "predicate_id" : pid, 
            "predicate_label": pid_name_labels[pid],
            "obj_label" : list_item["obj_label"],
            "sub_label" : list_item["sub_label"],
            "location_is_obj" : p in location_predicates_obj,
            "location_is_sub" : p in location_predicates_sub,
            "obj_uri" : list_item["sub_uri"],
            "sub_uri" : list_item["sub_uri"],
            }]

trex_locations_df = pd.DataFrame(trex_locations)
trex_locations_df.to_csv("trex_locations.csv")


In [65]:
trex_languages = []

for p in set(language_predicates_obj):
    with open(os.path.join(trex_dir, f"{p}.jsonl")) as json_file:
        f_content = list(json_file)
    for l in f_content:
        list_item = json.loads(l)
        pid = list_item["predicate_id"]
        trex_languages += [{
            "predicate_id" : pid, 
            "predicate_label": pid_name_labels[pid],
            "obj_label" : list_item["obj_label"],
            "sub_label" : list_item["sub_label"],
            "sub_uri" : list_item["sub_uri"]
            # all language-related items are subjects in the templates
            }]

trex_locations_df = pd.DataFrame(trex_languages)
trex_locations_df.to_csv("trex_languages.csv")


### Get person entities and add gender information

In [68]:
# Load gender map: Maps Wikidata gender ids to labels (str)
path_to_gender_map = "/Users/angeliekraft/Documents/Code/knowledge-enhanced-lm-training/notebooks/wikidata_genders.json" ## Customize
with open(path_to_gender_map, "r") as f:
    gender_map = json.load(f)["map"] 

# Get local file with list of Wikidata entities including gender info
path_to_wiki_entities = "/Users/angeliekraft/Documents/Code/wikidata_humans/wikidata_all_human_entities.jsonl" ## Customize
with open(path_to_wiki_entities) as json_file:
    wikidata_entity_list = list(json_file)

In [69]:
# Make entity list simpler for better querying of gender: maps person id to gender id
simple_entity_list = {}
for entity in wikidata_entity_list:
    entity = json.loads(entity)
    simple_entity_list[entity["entity_id"]] = entity["gender"]

In [70]:
user_agent = 'AGENT NAME' ## Customize
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)

def get_entity_gender_from_wikidata(entity_uri):
    # Query gender through SPARQL if person id is not in the local dump
    sparql.setQuery(f""" 
    SELECT * WHERE {{
    wd:{entity_uri} wdt:P21 ?gender .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """)
    sparql.setReturnFormat(JSON)
    query_result = sparql.query().convert()
    try:
        return query_result["results"]["bindings"][0]["gender"]["value"].split("/")[-1]
    except:
        return "NaN"

def get_gender_of_entity(entity_uri):
    # Get gender information for a specific person entity
    gender_id = simple_entity_list.get(entity_uri, "NaN")
    if gender_id == "NaN":
        gender_id = get_entity_gender_from_wikidata(entity_uri)
        if gender_id == "NaN":
            print("No gender info available for", entity_uri)
    gender_string = gender_map.get(gender_id, "NaN")
    return gender_string
    

In [71]:
# Make list of dicts with person entity infos
trex_people = []

for p in set(person_predicates_sub):
    with open(os.path.join(trex_dir, f"{p}.jsonl")) as json_file:
        f_content = list(json_file)
    for l in f_content:
        list_item = json.loads(l)
        pid = list_item["predicate_id"]
        uri = list_item["sub_uri"]
        trex_people += [{
            "predicate_id" : pid, 
            "predicate_label": pid_name_labels[pid],
            "obj_label" : list_item["obj_label"],
            "sub_label" : list_item["sub_label"],
            "sub_uri" : uri,
            # all person-related items are subjects in the templates
            "gender" : get_gender_of_entity(uri)
            }]


No gender info available for Q599145
No gender info available for Q671780
No gender info available for Q2590074
No gender info available for Q5247396
No gender info available for Q6203279
No gender info available for Q367143
No gender info available for Q18233
No gender info available for Q1683281
No gender info available for Q49542
No gender info available for Q622283
No gender info available for Q1998273
No gender info available for Q18913178
No gender info available for Q1192364
No gender info available for Q165192
No gender info available for Q1673765
No gender info available for Q311762
No gender info available for Q1057292
No gender info available for Q1673765
No gender info available for Q4154059
No gender info available for Q8068538
No gender info available for Q2366673
No gender info available for Q5639595
No gender info available for Q378422
No gender info available for Q7916974
No gender info available for Q7427317
No gender info available for Q18206631
No gender info availa

In [72]:
# Make table and store
trex_locations_df = pd.DataFrame(trex_people)
trex_locations_df.to_csv("trex_persons.csv")

In [73]:
trex_locations_df["gender"].value_counts()

gender
male               7494
female             1146
NaN                 107
non-binary            2
trans woman           2
female organism       1
Name: count, dtype: int64

### Analyze geospatial distribution (WIP)

Tutorial: https://towardsdatascience.com/using-python-to-create-a-world-map-from-a-list-of-country-names-cd7480d03b10

In [1]:
!pip install pycountry-convert
!pip install geopy
!pip install numpy



In [2]:
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
from geopy.geocoders import Nominatim
import numpy as np
import pandas as pd
from tqdm import tqdm


In [3]:
trex_locations_df = pd.read_csv("trex_locations.csv")

In [4]:
def get_continent(col):
    try:
        cn_a2_code =  country_name_to_country_alpha2(col)
    except:
        cn_a2_code = 'Unknown' 
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except:
        cn_continent = 'Unknown' 
    return (cn_a2_code, cn_continent)

In [5]:
geolocator = Nominatim(user_agent="trex_eda")
def geolocate(location):
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(location)
        # And return latitude and longitude
        return (loc.latitude, loc.longitude)
    except:
        # Return missing value
        return np.nan

In [6]:
def get_geolocation_for_row(row):
    if row["location_is_sub"]:
        row["continent_sub"] = geolocate(row["sub_label"])
    else:
        row["continent_sub"] = "NaN"
    if row["location_is_obj"]:
        row["continent_obj"] = geolocate(row["obj_label"])
    else:
        row["continent_obj"] = "NaN"
    return row

def get_geolocations_for_df(df, start_idx, end_idx):
    return df.iloc[start_idx:end_idx].apply(get_geolocation_for_row, axis=1)


In [7]:
import threading

class ThreadWithReturnValue(threading.Thread):

    def __init__(self, group=None, target=None, name=None,
                    args=(), kwargs={}, Verbose=None):
        threading.Thread.__init__(self, group, target, name, args, kwargs)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args,
                                                **self._kwargs)
    def join(self, *args):
        threading.Thread.join(self, *args)
        return self._return


In [8]:

thread1 = ThreadWithReturnValue(target=get_geolocations_for_df, args=(trex_locations_df, 0, 3723))
thread2 = ThreadWithReturnValue(target=get_geolocations_for_df, args=(trex_locations_df, 3723, 7446))
thread3 = ThreadWithReturnValue(target=get_geolocations_for_df, args=(trex_locations_df, 7446, 11169))
thread4 = ThreadWithReturnValue(target=get_geolocations_for_df, args=(trex_locations_df, 11169, -1))

In [9]:
thread1.start()
thread2.start()
thread3.start()
thread4.start()

In [10]:
trex_locations_df = pd.concat([thread1.join(), thread2.join(), thread3.join(), thread4.join()], axis=0)

In [None]:
trex_locations_df#["continent_sub"].value_counts()

Unnamed: 0.1,Unnamed: 0,predicate_id,predicate_label,obj_label,sub_label,location_is_obj,location_is_sub,obj_uri,sub_uri,continent_sub,continent_obj
0,0,P27,country of citizenship,Brazil,Rubens Barrichello,True,False,Q169846,Q169846,,"(-10.3333333, -53.2)"
1,1,P27,country of citizenship,France,Yves Mirande,True,False,Q3573865,Q3573865,,"(46.603354, 1.8883335)"
2,2,P27,country of citizenship,Estonia,August Gailit,True,False,Q761368,Q761368,,"(58.7523778, 25.3319078)"
3,3,P27,country of citizenship,Denmark,Princess Elisabeth of Denmark,True,False,Q242691,Q242691,,"(55.670249, 10.3333283)"
4,4,P27,country of citizenship,Denmark,Tue West,True,False,Q7851370,Q7851370,,"(55.670249, 10.3333283)"
5,5,P27,country of citizenship,Sweden,Bertil Lindblad,True,False,Q364505,Q364505,,"(59.6749712, 14.5208584)"
6,6,P27,country of citizenship,India,Prafulla Chandra Ghosh,True,False,Q7237579,Q7237579,,"(22.3511148, 78.6677428)"
7,7,P27,country of citizenship,Japan,Noriyasu Hirata,True,False,Q504815,Q504815,,"(36.5748441, 139.2394179)"
8,8,P27,country of citizenship,Canada,Woodrow Lloyd,True,False,Q434670,Q434670,,"(61.0666922, -107.991707)"
9,9,P27,country of citizenship,Nigeria,Namadi Sambo,True,False,Q2993602,Q2993602,,"(9.6000359, 7.9999721)"
