In [2]:
!pip install jsonlines
!pip install sparqlwrapper
!pip install tqdm

import os
import json

from SPARQLWrapper import SPARQLWrapper, JSON
import jsonlines

Collecting jsonlines
  Using cached jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Collecting attrs>=19.2.0
  Using cached attrs-23.1.0-py3-none-any.whl (61 kB)
Installing collected packages: attrs, jsonlines
Successfully installed attrs-23.1.0 jsonlines-3.1.0
Collecting sparqlwrapper
  Using cached SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pyparsing<4,>=2.1.0
  Downloading pyparsing-3.1.1-py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.1/103.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0
  Using cached isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Installing collected packages: pyparsing, isodate, rdflib, sparqlwrapper
Successfully installed isodate-0.6.1 pyparsing-3.1.1 rdfli

# Get dict of all predicate labels 

In [11]:
# Directory containing the relation files; later cells open them as "<predicate id>.jsonl".
trex_dir = "../TREx"
pid_list = []

for f in os.listdir(trex_dir):
    # Ignore anything that is not a relation file (hidden OS files, sub-directories, ...),
    # which would otherwise fail to open or to parse.
    if not f.endswith(".jsonl"):
        continue
    with open(os.path.join(trex_dir, f)) as json_file:
        # Only the first record is consulted, so there is no need to read the whole file.
        # NOTE(review): presumably every record in a file shares one predicate_id - confirm.
        first_record = json_file.readline()
    pid = json.loads(first_record)["predicate_id"]
    pid_list.append(pid)

# Sanity check: the analysis expects exactly 41 predicates.
assert len(pid_list) == 41, f"expected 41 predicates, found {len(pid_list)}"

In [12]:
# Client for the public Wikidata SPARQL endpoint; requests are sent with the
# agent string below, which should be replaced by a real identifier.
user_agent = "AGENT NAME"  ## Customize
sparql = SPARQLWrapper(
    endpoint="https://query.wikidata.org/sparql",
    agent=user_agent,
)

In [27]:
# Maps each predicate id in pid_list to its English Wikidata label.
pid_name_labels = {}

# The response format is the same for every query, so configure it once.
sparql.setReturnFormat(JSON)

for pid in pid_list:
    search_item = f"{{(wdt:{pid})}}"
    service = """{ bd:serviceParam wikibase:language "en". }"""
    sparql.setQuery(f""" 
                    SELECT ?wdLabel WHERE 
    {{
                    VALUES (?wdt) {search_item}
                    ?wd wikibase:directClaim ?wdt .
                    SERVICE wikibase:label {service}
    }}
    """)
    query_result = sparql.query().convert()
    try:
        pid_name_labels[pid] = query_result["results"]["bindings"][0]["wdLabel"]["value"]
    except (KeyError, IndexError, TypeError):
        # Keep going (best effort), but report the gap: later cells index
        # pid_name_labels[pid] directly and would fail on a missing label.
        print("No label available for", pid)
  

In [41]:
# Display the predicate-id -> label mapping collected from Wikidata.
pid_name_labels

{'P740': 'location of formation',
 'P108': 'employer',
 'P190': 'twinned administrative body',
 'P27': 'country of citizenship',
 'P1376': 'capital of',
 'P131': 'located in the administrative territorial entity',
 'P937': 'work location',
 'P176': 'manufacturer',
 'P463': 'member of',
 'P20': 'place of death',
 'P136': 'genre',
 'P39': 'position held',
 'P407': 'language of work or name',
 'P527': 'has part(s)',
 'P276': 'location',
 'P19': 'place of birth',
 'P47': 'shares border with',
 'P101': 'field of work',
 'P1303': 'instrument',
 'P17': 'country',
 'P127': 'owned by',
 'P103': 'native language',
 'P31': 'instance of',
 'P159': 'headquarters location',
 'P530': 'diplomatic relation',
 'P495': 'country of origin',
 'P37': 'official language',
 'P138': 'named after',
 'P361': 'part of',
 'P140': 'religion or worldview',
 'P1001': 'applies to jurisdiction',
 'P30': 'continent',
 'P178': 'developer',
 'P279': 'subclass of',
 'P449': 'original broadcaster',
 'P364': 'original langua

In [28]:
# Persist the predicate-id -> label mapping so later sections can reload it
# without querying Wikidata again.
serialized_labels = json.dumps(pid_name_labels)
with open("predicate_labels.json", "w") as labels_file:
    labels_file.write(serialized_labels)

# Analyze distributions

In [40]:
!pip install pandas

import pandas as pd

Collecting pandas
  Downloading pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pytz>=2020.1
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.0.3 pytz-2023.3 tzdata-2023.3


### Create tables for locations and languages including essential information for later analysis

In [50]:
# Predicate groups used to build the per-topic tables below.
# The suffix tells which side of the triple carries the entity of interest:
# `_obj` -> the object, `_sub` -> the subject.
location_predicates_obj = [
    "P36", "P740", "P190", "P27",
    "P47", "P1376", "P937", "P131",
    "P20", "P276", "P19", "P17",
    "P159", "P495", "P1001", "P30",
]
location_predicates_sub = [
    "P36", "P190", "P1376", "P131",
    "P47", "P37", "P30",
]
person_predicates_sub = [
    "P108", "P27", "P937", "P20", "P19",
    "P101", "P103", "P1412", "P106", "P413",
]
language_predicates_obj = [
    "P407", "P103", "P37", "P364", "P1412",
]

In [45]:
# Reload the predicate-id -> label mapping written by the earlier section.
with open("predicate_labels.json", "r") as labels_file:
    labels_text = labels_file.read()
pid_name_labels = json.loads(labels_text)

In [None]:
# Directory holding the "<predicate id>.jsonl" relation files read by the cells below.
trex_dir = "../TREx"

In [62]:
# One record per triple whose subject and/or object is a location.
trex_locations = []

# Sorted so the CSV row order is the same on every run (plain set order is not stable).
for p in sorted(set(location_predicates_obj + location_predicates_sub)):
    with open(os.path.join(trex_dir, f"{p}.jsonl")) as json_file:
        for line in json_file:
            list_item = json.loads(line)
            pid = list_item["predicate_id"]
            trex_locations.append({
                "predicate_id" : pid,
                "predicate_label": pid_name_labels[pid],
                "obj_label" : list_item["obj_label"],
                "sub_label" : list_item["sub_label"],
                # Flags telling which side(s) of the triple hold the location.
                "location_is_obj" : p in location_predicates_obj,
                "location_is_sub" : p in location_predicates_sub,
                # Fixed: this column previously duplicated the subject URI.
                "obj_uri" : list_item["obj_uri"],
                "sub_uri" : list_item["sub_uri"],
                })

trex_locations_df = pd.DataFrame(trex_locations)
trex_locations_df.to_csv("trex_locations.csv")


In [64]:
# One record per triple of a language-related predicate.
trex_languages = []

# Sorted so the CSV row order is the same on every run (plain set order is not stable).
for p in sorted(set(language_predicates_obj)):
    with open(os.path.join(trex_dir, f"{p}.jsonl")) as json_file:
        for line in json_file:
            list_item = json.loads(line)
            pid = list_item["predicate_id"]
            trex_languages.append({
                "predicate_id" : pid,
                "predicate_label": pid_name_labels[pid],
                "obj_label" : list_item["obj_label"],
                "sub_label" : list_item["sub_label"],
                # NOTE(review): the predicates come from `language_predicates_obj`, so the
                # language is presumably the object, yet only the subject URI is kept - confirm.
                "sub_uri" : list_item["sub_uri"]
                })

# Dedicated name, so the locations frame is not overwritten with language rows.
trex_languages_df = pd.DataFrame(trex_languages)
trex_languages_df.to_csv("trex_languages.csv")


### Get person entities and add gender information

In [66]:
# Gender map: translates Wikidata gender ids into readable labels (str).
# The mapping itself is stored under the "map" key of the JSON document.
path_to_gender_map = "YOUR PATH" ## Customize
with open(path_to_gender_map, "r") as gender_map_file:
    gender_map_document = json.loads(gender_map_file.read())
gender_map = gender_map_document["map"]

# Local dump of Wikidata entities with gender info; kept as raw lines here
# and parsed in the next step.
path_to_wiki_entities = "YOUR PATH" ## Customize
with open(path_to_wiki_entities) as entities_file:
    wikidata_entity_list = entities_file.readlines()

In [95]:
# Flatten the entity dump into a plain lookup table: person id -> gender id.
# Each raw line is parsed as JSON; if an id occurs more than once the last
# occurrence wins.
parsed_entities = map(json.loads, wikidata_entity_list)
simple_entity_list = {
    record["entity_id"]: record["gender"]
    for record in parsed_entities
}

In [107]:
# Client for the public Wikidata SPARQL endpoint; requests are sent with the
# agent string below, which should be replaced by a real identifier.
user_agent = "AGENT NAME"  ## Customize
sparql = SPARQLWrapper(
    endpoint="https://query.wikidata.org/sparql",
    agent=user_agent,
)

def get_entity_gender_from_wikidata(entity_uri):
    """Look up an entity's gender (property P21) on Wikidata via SPARQL.

    Used as a fallback when the entity is not covered by the local dump.

    Args:
        entity_uri: Wikidata entity id; it is inserted directly after the
            ``wd:`` prefix in the query, so a bare id such as ``"Q42"`` is
            expected.

    Returns:
        The last path segment of the first returned gender value, or the
        string ``"NA"`` when the response contains no usable gender binding.

    Errors raised while sending the query are not handled here and propagate
    to the caller.
    """
    sparql.setQuery(f""" 
    SELECT * WHERE {{
    wd:{entity_uri} wdt:P21 ?gender .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """)
    sparql.setReturnFormat(JSON)
    query_result = sparql.query().convert()
    try:
        # The value is a URI; only its trailing id is of interest.
        return query_result["results"]["bindings"][0]["gender"]["value"].split("/")[-1]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Empty or unexpectedly shaped result. Only lookup errors are caught,
        # so an interrupt can still stop a long-running loop.
        return "NA"

def get_gender_of_entity(entity_uri):
    """Resolve the gender label of a person entity.

    The local lookup table is consulted first; when it has no entry (or the
    entry is "NA") Wikidata is queried instead. A message is printed if
    neither source yields a gender id.

    Returns the label from the gender map, or "NA" if the id is not mapped.
    """
    if entity_uri in simple_entity_list:
        gender_id = simple_entity_list[entity_uri]
    else:
        gender_id = "NA"

    # Fall back to the remote lookup only when the local table is of no help.
    if gender_id == "NA":
        gender_id = get_entity_gender_from_wikidata(entity_uri)

    if gender_id == "NA":
        print("No gender info available for", entity_uri)

    return gender_map.get(gender_id, "NA")
    

In [108]:
# Make list of dicts with person entity infos
trex_people = []

# The same subject may occur in several records; remember each resolved gender
# so an entity is looked up (and, if unresolved, reported) only once.
gender_by_uri = {}

# Sorted so the row order is the same on every run (plain set order is not stable).
for p in sorted(set(person_predicates_sub)):
    with open(os.path.join(trex_dir, f"{p}.jsonl")) as json_file:
        for line in json_file:
            list_item = json.loads(line)
            pid = list_item["predicate_id"]
            uri = list_item["sub_uri"]
            if uri not in gender_by_uri:
                gender_by_uri[uri] = get_gender_of_entity(uri)
            trex_people.append({
                "predicate_id" : pid,
                "predicate_label": pid_name_labels[pid],
                "obj_label" : list_item["obj_label"],
                "sub_label" : list_item["sub_label"],
                "sub_uri" : uri,
                # all person-related items are subjects in the templates
                "gender" : gender_by_uri[uri]
                })


No gender info available for Q5605925
No gender info available for Q5639595
No gender info available for Q378422
No gender info available for Q7916974
No gender info available for Q179132
No gender info available for Q1454986
No gender info available for Q4612907
No gender info available for Q47913
No gender info available for Q580606
No gender info available for Q44703
No gender info available for Q1379239
No gender info available for Q179677
No gender info available for Q46857
No gender info available for Q2125835
No gender info available for Q920064
No gender info available for Q221395
No gender info available for Q980357
No gender info available for Q357503
No gender info available for Q383092
No gender info available for Q674113
No gender info available for Q189210
No gender info available for Q946028
No gender info available for Q669166
No gender info available for Q2470594
No gender info available for Q3567687
No gender info available for Q1815078
No gender info available for Q2

In [110]:
# Turn the person records into a table and write it to disk.
# NOTE: the frame is bound to `trex_locations_df` although it holds person
# rows; the name is kept because the following cell reads it.
trex_locations_df = pd.DataFrame(data=trex_people)
trex_locations_df.to_csv(path_or_buf="trex_persons.csv")

In [111]:
# Count how many rows of the table fall under each gender label.
trex_locations_df["gender"].value_counts()

gender
male               7494
female             1146
NA                  107
non-binary            2
trans woman           2
female organism       1
Name: count, dtype: int64

### Analyze geospatial distribution (WIP)

Tutorial: https://towardsdatascience.com/using-python-to-create-a-world-map-from-a-list-of-country-names-cd7480d03b10

In [36]:
!pip install pycountry-convert
!pip install geopy
!pip install numpy

Collecting numpy
  Downloading numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.25.2


In [37]:
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
from geopy.geocoders import Nominatim
import numpy as np

In [32]:
def get_continent(col):
    """Translate a country name into its alpha-2 code and continent code.

    Args:
        col: Country name to look up.

    Returns:
        A tuple ``(alpha2_code, continent_code)``. A part that cannot be
        resolved is reported as the string ``'Unknown'``.
    """
    unknown = 'Unknown'
    try:
        cn_a2_code = country_name_to_country_alpha2(col)
    except Exception:
        # No country code means no continent can be derived either, so the
        # second lookup is skipped. `Exception` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit working while a table is processed.
        return (unknown, unknown)
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except Exception:
        cn_continent = unknown
    return (cn_a2_code, cn_continent)

In [38]:
# Geocoder client used by geolocate(); "trex_eda" is passed as its user agent.
geolocator = Nominatim(user_agent="trex_eda")
def geolocate(country):
    """Geocode a country name with the module-level ``geolocator``.

    Args:
        country: Query string handed to the geocoder.

    Returns:
        ``(latitude, longitude)`` of the match, or ``np.nan`` when the lookup
        fails or yields no match.
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Best effort: any lookup failure becomes a missing value. `Exception`
        # (not a bare except) still lets KeyboardInterrupt stop a long run.
        return np.nan
    if loc is None:
        # The geocoder found nothing for this query.
        return np.nan
    return (loc.latitude, loc.longitude)