# Inductive Reasoning with Text: IRT2

This notebook describes how to load the IRT2 dataset. 
Some of the properties are looked at in detail to offer insights into the datamodel.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import irt2
from irt2.dataset import IRT2
from irt2.dataset import MID

#  folder convention:
#     data/irt2/<graph>/<size>
#  where
#     graph = cde|fb
#     size  = tiny|small|medium|large (abbreviated: T|S|M|L)
#  for example:
#     path = 'data/irt2/cde/small'

data = IRT2.from_dir(path=irt2.ENV.DIR.DATA / 'irt2' / 'cde' / 'large')
print(str(data))

IRT2/CDE-L: 17050 vertices | 51 relations | 32666 mentions


In [3]:
import gzip
import textwrap
from itertools import islice
from tabulate import tabulate

from collections import Counter
from collections import defaultdict

from typing import Generator

In [4]:
# configuration

print(tabulate(data.config.items()))

---------------------  ------------------------------------
name                   IRT2/CDE-L
created                2022-02-01T16:29:14.510416
match count threshold  5
mention split ratio    0.5
seed                   5012022
seperator              |
source graph           lib/codex/data
source matches         data/matches/irt.cde
source pages           data/matches/src/matches-v6-codex.db
spacy model            en_core_web_lg
---------------------  ------------------------------------


In [5]:
# show example vertices and relations

print('\nvertices:')
print(f'    vid name')
for vid, name in islice(data.vertices.items(), 10):
    print(f'{vid:7d} {name}')

print('\nrelations:')
print(f'    rid name')
for rid, name in islice(data.relations.items(), 10):
    print(f'{rid:7d} {name}')


vertices:
    vid name
      0 Q108946:A Few Good Men
      1 Q39792:Jack Nicholson
      2 Q1041:Senegal
      3 Q7809:UNESCO
      4 Q314924:Anthony Edwards
      5 Q33999:actor
      6 Q722876:Roy Mayorga
      7 Q855091:guitarist
      8 Q78514:Franz Werfel
      9 Q268970:Republic of German-Austria

relations:
    rid name
      0 P161:cast member
      1 P463:member of
      2 P106:occupation
      3 P27:country of citizenship
      4 P69:educated at
      5 P172:ethnic group
      6 P840:narrative location
      7 P530:diplomatic relation
      8 P509:cause of death
      9 P17:country


In [6]:
# show example closed-world triples

print(tabulate(
    [
        (h, data.vertices[h], r, data.relations[r], t, data.vertices[t])
        for h, t, r in islice(data.closed_triples, 20)
    ],
    headers=('vid', 'head', 'rid', 'relation', 'vid', 'tail')
))

  vid  head                                rid  relation                      vid  tail
-----  --------------------------------  -----  --------------------------  -----  ------------------------------------------------
 5518  Q365985:Claude François              10  P136:genre                   5565  Q58339:disco
 2283  Q238719:Naomi Shemer                 27  P101:field of work            927  Q482:poetry
 7917  Q26702:Cardinal Richelieu             3  P27:country of citizenship     23  Q142:France
 8107  Q286366:Émile Proulx-Cloutier         2  P106:occupation               701  Q753110:songwriter
 8955  Q190772:Siméon Denis Poisson          4  P69:educated at              1482  Q273626:École Polytechnique
12219  Q539171:Dalida                       24  P119:place of burial         2422  Q746647:Montmartre Cemetery
 9492  Q179041:Lisa Kudrow                   2  P106:occupation               278  Q36180:writer
 3104  Q884:South Korea                     35  P37:official language    

In [7]:
# count total amount of available text contexts
# (this can take a while...)

def count_contexts(contexts: Generator, n: int = None):

    counts = Counter()
    count = 0

    for ctx in islice(contexts, n):

        # assert mid in data.mentions
        # TODO pre-filter
        if ctx.mid not in data.mentions:
            continue

        count += 1
        counts[ctx.mid] += 1


    print(f'  read {count} relevant contexts')
    print(f'  saw contexts for {len(counts)} mentions')

    return counts


with data.closed_contexts() as contexts:
    print('\ncounting closed-world contexts')
    ctx_counts_closed: dict[MID, int] = count_contexts(contexts, n=1000)

with data.open_contexts() as contexts:
    print('\ncounting open-world contexts')
    ctx_counts_open: dict[MID, int] = count_contexts(contexts, n=1000)



counting closed-world contexts
  read 1000 relevant contexts
  saw contexts for 312 mentions

counting open-world contexts
  read 1000 relevant contexts
  saw contexts for 312 mentions


In [8]:
# show some mentions

for vid, mids in islice(data.closed_mentions.items(), 30, 35):
    print(f'\n{len(mids)} mentions of {data.vertices[vid]} ({vid=})')
    for mid in mids:
        mention = data.mentions[mid]
        print(f'  {mid=} {mention} ({ctx_counts_closed[mid]} matches)')

print('\nOPEN WORLD ' + '-' * 20)
# open-world mentions
for vid, mids in islice(data.open_mentions.items(), 30, 35):
    print(f'\n{len(mids)} mentions of {data.vertices[vid]} ({vid=})')
    for mid in mids:
        mention = data.mentions[mid]
        print(f'  {mid=} {mention} ({ctx_counts_open[mid]} matches)')


11 mentions of Q5043:Christianity (vid=893)
  mid=192 christendom (0 matches)
  mid=193 religion (0 matches)
  mid=194 christian churches (0 matches)
  mid=195 christianised (0 matches)
  mid=196 christianity (0 matches)
  mid=197 christian faith (0 matches)
  mid=198 christianized (0 matches)
  mid=188 christian (0 matches)
  mid=189 catholic (0 matches)
  mid=190 missionary (0 matches)
  mid=191 christian theology (0 matches)

7 mentions of Q29:Spain (vid=1695)
  mid=199 spanish crown (0 matches)
  mid=200 spanish government (0 matches)
  mid=201 hispanic (0 matches)
  mid=202 españa (0 matches)
  mid=203 spaniards (0 matches)
  mid=204 the country (0 matches)
  mid=205 spain (3 matches)

3 mentions of Q16867:Edgar Allan Poe (vid=7331)
  mid=208 poe's (0 matches)
  mid=209 edgar allan poe (3 matches)
  mid=210 edgar (0 matches)

6 mentions of Q7737:Russian (vid=42)
  mid=211 russian (6 matches)
  mid=212 russian speaking (0 matches)
  mid=213 translation (0 matches)
  mid=214 fsb (0

In [9]:
# some examples for the head and tail tasks

print('\nHEAD TASK ' + '-' * 20)
for (mid, rid), vid in islice(data.open_task_heads.items(), 10):
    print(f'\n"{data.mentions[mid]}" {data.relations[rid]} ?')
    print(f'  answer: {data.vertices[vid]}')

print('\nTAIL TASK ' + '-' * 20)
for (mid, rid), vid in islice(data.open_task_tails.items(), 10):
    print(f'\n? {data.relations[rid]} "{data.mentions[mid]}"')
    print(f'  answer: {data.vertices[vid]}')


HEAD TASK --------------------

"pavel sukhoi" P106:occupation ?
  answer: Q82955:politician

"the bride" P136:genre ?
  answer: Q157394:fantasy film

"dalida" P119:place of burial ?
  answer: Q746647:Montmartre Cemetery

"vince vaughn" P106:occupation ?
  answer: Q3282637:film producer

"lisa kudrow" P106:occupation ?
  answer: Q948329:character actor

"hanns josef ortheil" P27:country of citizenship ?
  answer: Q183:Germany

"lou reed" P509:cause of death ?
  answer: Q147778:liver cirrhosis

"robert siewert" P27:country of citizenship ?
  answer: Q183:Germany

"naya rivera" P106:occupation ?
  answer: Q10798782:television actor

"27 dresses" P840:narrative location ?
  answer: Q60:New York City

TAIL TASK --------------------

? P40:child "philippine de rothschild"
  answer: Q557930:Philippe de Rothschild

? P463:member of "ligue de la patrie française"
  answer: Q163159:Alfred Nicolas Rambaud

? P19:place of birth "greifswald"
  answer: Q76539:Hans Fallada

? P161:cast member "mark

In [13]:
mid2vid = (
    {mid: vid for vid, mids in data.closed_mentions.items() for mid in mids} |
    {mid: vid for vid, mids in data.open_mentions.items() for mid in mids}
)


texts = defaultdict(set)
with data.closed_contexts() as contexts:
    for ctx in islice(contexts, 1000):
        texts[ctx.mid].add(ctx)

    texts = dict(texts)


for mid in islice(texts, 3):
    mention_norm = data.mentions[mid]
    vertex = data.vertices[mid2vid[mid]]

    print(f'\ntext for {mention_norm} ({mid=}) ({vertex=})')
    for ctx in texts[mid]:
        textwrap.indent(str(ctx), '  ') + '\n'


text for patricia kaas (mid=23237) (vertex='Q159704:Patricia Kaas')

text for paris (mid=2056) (vertex='Q90:Paris')

text for russian (mid=211) (vertex='Q7737:Russian')
