# IRT2 - Inductive Reasoning with Text

This notebook describes how to load the IRT2 dataset.
It then looks at some of the dataset's properties in detail to offer insight into the data model.

In [1]:
# reload imported modules before each cell is executed, so that edits
# to the local source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import irt2
from irt2.dataset import IRT2
from irt2.dataset import MID

# Datasets follow the folder convention data/irt2/<graph>/<size>, with
#   graph: cde | fb
#   size:  tiny | small | medium | large (abbreviated: T | S | M | L)
# e.g. 'data/irt2/cde/small'

dataset_dir = irt2.ENV.DIR.DATA / 'irt2' / 'cde' / 'large'
data = IRT2.from_dir(path=dataset_dir)
print(data)

IRT2/CDE-L: 15020 vertices | 46 relations | 32666 mentions


In [3]:
# building the description iterates all text contexts, which might take a while...
# repeated calls are cheap: the return value is cached
print(data.description)


IRT2/CDE-L
created: Wed Apr 13 13:53:41 2022

  vertices: 15020
  relations: 46
  mentions: 32666

  closed-world
    triples: 158463
    vertices: 9769
    mentions: 22578 (~2.311 per vertex)
    contexts: 18616203

  open-world (validation)
    head tasks: 38363
    tail tasks: 4454
    mentions: 3026 (~1.108 per vertex)
    contexts: 477826

  open-world (test)
    head tasks: 88620
    tail tasks: 9944
    mentions: 7062 (~1.231 per vertex)
    contexts: 1180605




In [4]:
# further information is given in the configuration which was used
# for dataset creation; it is rendered as YAML here for readability
import yaml

print(yaml.dump(data.config))

create:
  concept relations:
  - P1412:languages spoken, written, or signed
  - P1303:instrument
  - P140:religion
  - P27:country of citizenship
  - P30:continent
  - P509:cause of death
  - P172:ethnic group
  - P2348:time period
  - P102:member of political party
  - P106:occupation
  - P495:country of origin
  - P136:genre
  - P641:sport
  - P19:place of birth
  - P69:educated at
  - P463:member of
  - P264:record label
  - P20:place of death
  - P1050:medical condition
  - P101:field of work
  - P2283:uses
  - P135:movement
  - P119:place of burial
  - P108:employer
  - P37:official language
  - P840:narrative location
  - P17:country
  exclude relations:
  - P1056:product or material produced
  - P3095:practiced by
  - P54:member of sports team
  - P113:airline hub
  - P780:symptoms
  graph loader: codex
  graph loader args:
  - lib/codex/data/triples/codex-m/train.txt
  - lib/codex/data/triples/codex-m/valid.txt
  - lib/codex/data/triples/codex-m/test.txt
  graph loader kwargs:


In [5]:
import textwrap
from itertools import islice
from tabulate import tabulate

from collections import Counter
from collections import defaultdict

from typing import Generator

In [6]:
# show example vertices and relations

# both mappings go from an integer id to a name and are printed with the
# same layout, so one loop handles them instead of two copied blocks
for title, id_name, id2name in (
    ('vertices', 'vid', data.vertices),
    ('relations', 'rid', data.relations),
):
    print(f'\n{title}:')
    print(f'    {id_name} name')
    # only the first ten entries are shown
    for idx, name in islice(id2name.items(), 10):
        print(f'{idx:7d} {name}')


vertices:
    vid name
      0 Q108946:A Few Good Men
      1 Q39792:Jack Nicholson
      2 Q1041:Senegal
      3 Q7809:UNESCO
      4 Q314924:Anthony Edwards
      5 Q33999:actor
      6 Q855091:guitarist
      7 Q268970:Republic of German-Austria
      8 Q182973:University of Iowa
      9 Q188093:Langston Hughes

relations:
    rid name
      0 P161:cast member
      1 P463:member of
      2 P106:occupation
      3 P27:country of citizenship
      4 P69:educated at
      5 P172:ethnic group
      6 P840:narrative location
      7 P530:diplomatic relation
      8 P509:cause of death
      9 P17:country


In [7]:
# show example closed-world triples

# triples unpack as (head vid, tail vid, rid); the table lists them
# in head - relation - tail order with ids resolved to names
sample_triples = islice(data.closed_triples, 20)
rows = [
    (h, data.vertices[h], r, data.relations[r], t, data.vertices[t])
    for h, t, r in sample_triples
]

print(tabulate(rows, headers=('VID', 'head', 'RID', 'relation', 'VID', 'tail')))

  VID  head                             RID  relation                                      VID  tail
-----  -----------------------------  -----  ------------------------------------------  -----  ---------------------------------------------------
 8711  Q362106:Boris Christoff            3  P27:country of citizenship                    544  Q219:Bulgaria
 9159  Q78632:Richard Tauber              3  P27:country of citizenship                    445  Q40:Austria
13948  Q349420:Fabolous                   2  P106:occupation                               471  Q753110:songwriter
 7771  Q401107:Ahmed Aboutaleb            2  P106:occupation                               190  Q1930187:journalist
11414  Q946019:Eric Martin               17  P264:record label                             139  Q193023:Capitol Records
13246  Q68468:Otto Sander                 2  P106:occupation                                14  Q2259451:stage actor
 6639  Q295964:Jon Favreau                2  P106:occupation     

In [8]:
# this showcases how to access text contexts

def count_contexts(contexts: Generator, n: int = None):
    """Tally contexts per mention id and per origin.

    Consumes at most ``n`` items from ``contexts`` (all of them if ``n``
    is None). Every consumed context is checked against the notebook's
    global ``data``: its ``mid`` must be a key of ``data.mentions`` and
    its ``mention`` must occur in its ``data`` attribute.

    Prints a two-line summary and returns a dict with the keys
    ``total`` (number of contexts read), ``mids`` (Counter keyed by
    mention id) and ``origins`` (Counter keyed by context origin).
    """
    mids = Counter()
    origins = Counter()
    total = 0

    # enumerate from 1 so that `total` ends up as the number of items read
    for total, context in enumerate(islice(contexts, n), start=1):
        # sanity checks on the data model
        assert context.mid in data.mentions
        assert context.mention in context.data

        mids[context.mid] += 1
        origins[context.origin] += 1

    counts = dict(total=total, mids=mids, origins=origins)

    print(f'  read {counts["total"]} relevant contexts')
    print(f'  for {len(counts["mids"])} mentions from {len(counts["origins"])} origins')

    return counts

# Contexts are retrieved using a context manager which handles
# opening/closing files appropriately. The managed object is
# a generator yielding irt2.dataset.Context objects.

# upper bound on the number of contexts read per split;
# None reads every context, a small integer gives a quick look
n = None

# the returned counts are kept per split because the mention
# listing further down looks up the per-mention match counts
with data.closed_contexts() as contexts:
    print('\ncounting closed-world (training) contexts')
    ctx_counts_closed = count_contexts(contexts, n=n)

with data.open_contexts_validation() as contexts:
    print('\ncounting open-world (validation) contexts')
    ctx_counts_open_val = count_contexts(contexts, n=n)

with data.open_contexts_test() as contexts:
    print('\ncounting open-world (test) contexts')
    ctx_counts_open_test = count_contexts(contexts, n=n)


counting closed-world (training) contexts


  read 18616203 relevant contexts
  for 22577 mentions from 3560179 origins

counting open-world (validation) contexts


  read 477826 relevant contexts
  for 3026 mentions from 237144 origins

counting open-world (test) contexts


  read 1180605 relevant contexts
  for 7062 mentions from 479953 origins


In [9]:
# show some mentions

# every split is rendered the same way, so each vid -> mids mapping is
# paired with the context counts gathered for that split and a single
# loop replaces three copied blocks
mention_sections = (
    ('closed-world (training)', data.closed_mentions, ctx_counts_closed),
    ('open-world (validation)', data.open_mentions_val, ctx_counts_open_val),
    ('open-world (test)', data.open_mentions_test, ctx_counts_open_test),
)

for title, vid2mids, ctx_counts in mention_sections:
    print(f'\n{title} ' + '-' * 20)
    # skip the first 30 vertices and show the following five
    for vid, mids in islice(vid2mids.items(), 30, 35):
        print(f'\n  {len(mids)} mentions of {data.vertices[vid]} ({vid=})')
        for mid in mids:
            mention = data.mentions[mid]
            print(f'    {mid=} {mention} ({ctx_counts["mids"][mid]} matches)')


closed-world (training) --------------------

  1 mentions of Q45909:John Cale (vid=2062)
    mid=30 john cale (643 matches)

  1 mentions of Q92851:David A. Patterson (vid=7872)
    mid=31 david patterson (10 matches)

  1 mentions of Q207536:The Matrix Revolutions (vid=7550)
    mid=32 the matrix revolutions (160 matches)

  2 mentions of Q78494:Arthur Koestler (vid=987)
    mid=33 koestler (5 matches)
    mid=20541 arthur koestler (309 matches)

  3 mentions of Q107420:Sheldon Lee Glashow (vid=7133)
    mid=34 sheldon lee glashow (18 matches)
    mid=6827 glashow (8 matches)
    mid=5319 sheldon glashow (17 matches)

open-world (validation) --------------------

  1 mentions of Q192762:Darren Aronofsky (vid=10968)
    mid=22608 darren aronofsky (297 matches)

  2 mentions of Q161678:Harry Potter and the Deathly Hallows – Part 1 (vid=3324)
    mid=25008 part 1 (24 matches)
    mid=22609 harry potter and the deathly hallows – part 1 (79 matches)

  1 mentions of Q80321:Halldór Laxnes

In [10]:
# some examples for the head and tail tasks
# also doing a reverse-lookup from mention to vertex

from itertools import chain

# mention id -> vertex id, collected over the closed-world and both
# open-world splits; a later split overwrites an earlier one on clashes
mid2vid = {}
vid_mids_pairs = chain(
    data.closed_mentions.items(),
    data.open_mentions_val.items(),
    data.open_mentions_test.items(),
)
for vid, mids in vid_mids_pairs:
    for mid in mids:
        mid2vid[mid] = vid


# head task: the mention is given as head, the tail vertices are asked for
print('\nHEAD TASK ' + '-' * 20)
head_tasks = islice(data.open_task_val_heads.items(), 10)
for (mid, rid), vids in head_tasks:
    print(f'\n"{data.mentions[mid]}" ({data.vertices[mid2vid[mid]]}) {data.relations[rid]} ?')
    for vid in vids:
        print(f'  answer: {data.vertices[vid]}')

# tail task: the mention is given as tail, the head vertices are asked for
print('\nTAIL TASK ' + '-' * 20)
tail_tasks = islice(data.open_task_val_tails.items(), 10)
for (mid, rid), vids in tail_tasks:
    print(f'\n? {data.relations[rid]} "{data.mentions[mid]}" ({data.vertices[mid2vid[mid]]})')
    for vid in vids:
        print(f'  answer: {data.vertices[vid]}')


HEAD TASK --------------------

"poisson" (Q190772:Siméon Denis Poisson) P69:educated at ?
  answer: Q273626:École Polytechnique

"carlos ibáñez e ibáñez de ibero" (Q438968:Carlos Ibáñez de Ibero) P463:member of ?
  answer: Q427318:Spanish Royal Academy of Sciences
  answer: Q329464:Royal Prussian Academy of Sciences
  answer: Q337580:Royal Academy of Science, Letters and Fine Arts of Belgium
  answer: Q188771:French Academy of Sciences
  answer: Q270794:National Academy of Sciences

"janne carlsson" (Q383821:Janne Carlsson) P106:occupation ?
  answer: Q386854:drummer
  answer: Q947873:television presenter
  answer: Q36834:composer
  answer: Q33999:actor

"kanye" (Q15935:Kanye West) P106:occupation ?
  answer: Q3427922:restaurateur
  answer: Q33999:actor
  answer: Q183945:record producer
  answer: Q43845:businessperson
  answer: Q2526255:film director
  answer: Q753110:songwriter

"k west" (Q15935:Kanye West) P106:occupation ?
  answer: Q3427922:restaurateur
  answer: Q33999:actor
  a

In [11]:
# print some example texts

# group the first 1000 closed-world contexts by their mention id
texts = {}
with data.closed_contexts() as contexts:
    for ctx in islice(contexts, 1000):
        texts.setdefault(ctx.mid, set()).add(ctx)


# show the collected texts of the first three mentions
for mid, mention_contexts in islice(texts.items(), 3):
    mention_norm = data.mentions[mid]
    vertex = data.vertices[mid2vid[mid]]

    print(f'\ntext for {mention_norm} ({mid=}) ({vertex=})')
    for context in mention_contexts:
        # wrap to 80 columns, then indent every line by two spaces
        wrapped = textwrap.fill(str(context.data), 80)
        indented = textwrap.indent(wrapped, ' ' * 2)
        print('\n' + indented)


text for united states (mid=1308) (vertex='Q30:United States of America')

  The Battle of Guadalcanal begins as the United States Marines initiate the first
  American offensive of the war with landings on Guadalcanal and Tulagi in the
  Solomon Islands.

  Documented by Europeans first in 1701, the Alabama, Coosa, and Tallapoosa rivers
  were central to the homeland of the Creek Indians before their removal by United
  States forces to the Indian Territory in the 1830s.

  On July 22, AOL ended its youth corps, which consisted of 350 underage community
  leaders At this time, the United States Department of Labor began an
  investigation into the program, but it came to no conclusions about AOL's
  practices  AOL ended its community leader program on June 8, 2005.

  In December 2006, AOL closed their last remaining call center in the United
  States, "taking the America out of America Online" according to industry
  pundits.

  AOL was one of the early pioneers of the Internet in t