# IRT2 - Inductive Reasoning with Text

This notebook shows how to load the IRT2 dataset.
It then takes a closer look at some of its properties to offer insight into the data model.

In [1]:
# enable IPython's autoreload extension; mode 2 re-imports modules
# before code is executed, so edits to library code are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import irt2
from irt2.dataset import IRT2
from irt2.dataset import MID

import textwrap
from itertools import islice
from tabulate import tabulate

from collections import Counter
from collections import defaultdict

from typing import Generator

# Dataset folder convention: data/irt2/<graph>/<size>
#   graph = cde|fb
#   size  = tiny|small|medium|large (abbreviated: T|S|M|L)
# e.g. 'data/irt2/cde/small'
#
# NOTE(review): the folder loaded below is named 'irt2-cde-large', which
# does not follow the <graph>/<size> layout described above - confirm
# which of the two layouts is current.
dataset_dir = irt2.ENV.DIR.DATA / 'irt2' / 'irt2-cde-large'
data = IRT2.from_dir(path=dataset_dir)

# print() applies str() itself: this shows the one-line dataset summary
print(data)

IRT2/CDE-L: 15020 vertices | 45 relations | 32666 mentions


In [3]:
# computing the description iterates all text contexts, so the first
# call might take a while...
# repeated calls are cheap: the return value is cached
print(data.description)


IRT2/CDE-L
created: Wed May 11 10:29:38 2022

  vertices: 15020
  relations: 45
  mentions: 32666

  closed-world
    triples: 102289
    vertices: 9952
    mentions: 22866 (~2.298 per vertex)
    contexts: 18654485

  open-world (validation)
    mentions: 2940 (~1.112 per vertex)
    contexts: 383351
    tasks:
      heads: 36874
      tails: 4221

  open-world (test)
    mentions: 6860 (~1.223 per vertex)
    contexts: 864598
    task:
      heads: 86396
      tails: 9842




In [4]:
# The configuration file that was used to create the dataset holds
# further information. The individual options are explained in the
# original files in /conf.
import yaml

# serialise the configuration to YAML text for a readable dump
config_yaml = yaml.dump(data.config)
print(config_yaml)

create:
  concept relations:
  - P1412:languages spoken, written, or signed
  - P1303:instrument
  - P140:religion
  - P27:country of citizenship
  - P30:continent
  - P509:cause of death
  - P172:ethnic group
  - P2348:time period
  - P102:member of political party
  - P106:occupation
  - P495:country of origin
  - P136:genre
  - P641:sport
  - P19:place of birth
  - P69:educated at
  - P463:member of
  - P264:record label
  - P20:place of death
  - P1050:medical condition
  - P101:field of work
  - P2283:uses
  - P135:movement
  - P119:place of burial
  - P108:employer
  - P37:official language
  - P840:narrative location
  - P17:country
  exclude relations:
  - P1056:product or material produced
  - P3095:practiced by
  - P54:member of sports team
  - P113:airline hub
  - P780:symptoms
  - P50:author
  graph loader: codex
  graph loader args:
  - lib/codex/data/triples/codex-m/train.txt
  - lib/codex/data/triples/codex-m/valid.txt
  - lib/codex/data/triples/codex-m/test.txt
  graph 

In [5]:
# show example vertices and relations

# (title line, column header, id -> name mapping) for each listing
sections = (
    ('\nvertices:', '    vid name', data.vertices),
    ('\nrelations:', '    rid name', data.relations),
)

for title, header, id2name in sections:
    print(title)
    print(header)
    # only the first ten entries of each mapping are shown
    for ident, name in islice(id2name.items(), 10):
        print(f'{ident:7d} {name}')


vertices:
    vid name
      0 Q108946:A Few Good Men
      1 Q39792:Jack Nicholson
      2 Q1041:Senegal
      3 Q7809:UNESCO
      4 Q314924:Anthony Edwards
      5 Q33999:actor
      6 Q855091:guitarist
      7 Q268970:Republic of German-Austria
      8 Q182973:University of Iowa
      9 Q188093:Langston Hughes

relations:
    rid name
      0 P161:cast member
      1 P463:member of
      2 P106:occupation
      3 P27:country of citizenship
      4 P69:educated at
      5 P172:ethnic group
      6 P840:narrative location
      7 P530:diplomatic relation
      8 P509:cause of death
      9 P17:country


In [6]:
# show example closed-world triples

# each triple is unpacked as (head, tail, relation); the table columns
# are arranged in head - relation - tail order, ids next to their names
triple_rows = [
    (h, data.vertices[h], r, data.relations[r], t, data.vertices[t])
    for h, t, r in islice(data.closed_triples, 20)
]
triple_headers = ('VID', 'head', 'RID', 'relation', 'VID', 'tail')

print(tabulate(triple_rows, headers=triple_headers))

  VID  head                                        RID  relation                          VID  tail
-----  ----------------------------------------  -----  ------------------------------  -----  ------------------------------------
 4718  Q444486:Beatrice Hastings                    31  P451:unmarried partner           8194  Q333615:Raymond Radiguet
 7773  Q367073:Donald Crisp                         20  P102:member of political party   3743  Q5020915:California Republican Party
 8362  Q1379164:Taqi al-Din Muhammad ibn Ma'ruf      2  P106:occupation                   191  Q105186:pharmacist
 3494  Q55:Netherlands                               9  P17:country                        55  Q29999:Kingdom of the Netherlands
 2568  Q179126:John Ruskin                           2  P106:occupation                   282  Q1028181:painter
 5243  Q105756:John Updike                           2  P106:occupation                   962  Q214917:playwright
    4  Q314924:Anthony Edwards                 

In [7]:
# this showcases how to access text contexts

def count_contexts(contexts: Generator, n: int = None):
    """Count text contexts per mention and per origin.

    Reads at most ``n`` items from ``contexts`` (all of them if ``n`` is
    None), prints a two-line summary and returns the raw counts.

    Every context is checked against the module-level ``data``: its mid
    must be a known mention and its mention must occur in its text.

    Returns:
        dict with ``total`` (number of contexts read), ``mids`` (Counter
        keyed by mention id) and ``origins`` (Counter keyed by origin).

    Raises:
        AssertionError: if a context fails one of the two checks.
    """
    per_mid, per_origin = Counter(), Counter()

    # stays 0 when the generator yields nothing
    total = 0

    for total, ctx in enumerate(islice(contexts, n), 1):

        assert ctx.mid in data.mentions
        assert ctx.mention in ctx.data

        per_mid[ctx.mid] += 1
        per_origin[ctx.origin] += 1

    print(f'  read {total} relevant contexts')
    print(f'  for {len(per_mid)} mentions from {len(per_origin)} origins')

    return dict(total=total, mids=per_mid, origins=per_origin)

# Contexts are obtained through a context manager which takes care of
# opening/closing files appropriately. The managed object is
# a generator yielding irt2.dataset.Context objects.

n = 10_000


def _count_split(message, open_contexts):
    """Print ``message`` and count the first ``n`` contexts of one split."""
    with open_contexts() as contexts:
        print(message)
        return count_contexts(contexts, n=n)


ctx_counts_closed = _count_split(
    '\ncounting closed-world (training) contexts',
    data.closed_contexts,
)

ctx_counts_open_val = _count_split(
    '\ncounting open-world (validation) contexts',
    data.open_contexts_val,
)

ctx_counts_open_test = _count_split(
    '\ncounting open-world (test) contexts',
    data.open_contexts_test,
)


counting closed-world (training) contexts


  read 10000 relevant contexts
  for 1452 mentions from 519 origins

counting open-world (validation) contexts


  read 10000 relevant contexts
  for 1112 mentions from 2776 origins

counting open-world (test) contexts
  read 10000 relevant contexts
  for 1862 mentions from 2123 origins


In [8]:
# show some mentions

# (title, vid -> mids mapping, context counts gathered for that split)
splits = (
    ('\nclosed-world (training) ', data.closed_mentions, ctx_counts_closed),
    ('\nopen-world (validation) ', data.open_mentions_val, ctx_counts_open_val),
    ('\nopen-world (test) ', data.open_mentions_test, ctx_counts_open_test),
)

for title, vid2mids, ctx_counts in splits:
    print(title + '-' * 20)

    # look at five vertices only (entries 30 to 34 of the mapping)
    for vid, mids in islice(vid2mids.items(), 30, 35):
        print(f'\n  {len(mids)} mentions of {data.vertices[vid]} ({vid=})')
        for mid in mids:
            mention = data.mentions[mid]
            # matches = contexts counted for this mention id in that split
            print(f'    {mid=} {mention} ({ctx_counts["mids"][mid]} matches)')



closed-world (training) --------------------

  1 mentions of Q180962:Norman Mailer (vid=3097)
    mid=30 norman mailer (0 matches)

  36 mentions of Q9592:Catholic Church (vid=217)
    mid=19073 faith (0 matches)
    mid=10381 catholics (1 matches)
    mid=14606 roman catholic church (1 matches)
    mid=5648 religious (0 matches)
    mid=20627 catholicism (1 matches)
    mid=4373 christians (0 matches)
    mid=21021 catholic (89 matches)
    mid=11550 universal church (0 matches)
    mid=31 catholic faith (0 matches)
    mid=4639 catholic church's (0 matches)
    mid=17313 church (0 matches)
    mid=1828 roman church (0 matches)
    mid=8996 the catholic church (0 matches)
    mid=22438 rome (0 matches)
    mid=21934 knanaya catholic (0 matches)
    mid=7090 ecclesiastical (0 matches)
    mid=10173 roman catholic bishop (0 matches)
    mid=17726 roman catholic (3 matches)
    mid=15425 catholic churches (0 matches)
    mid=1476 roman catholicism (0 matches)
    mid=16708 western (0 m

## Open-World Knowledge Graph Completion

In [9]:
# some examples for the head and tail tasks
# also doing a reverse-lookup for head vertices

from itertools import chain

# reverse index over all splits: mention id -> vertex id
mid2vid = {
    mid: vid
    for vid, mids in chain(
            data.closed_mentions.items(),
            data.open_mentions_val.items(),
            data.open_mentions_test.items(),
    )
    for mid in mids
}

# maximum number of answers printed per task; the "(+k more)" lines
# are derived from the same value, so both loops must slice with it
N = 5

print('\nHEAD TASK ' + '-' * 20)
for (mid, rid), vids in islice(data.open_kgc_val_heads.items(), 10):
    print(f'\n"{data.mentions[mid]}" ({data.vertices[mid2vid[mid]]}) {data.relations[rid]} ?')
    for vid in islice(vids, N):
        print(f'  answer: {data.vertices[vid]}')

    if len(vids) > N:
        print(f'  (+{len(vids) - N} more)')

print('\nTAIL TASK ' + '-' * 20)
for (mid, rid), vids in islice(data.open_kgc_val_tails.items(), 10):
    print(f'\n? {data.relations[rid]} "{data.mentions[mid]}" ({data.vertices[mid2vid[mid]]})')
    # previously sliced with a literal 5, which went out of sync with
    # the "(+k more)" line as soon as N was changed
    for vid in islice(vids, N):
        print(f'  answer: {data.vertices[vid]}')

    if len(vids) > N:
        print(f'  (+{len(vids) - N} more)')



HEAD TASK --------------------

"cardinal richelieu" (Q26702:Cardinal Richelieu) P27:country of citizenship ?
  answer: Q142:France

"richelieu" (Q26702:Cardinal Richelieu) P27:country of citizenship ?
  answer: Q142:France

"carlos ibáñez e ibáñez de ibero" (Q438968:Carlos Ibáñez de Ibero) P463:member of ?
  answer: Q188771:French Academy of Sciences
  answer: Q337580:Royal Academy of Science, Letters and Fine Arts of Belgium
  answer: Q427318:Spanish Royal Academy of Sciences
  answer: Q329464:Royal Prussian Academy of Sciences
  answer: Q270794:National Academy of Sciences

"heart and souls" (Q238296:Heart and Souls) P136:genre ?
  answer: Q157443:comedy film
  answer: Q157394:fantasy film
  answer: Q859369:comedy-drama

"lisa kudrow" (Q179041:Lisa Kudrow) P106:occupation ?
  answer: Q948329:character actor
  answer: Q28389:screenwriter
  answer: Q33999:actor
  answer: Q3282637:film producer
  answer: Q36180:writer

"mcluhan" (Q193871:Marshall McLuhan) P108:employer ?
  answer: Q13

In [10]:
# print some example texts

# group the first 1000 closed-world contexts by their mention id
texts = {}
with data.closed_contexts() as contexts:
    for ctx in islice(contexts, 1000):
        texts.setdefault(ctx.mid, set()).add(ctx)


for mid, mention_contexts in islice(texts.items(), 3):
    mention_norm = data.mentions[mid]
    vertex = data.vertices[mid2vid[mid]]

    print(f'\ntext for {mention_norm} ({mid=}) ({vertex=})')
    for ctx in mention_contexts:
        # wrap at 80 columns, then indent the paragraph by two spaces
        paragraph = textwrap.fill(str(ctx.data), 80)
        print('\n' + textwrap.indent(paragraph, ' ' * 2))


text for united states (mid=9805) (vertex='Q30:United States of America')

  Under the pact, as many as 1,200 Microsoft employees involved with the business
  will be transferred to AOL, and the company will take over the sale of display,
  video, and mobile ads on various Microsoft platforms in nine countries,
  including Brazil, Canada, the United States, and the United Kingdom.

  Its length as measured by the United States Geological Survey is ,U.S.
  Geological Survey.

  In December 2006, AOL closed their last remaining call center in the United
  States, "taking the America out of America Online" according to industry
  pundits.

  Documented by Europeans first in 1701, the Alabama, Coosa, and Tallapoosa rivers
  were central to the homeland of the Creek Indians before their removal by United
  States forces to the Indian Territory in the 1830s.

  The Battle of Guadalcanal begins as the United States Marines initiate the first
  American offensive of the war with landings on G

In [11]:
# A Graph instance can be obtained from the dataset to look at the
# training data a bit more closely.

from irt2.graph import Relation


print(data.graph.description)

# one Relation object per relation type found in the graph
relations = Relation.from_graph(data.graph)
n_relations = len(relations)
print(f'got {n_relations} relations')

from tabulate import tabulate


def relation_table(relations):
    """Format per-relation statistics as a plain-text table.

    For every relation (numbered from 1 in iteration order) the table
    lists its name, rid and ratio together with the number of heads,
    tails and triples. Returns whatever ``tabulate`` produces for it.
    """
    rows = [
        (
            position,
            relation.name,
            relation.rid,
            relation.ratio,
            len(relation.heads),
            len(relation.tails),
            len(relation.triples),
        )
        for position, relation in enumerate(relations, start=1)
    ]

    return tabulate(
        rows,
        headers=('#', 'name', 'rid', 'ratio', '#heads', '#tails', '#triples'),
    )


# show the per-relation statistics of the graph as a plain-text table
print(relation_table(relations))

IRT2 GRAPH: IRT2/CDE-L
  nodes: 9952
  edges: 102289 (45 types)
  degree:
    mean 20.56
    median 11

got 45 relations
  #  name                                                       rid      ratio    #heads    #tails    #triples
---  -------------------------------------------------------  -----  ---------  --------  --------  ----------
  1  P1412:languages spoken, written, or signed                  11  0.0113418      5202        59        6774
  2  P30:continent                                               19  0.019943        351         7         389
  3  P140:religion                                               25  0.0231106      1601        37        1690
  4  P1303:instrument                                            14  0.0260458      1267        33        1993
  5  P106:occupation                                              2  0.0264254      6963       184       27529
  6  P27:country of citizenship                                   3  0.0266667      6900       184    

## Mention Ranking Task

In [12]:
# number of example tasks printed per direction (head / tail)
N = 10

def _gen_task_items(dic):
    """Yield one ``(key, value)`` item of ``dic`` per distinct relation id.

    Keys are two-tuples whose second element is the relation id. Only
    the first item encountered for each relation id is yielded, in the
    iteration order of ``dic``.
    """
    yielded_rids = set()

    for item in dic.items():
        key, _ = item
        _, rid = key

        if rid not in yielded_rids:
            yielded_rids.add(rid)
            yield item


# Show up to N example tasks per direction, at most one per relation
# (see _gen_task_items), each with up to N of its expected mentions.
# The "showing x/y" headers report what is actually printed: tasks can
# have fewer than N mentions and there can be fewer than N relations.

head_tasks = list(islice(_gen_task_items(data.open_ranking_val_heads), N))
print(f'\nshowing {len(head_tasks)}/{len(data.open_ranking_val_heads)} (head) tasks')
for (vid, rid), mids in head_tasks:
    print(f'\nwhat are relevant mentions for ?, {data.relations[rid]}, {data.vertices[vid]}')
    print(f'showing {min(N, len(mids))}/{len(mids)} mentions:')
    # limit by N (was a literal 10 that could drift from the header)
    for mid in islice(mids, N):
        print(f'  {data.mentions[mid]}')

print('-' * 80)

tail_tasks = list(islice(_gen_task_items(data.open_ranking_val_tails), N))
print(f'\nshowing {len(tail_tasks)}/{len(data.open_ranking_val_tails)} (tail) tasks')
for (vid, rid), mids in tail_tasks:
    print(f'\nwhat are relevant mentions for {data.vertices[vid]}, {data.relations[rid]}, ?')
    print(f'showing {min(N, len(mids))}/{len(mids)} mentions:')
    for mid in islice(mids, N):
        print(f'  {data.mentions[mid]}')




showing 10/4831 (head) tasks

what are relevant mentions for ?, P27:country of citizenship, Q142:France
showing 10/203 mentions:
  clémence poésy
  guillaume thomas françois raynal
  éric elmosnino
  jean marais
  victor hugo's
  alain poher
  abelard
  tina aumont
  alain
  stivell

what are relevant mentions for ?, P463:member of, Q188771:French Academy of Sciences
showing 10/41 mentions:
  donald knuth
  milner
  pierre simon laplace
  viktor ambartsumian
  timoshenko
  giovanni cassini
  borda
  jacques friedel
  napoléon bonaparte
  carlos ibáñez e ibáñez de ibero

what are relevant mentions for ?, P136:genre, Q157443:comedy film
showing 10/35 mentions:
  all about eve
  three amigos
  bright young things
  zoolander
  the smurfs
  film
  good bye, lenin!
  little miss sunshine
  feature film
  a knight's tale

what are relevant mentions for ?, P106:occupation, Q948329:character actor
showing 10/44 mentions:
  brendan gleeson
  fredric march
  dennis hopper
  macy
  alan tudyk
  

In [13]:
import statistics

def count_expected_mentions(**kwargs):
    """Print summary statistics of the number of entries per task.

    Each keyword argument maps a label to a dictionary; the length of
    every dictionary value is taken as one count. For each label a line
    ``<label>: max .. min .. avg .. median ..`` is printed. A dictionary
    without entries is reported as ``<label>: no entries``.
    """
    for name, dic in kwargs.items():
        counts = [len(expected) for expected in dic.values()]

        # max/min and statistics.mean/median all fail on empty input
        if not counts:
            print(f'{name}: no entries')
            continue

        print(
            f'{name}:',
            'max', max(counts),
            'min', min(counts),
            'avg', statistics.mean(counts),
            'median', statistics.median(counts),
        )

# summarise the validation tasks of the mention ranking task,
# separately for the head and the tail direction
count_expected_mentions(
    heads=data.open_ranking_val_heads,
    tails=data.open_ranking_val_tails,
)

heads: max 1136 min 1 avg 7.632788242599876 median 2
tails: max 34 min 1 avg 2.065068493150685 median 1.0
