In [1]:
from brain2kg.text2kg.eda_pipeline import EDA

[nltk_data] Downloading package punkt to /Users/jamino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from tqdm.autonotebook import tqdm, trange


In [2]:
import random

# Fixed seed: without it every "Restart Kernel -> Run All" feeds the pipeline a
# different line order, so the outputs below could never be reproduced.
SHUFFLE_SEED = 42

# Demo passages, one per line. The trailing backslash after the opening quotes
# suppresses the leading newline so the first passage starts on line one.
raw_text_ordered = """\
The location of Trane is Swords, Dublin.
The Ciudad Ayala city, a part of Morelos with population density and population of 1604.0 and 1,777,539 respectively, has a UTC offset of -6. The government type of Ciudad Ayala is council-manager government and City Manager is one of the leaders.
The 17068.8 millimeter long ALCO RS-3 has a diesel-electric transmission.
Alan B. Miller Hall, in Virginia, USA, was designed by Robert A.M. Stern. The address of the hall is "101 Ukrop Way" and the current tenants are the Mason School of Business.
Liselotte Grschebina was born in Karlsruhe and died in Israel. Ethnic groups in Israel include Arabs.
Agremiação Sportiva Arapiraquense managed by Vica has 17000 members and play in the Campeonato Brasileiro Série C league which is from Brazil.
Bananaman first aired on the 10th of March, 1983 and was created by Steve Bright. It was broadcast by the BBC.
The 11th Mississippi Infantry Monument, built in 2000, is placed in the municipality of Gettysburg in Pennsylvania which is in Adams County, USA. The 11th Mississippi Infantry Monument is classified as a Contributing Property. Cumberland county, Pennsylvania is to the north of Adams County.
"""

# One entry per line of the literal above (a line may hold several sentences).
passage_lines = raw_text_ordered.splitlines()

# Shuffle a copy with a private, seeded generator: the original order stays
# available and the global `random` state is left untouched.
shuffled_lines = list(passage_lines)
random.Random(SHUFFLE_SEED).shuffle(shuffled_lines)

# Final pipeline input: the same passages in shuffled order, newline-separated.
input_raw_text = '\n'.join(shuffled_lines)

# Keyword arguments for the EDA pipeline, grouped by the prefix of each key.
# NOTE(review): the oie_/sd_/sa_ groups presumably map to the "Extracting",
# "Defining" and "Aligning" stages shown in the progress output -- confirm.
# All file paths are relative; they are presumably resolved against the
# working directory the notebook is run from -- verify.
config = dict(
    # OIE
    oie_llm='llama3.1',
    oie_prompt_template_file_path='prompt_templates/oie_fsp_template.txt',
    oie_few_shot_example_file_path='few_shot_examples/oie_few_shot_examples.txt',
    # SD
    sd_llm='llama3.1',
    sd_prompt_template_file_path='prompt_templates/sd_fsp_template.txt',
    sd_few_shot_example_file_path='few_shot_examples/sd_few_shot_examples.txt',
    # SA
    sa_target_schema_file_path='schemas/webnlg_schema.csv',
    sa_llm='llama3.1',
    sa_embedding_model='sentence-transformers/all-MiniLM-L6-v2',
    sa_prompt_template_file_path='prompt_templates/sa_template.txt',
)

In [3]:
# Show the shuffled, newline-joined text that is passed to the pipeline below.
input_raw_text

'The 17068.8 millimeter long ALCO RS-3 has a diesel-electric transmission.\nAlan B. Miller Hall, in Virginia, USA, was designed by Robert A.M. Stern. The address of the hall is "101 Ukrop Way" and the current tenants are the Mason School of Business.\nBananaman first aired on the 10th of March, 1983 and was created by Steve Bright. It was broadcast by the BBC.\nThe Ciudad Ayala city, a part of Morelos with population density and population of 1604.0 and 1,777,539 respectively, has a UTC offset of -6. The government type of Ciudad Ayala is council-manager government and City Manager is one of the leaders.\nThe location of Trane is Swords, Dublin.\nAgremiação Sportiva Arapiraquense managed by Vica has 17000 members and play in the Campeonato Brasileiro Série C league which is from Brazil.\nLiselotte Grschebina was born in Karlsruhe and died in Israel. Ethnic groups in Israel include Arabs.\nThe 11th Mississippi Infantry Monument, built in 2000, is placed in the municipality of Gettysburg

In [4]:
# Build the pipeline object, passing every entry of `config` as a keyword argument.
eda = EDA(**config)

In [5]:
# Run the pipeline on the shuffled text; the progress output reports three
# passes ("Extracting", "Defining", "Aligning"). The result is indexed in the
# following cells as [0] ("Before") and [1] ("After").
# NOTE(review): `output_dir` presumably receives the written results -- confirm
# against EDA.extract_kg.
output_kg_list = eda.extract_kg(input_raw_text, output_dir='examples/outputs')

Extracting: 100%|██████████| 14/14 [00:42<00:00,  3.05s/it]
Defining: 100%|██████████| 14/14 [00:48<00:00,  3.45s/it]
Aligning: 100%|██████████| 14/14 [00:13<00:00,  1.00it/s]


In [6]:
# Before: first element of the extract_kg result, a nested list of
# [subject, relation, object] triples. In the output below the relations still
# read e.g. 'currentTenants', 'broadcaster', 'governmentType' and 'title'.
# NOTE(review): presumably the triples prior to schema alignment -- confirm
# against EDA.extract_kg.
output_kg_list[0]

[[['ALCO_RS-3', 'length', '17068.8 (millimetres)'],
  ['ALCO_RS-3', 'powerType', 'Diesel-electric_transmission']],
 [['Alan_B._Miller_Hall', 'architect', 'Robert_A._M._Stern']],
 [['The_hall', 'address', '"101 Ukrop Way"'],
  ['Mason_School_of_Business', 'currentTenants', 'The_hall']],
 [['Bananaman', 'firstAired', 'March_10,_1983'],
  ['Bananaman', 'creator', 'Steve_Bright']],
 [['It_was_broadcast_by_the_BBC', 'broadcaster', 'BBC']],
 [['Ciudad_Ayala', 'populationDensity', '1604.0'],
  ['Ciudad_Ayala', 'populationMetro', '1777539'],
  ['Ciudad_Ayala', 'utcOffset', '−6'],
  ['Ciudad_Ayala', 'isPartOf', 'Morelos']],
 [['Ciudad_Ayala', 'governmentType', 'Council-manager_government'],
  ['City_Manager', 'title', 'Leader']],
 [['Trane', 'location', 'Swords,_Dublin']],
 [['Agremiação_Sportiva_Arapiraquense', 'numberOfMembers', '17000'],
  ['Agremiação_Sportiva_Arapiraquense',
   'league',
   'Campeonato_Brasileiro_Série_C'],
  ['Campeonato_Brasileiro_Série_C', 'country', 'Brazil'],
  ['Agre

In [7]:
# After: second element of the extract_kg result, same nested triple layout as
# output_kg_list[0]. In the output below some relations differ from that first
# element, e.g. 'tenant', 'author', 'type' and 'occupation'.
# NOTE(review): presumably the triples after alignment to the target schema
# ('sa_target_schema_file_path') -- confirm against EDA.extract_kg.
output_kg_list[1]

[[['ALCO_RS-3', 'length', '17068.8 (millimetres)'],
  ['ALCO_RS-3', 'powerType', 'Diesel-electric_transmission']],
 [['Alan_B._Miller_Hall', 'architect', 'Robert_A._M._Stern']],
 [['The_hall', 'address', '"101 Ukrop Way"'],
  ['Mason_School_of_Business', 'tenant', 'The_hall']],
 [['Bananaman', 'firstAired', 'March_10,_1983'],
  ['Bananaman', 'creator', 'Steve_Bright']],
 [['It_was_broadcast_by_the_BBC', 'author', 'BBC']],
 [['Ciudad_Ayala', 'populationDensity', '1604.0'],
  ['Ciudad_Ayala', 'populationMetro', '1777539'],
  ['Ciudad_Ayala', 'utcOffset', '−6'],
  ['Ciudad_Ayala', 'isPartOf', 'Morelos']],
 [['Ciudad_Ayala', 'type', 'Council-manager_government'],
  ['City_Manager', 'occupation', 'Leader']],
 [['Trane', 'location', 'Swords,_Dublin']],
 [['Agremiação_Sportiva_Arapiraquense', 'numberOfMembers', '17000'],
  ['Agremiação_Sportiva_Arapiraquense',
   'league',
   'Campeonato_Brasileiro_Série_C'],
  ['Campeonato_Brasileiro_Série_C', 'country', 'Brazil'],
  ['Agremiação_Sportiva_Ar