In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:

# Sample text data
corpus = [
    "John is the CEO of XYZ Corp.",
    "Jane is an engineer at ABC Corp.",
    "XYZ Corp. is a tech company.",
    "ABC Corp. develops software products.",
]

In [3]:
# Define seed pairs (Person, Role)
seed_pairs = [("John", "CEO"), ("Jane", "engineer")]

In [4]:
# Initialize NLTK components
nltk.download("punkt")
nltk.download("stopwords")
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Define a function to extract relationships from text
def extract_relationships(text):
    relationships = []
    sentences = sent_tokenize(text)

    for sentence in sentences:
        for pair in seed_pairs:
            person, role = pair
            pattern = re.escape(person) + r".*?" + re.escape(role)
            matches = re.finditer(pattern, sentence)

            for match in matches:
                relationships.append((person, role))

    return relationships

In [6]:
# Define a function to update seed pairs with new relationships
def update_seed_pairs(relationships):
    new_seed_pairs = []

    for relationship in relationships:
        person, role = relationship
        person = stemmer.stem(person.lower())
        role = stemmer.stem(role.lower())

        if person not in stop_words and role not in stop_words:
            new_seed_pairs.append((person, role))

    return new_seed_pairs

In [7]:
# Bootstrapping process
for i in range(5):  # Perform 5 iterations
    new_relationships = []
    for document in corpus:
        new_relationships.extend(extract_relationships(document))

    new_seed_pairs = update_seed_pairs(new_relationships)
    print(f"Iteration {i + 1}: Seed pairs: {new_seed_pairs}")

    # Break if no new relationships found
    if not new_seed_pairs:
        break

print("Final extracted relationships:")
print(seed_pairs)


Iteration 1: Seed pairs: [('john', 'ceo'), ('jane', 'engin')]
Iteration 2: Seed pairs: [('john', 'ceo'), ('jane', 'engin')]
Iteration 3: Seed pairs: [('john', 'ceo'), ('jane', 'engin')]
Iteration 4: Seed pairs: [('john', 'ceo'), ('jane', 'engin')]
Iteration 5: Seed pairs: [('john', 'ceo'), ('jane', 'engin')]
Final extracted relationships:
[('John', 'CEO'), ('Jane', 'engineer')]
