In [1]:
import pandas as pd
import spacy
from pathlib import Path

In [15]:
MAIN_PATH = Path(r"C:\Users\lhi30\Haein\2023\YBIGTA\2023-2\DA\Wiki_People\Share")

In [17]:
test_df = pd.read_csv(MAIN_PATH / "event_extraction/Winston_Churchill.csv")
test_df = test_df[test_df.iloc[:, 3] == "summary"]

In [18]:
test = test_df.iloc[18, 2]
test

'Ideologically an adherent to economic liberalism and imperialism, he was for most of his career a member of the Conservative Party, which he led from 1940 to 1955.'

In [37]:
text = "In 1876, [1] was appointed Viceroy of Ireland, then part of the United Kingdom. [2] became [0]'s private secretary and the family relocated to Dublin. [1]'s brother, Jack, was born there in 1880. Throughout much of the 1880s, [2] and Jennie were effectively estranged, and [3] were mostly cared for by [4]] When [4] died in 1895, [1] wrote that \"[4] had been [1]'s dearest and most intimate friend during the whole of the twenty years [1] had lived\". [1] began boarding at St George's School in Ascot, Berkshire, at age seven but was not academic and [1]\'s behaviour was poor.[11] In 1884, [1] transferred to Brunswick School in Hove, where [1]\'s academic performance improved."
text

'In 1876, [1] was appointed Viceroy of Ireland, then part of the United Kingdom. [2] became [0]\'s private secretary and the family relocated to Dublin. [1]\'s brother, Jack, was born there in 1880. Throughout much of the 1880s, [2] and Jennie were effectively estranged, and [3] were mostly cared for by [4]] When [4] died in 1895, [1] wrote that "[4] had been [1]\'s dearest and most intimate friend during the whole of the twenty years [1] had lived". [1] began boarding at St George\'s School in Ascot, Berkshire, at age seven but was not academic and [1]\'s behaviour was poor.[11] In 1884, [1] transferred to Brunswick School in Hove, where [1]\'s academic performance improved.'

In [None]:
import spacy
from spacy.symbols import ORTH

def select_subject(person_id: int, num_tag_text: str) -> list[str]:
    ############################################################################################################################################
    # select_subject(): A function that selects sentences that have the designated person as the subject (active or passive) of the sentence.  #
    # person_id: the tag number of the person of interest                                                                                      #
    # num_tag_text: text that has replaced the people to number tags                                                                           #
    ############################################################################################################################################
    nlp = spacy.load("en_core_web_trf") #Load spacy model
    
    # Add special tokens for people
    for i in range(200):
        nlp.tokenizer.add_special_case(f"[{i}]", [{ORTH: f"[{i}]"}])

    doc = nlp(num_tag_text) #The text that has all people replaced with numbered tags is now processed through spacy
    assert doc.has_annotation("SENT_START")
    selected_sentences = [] #Store all sentences with the main person's tag as the subject inside this list
    for sent in doc.sents: # Go through all the sentences
        sentence_doc = nlp(sent.text) # Run the sentence through spacy again, so that we can look through token by token.
        sentence_added = False 
        for token in sentence_doc: # Go through all the tokens
            if((token.dep_ == "nsubj" or token.dep_ == "nsubjpass") and # If the token is an active or passive subject
               token.text == f"[{person_id}]" and # If the token is referring to the person we're interested in
               not sentence_added):# If we haven't added this sentence to selected_sentences before
                selected_sentences.append(sent) # Then, we can add the sentence to the list
    return selected_sentences

In [None]:
from spacy.matcher import Matcher
from spacy.util import filter_spans

In [160]:
# Define the pattern
pattern = [
    {"DEP": {"in": ["nsubj", "nsubjpass"]}, "OP": "+"},
    {"POS": {"in": ["VERB", "AUX"]}, "OP": "+"},
    {"OP": "{0,10}"},
    {"DEP": {"in": ["dobj", "pobj"]}, "OP": "*"},
]

# Instantiate a Matcher instance
matcher = Matcher(nlp.vocab)
matcher.add("CustomPattern", [pattern])


# Apply the matcher to the processed document
matches = matcher(doc)

result_list = []
for match_id, start, end in matches:
    # Get the tokens that match the pattern
    match_tokens = doc[start:end]
    
    # Extract relevant information
    subj_verb_obj = [token.text for token in match_tokens if token.dep_ in ["nsubj", "nsubjpass", "VERB", "AUX", "dobj", "pobj"]]
    
    # Create a list with the extracted information
    result_list.append(subj_verb_obj)

# Print the result
print(result_list)

[['he'], ['he'], ['he', 'most'], ['he', 'most'], ['he', 'most'], ['he', 'most', 'career'], ['he', 'most', 'career'], ['he', 'most', 'career'], ['he', 'most', 'career'], ['he', 'most', 'career'], ['he', 'most', 'career'], ['he', 'most', 'career', 'Party'], ['he'], ['he'], ['he', '1940'], ['he', '1940'], ['he', '1940', '1955'], ['he', '1940', '1955']]


In [107]:
from spacy import displacy
displacy.render(doc, style='dep')