In [14]:
!conda env export > environment_droplet.yml

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# ------------------------------
# Get data 
# load an article from wikipedia 
# use the summary only 

import wikipedia as wp

doc = wp.summary('Singapore')

# Create wikipedia object for Cats 
wiki = wp.page(
    title = 'Singapore',
    auto_suggest = True)

wiki.summary

'Singapore ( (listen)), officially the Republic of Singapore, is an island country and city-state in maritime Southeast Asia. It lies about one degree of latitude (137 kilometres or 85 miles) north of the equator, off the southern tip of the Malay Peninsula, bordering the Strait of Malacca to the west, the Singapore Strait to the south, the South China Sea to the east, and the Straits of Johor to the north. The country\'s territory is composed of one main island, 63 satellite islands and islets, and one outlying islet; the combined area of these has increased by 25% since the country\'s independence as a result of extensive land reclamation projects. It has the third highest population density in the world. With a multicultural population and recognising the need to respect cultural identities of the major ethnic groups within the nation, Singapore has four official languages: English, Malay, Mandarin, and Tamil. English is the lingua franca and numerous public services are available o

In [16]:
# ------------------------------
# Perform NER 
# load spacey 
# run NER
# generate an initial networkx graph a

import spacy

# ~~import neuralcoref # import Neuralcoref to be used in spaCy pipeline (requires pythone 3.7....)~~
# doc: https://github.com/huggingface/neuralcoref


Troubleshooting for coref: https://github.com/explosion/spaCy/discussions/11585 

```
# Commandline

pip install spacy-experimental==0.6.2
pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl

# Python
import spacy
nlp = spacy.load("en_coreference_web_trf")
doc = nlp("The cats were startled by the dog as it growled at them.") 
print(doc.spans)
```

In [17]:
# Coreference Resolution 
from spacy.tokens import Doc
from wasabi import msg

# Load in the coref model
# You can download the model here
# https://github.com/explosion/spacy-experimental/releases/tag/v0.6.0
nlp = spacy.load("en_coreference_web_trf")

# Process example sentence
# Other examples:
# Philip plays the bass because he loves it.
# Sarah enjoys a nice cup of tea in the morning. She likes it with sugar and a drop of milk.
# John said hi. Big old John is always around.
doc = nlp(wiki.summary)

# Print out component names
msg.info("Pipeline components")
for i, pipe in enumerate(nlp.pipe_names):
    print(f"{i}: {pipe}")

# Print out clusters
msg.info("Found clusters")
for cluster in doc.spans:
    print(f"{cluster}: {doc.spans[cluster]}")

# Define lightweight function for resolving references in text
def resolve_references(doc: Doc) -> str:
    """Function for resolving references with the coref ouput
    doc (Doc): The Doc object processed by the coref pipeline
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx : token.text
    token_mention_mapper = {}
    output_string = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    # Iterate through every found cluster
    for cluster in clusters:
        first_mention = cluster[0]
        # Iterate through every other span in the cluster
        for mention_span in list(cluster)[1:]:
            # Set first_mention as value for the first token in mention_span in the token_mention_mapper
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
            
            for token in mention_span[1:]:
                # Set empty string for all the other tokens in mention_span
                token_mention_mapper[token.idx] = ""

    # Iterate through every token in the Doc
    for token in doc:
        # Check if token exists in token_mention_mapper
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        # Else add original token text
        else:
            output_string += token.text + token.whitespace_

    return output_string

# TOPIC = "sega" 
msg.warn("Original document")
print(wiki.summary)
msg.good("Document with resolved references")
doc_resolved = resolve_references(doc)
print(doc_resolved)


[38;5;4mℹ Pipeline components[0m
0: sentencizer
1: transformer
2: coref
3: span_resolver
4: span_cleaner
[38;5;4mℹ Found clusters[0m
coref_clusters_1: [Singapore ( (listen)), officially the Republic of Singapore, It, The country's, the country's, It, the nation, Singapore, Singapore's, Its, Singapore, Singapore, Singapore, Singapore, Singapore's, Singapore, the nation, itself, it, Singapore, It, Singapore, Singapore, its, the country, the country, Singapore, Singapore]
coref_clusters_2: [63 satellite islands and islets, these]
coref_clusters_3: [English, English, English]
coref_clusters_4: [maritime Southeast Asia, Southeast Asia]
coref_clusters_5: [Japan, Japan's]
coref_clusters_6: [Malaysia, Malaysia]
coref_clusters_7: [the new federation of Malaysia, the federation]
coref_clusters_8: [the world, the world, the world]
coref_clusters_9: [the People's Action Party (PAP), The PAP]
[38;5;3m⚠ Original document[0m
Singapore ( (listen)), officially the Republic of Singapore, is an isl

In [18]:
# Run 2nd pass over corrected doc 
# create a language object
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")

# REMOVED PIPELINE COMPONENTS BECAUSE THIS PROCESS IS SEPERATED 
# assert "transformer" not in nlp.pipe_names
# nlp_coref = spacy.load("en_coreference_web_trf")
# nlp.add_pipe("transformer", source=nlp_coref)
# nlp.add_pipe("coref", source=nlp_coref)
# nlp.add_pipe("span_resolver", source=nlp_coref)
# nlp.add_pipe("span_cleaner", source=nlp_coref)

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [19]:

doc = nlp(doc_resolved)

doc = nlp("Bob, a citizen of Singapore, lost 500 million dollars in a scam")
# visualize, wo tokenizing 
spacy.displacy.render(doc, style = "ent")

In [20]:
spacy.displacy.render(doc, style = "dep")

# Relationship Construction

In [33]:
import pandas as pd
import re 


# TODO fix neural coref with python 3.7
# import neuralcoref 
# neuralcoref.add_to_pipe(nlp)  # Add neuralcoref to the spaCy pipeline to resolve coreference clusters


# Function to extract entity pairs from text
# - Ref: https://towardsdatascience.com/auto-generated-knowledge-graphs-92ca99a81121
def get_entity_pairs(text: str):
    # Preprocess text
    # Replace multiple newlines with period
    text = re.sub(r'\n+', '.', text)
    # text = nlp(text._.coref_resolved)
    
    """
    # TODO: fix coref experimental 
    if coref:
        text = nlp(text._.coref_resolved)

    # TODO convert entities to tokens
    """
    doc = nlp(text)

    # Function to refine an entity
    def refine_ent(ent, sent):
        unwanted_tokens = (
            'PRON',  # pronouns
            'PART',  # particle
            'DET',  # determiner
            'SCONJ',  # subordinating conjunction
            'PUNCT',  # punctuation
            'SYM',  # symbol
            'X',  # other
        )
        # Get entity type
        ent_type = ent.ent_type_

        # If entity type is empty, set it to NOUN_CHUNK
        if ent_type == '':
            ent_type = 'NOUN_CHUNK'
            ent = ' '.join(str(t.text) for t in nlp(str(ent)) if t.pos_ not in unwanted_tokens and t.is_stop == False)
        
        # If entity type is NOMINAL, CARDINAL, or ORDINAL and has no spaces, refine it
        elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and str(ent).find(' ') == -1:
            refined = ''
            for i in range(len(sent) - ent.i):
                if ent.nbor(i).pos_ not in ('VERB', 'PUNCT'):
                    refined += ' ' + str(ent.nbor(i))
                else:
                    ent = refined.strip()
                    break
        return ent, ent_type

    """
    Assumption: each sentence has a unique semantic meaning. Alternative assumption is to go by paragraphs.
    - This is not always true, but it is a good starting point.
    """  
    sentences = [sentence.text for sentence in doc.sents]  # convert span object to text
    ent_pairs = [] 

    for sentence in sentences:
        
        sentence = nlp(sentence)
        """
        Identify the subject and object of the sentence and extract them into a list of entity pairs
        - defensive programming:
        - check if the current token is an object node
        - if object, identify the subject nodes of the object and take the first one
        """
        for token in sentence:
            
            if token.dep_ not in ('obj', 'dobj'):  # check if the current token is an object node
                continue

            subject = [w for w in token.head.lefts if w.dep_ in ('subj', 'nsubj')]  # identify the subject nodes of the object
            
            if subject:  # if there is a subject, take the first one
                subject = subject[0]
                relationship = [w for w in token.ancestors if w.dep_ == 'ROOT']  # identify the relationship by root dependency
                if relationship:
                    relationship = relationship[0]  # if there is a relationship, take the first one
                    if relationship.nbor(1).pos_ in ('ADP', 'PART'):  # amend to adposition or particle to relationship if it exists
                        relationship = ' '.join((str(relationship), str(relationship.nbor(1))))
                        # ----------
                        # DEBUGGING 
                        print(relationship)
                        # ----------
                else:
                    relationship = 'unknown'  # if no relationship found, set as unknown
                
                # refine the subject and object entities and add them to the pairs list
                subject, subject_type = refine_ent(subject, sentence)
                token, object_type = refine_ent(token, sentence)

                ent_pairs.append([
                    str(subject), 
                    str(relationship), 
                    str(token),
                    str(subject_type), 
                    str(object_type)
                    ])

    # ent_pairs = [sublist for sublist in ent_pairs if not any(str(ent) == '' for ent in sublist)]  # remove any empty entity pairs
    print('Entity pairs extracted:', str(len(ent_pairs)))  # print the number of entity pairs extracted
    return pd.DataFrame(ent_pairs, columns=['subject', 'relation', 'object','subject_type', 'object_type'])  # create a DataFrame with the extracted entity pairs

# df_wiki = get_entity_pairs("Bob is a citizen of Singapore. Bob lost 500 million dollars in a scam. Bob ran away from home. Bob lost his job.")
df_wiki = get_entity_pairs(wp.summary("America"))
print(df_wiki)

# TODO - do filtering BEFORE hand (refine_ent)
# TODO - add in the Named Entity Linking model (https://neo4j.com/developer-blog/making-sense-of-news-the-knowledge-graph-way/)

culminated in
makes up
Entity pairs extracted: 13
              subject       relation             object subject_type  \
0                             shares            borders   NOUN_CHUNK   
1   the United States         gained       independence          GPE   
2   the United States         gained              state          GPE   
3   the United States        spanned          continent          GPE   
4                      culminated in             states   NOUN_CHUNK   
5                U.S.        entered       World War II          GPE   
6           aftermath           left  the United States   NOUN_CHUNK   
7                             became        involvement   NOUN_CHUNK   
8                                has             levels   NOUN_CHUNK   
9   The United States            has             income          GPE   
10                               has             levels   NOUN_CHUNK   
11               U.S.          holds           over 30%          GPE   
12            

# Graph Visualization 

In [22]:
df_wiki_temp = df_wiki.copy()

# move all "subject" to "object" and set "subject" to "Root"

df_wiki_temp['object'] = df_wiki_temp['subject']
df_wiki_temp['subject'] = "Root"
df_wiki_temp

# concatenate df_wiki_temp and df_wiki
df_wiki = pd.concat([df_wiki, df_wiki_temp], ignore_index=True)
df_wiki

# convert subject and object to lowercase
df_wiki['subject'] = df_wiki['subject'].str.lower()
df_wiki['object'] = df_wiki['object'].str.lower()
df_wiki['relation'] = df_wiki['relation'].str.lower()

df_wiki.describe()


Unnamed: 0,subject,relation,object,subject_type,object_type
count,18,18,18,18,18
unique,8,6,14,4,3
top,root,has,singapore,GPE,NOUN_CHUNK
freq,9,6,4,8,12


In [23]:
from pyvis.network import Network
import networkx as nx

net = Network(
    height="600px",
    width="100%",
    directed=True,
    notebook=True,
    neighborhood_highlight=True,
    select_menu=False,
    filter_menu=False,
    bgcolor="#ffffff",
    font_color=False,
    layout=None,
    heading="",
    cdn_resources="in_line")

def draw_kg_vis(pairs):
    G = nx.from_pandas_edgelist(
        df = pairs, 
        source = 'subject', 
        target = 'object',
        edge_key = 'relation',
        create_using=nx.DiGraph())
    # node_deg = nx.degree(G)
    net.from_nx(G)

    net.show_buttons(filter_=['physics', 'edges', 'nodes'])
    net.show('kg.html')

draw_kg_vis(df_wiki)

# TODO - add labels to edges
# TODO - add hiearchy so the document is at root 
# TODO - cluster commong words together

In [24]:
# PRINT PACKAGE VERSIONS 

print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))


wikipedia==(1, 4, 0)
spacy==3.3.1
pandas==1.5.2
re==2.2.1
networkx==2.8.4


In [25]:
# ------------------------------
# Enrich graph
# search wikipedia / media wikie 
# extract categories 