In [2]:
from pprint import pprint

In [3]:
import requests
from bs4 import BeautifulSoup as BS

In [4]:
# list of urls
raw_d = {}

# Alibaba
# acquisitions: Amblin Partners, Ant Financial, South China Morning Post

# replace urls list with your own list of urls 
urls = ['https://en.wikipedia.org/wiki/Alibaba_Group', 'https://en.wikipedia.org/wiki/Amblin_Partners',
       'https://en.wikipedia.org/wiki/Ant_Financial', 'https://en.wikipedia.org/wiki/South_China_Morning_Post']

urls = ['https://en.wikipedia.org/wiki/Alibaba_Group']

# layout of wiki page is standardized
# text content is all in p tags
for u in urls:
    
    # dictionary to to hold text information by url
    # in case we try and crawl all on a central notebook
    raw_d[u] = []
    wiki_pg = requests.get(u)
    soup = BS(wiki_pg.content, 'lxml')
    for content in soup.select("p"):
        # adding to list
        raw_d[u].append(content.text)

In [5]:
import spacy
from spacy import displacy
from spacy.symbols import nsubj, VERB

import textacy
import neuralcoref

# will need to download eng trained model via cli
# python -m spacy download en_core_web_lg

# there are 3 types of basic trained models (sm, md, lg)
# using medium as the neural coref came out funky with large
nlp = spacy.load("en_core_web_md")

neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0xa2a77ab00>

In [6]:
resolved_d = {}

for u in urls:
    resolved_p = []
    
    full_text = raw_d[u]
    
    for paragraph in full_text:
        
        p = paragraph.strip()
        doc = nlp(p)
        
        # apply neural coref to resolve word references
        cluster = doc._.coref_clusters
        resolved_p.append(doc._.coref_resolved)
        
    resolved_d[u] = resolved_p

In [7]:
# dictionary that will contain relationships from all methods
# key = url 
# value = list of json of relationship described in each method

total_d = {}
for u in resolved_d.keys():
    total_d[u] = {}

In [11]:
# RUN THIS CELL TO LEMMA THE SENTENCES
# SOME SENTENCES ARE BETTER LEMMA-ED OTHERS ARE NOT

LEMMA = True

if LEMMA:
    for u in resolved_d.keys():

        corpus = resolved_d[u]
        lemma_corpus = []

        for p in corpus:
            doc = nlp(p)

            lemma_doc = []
            for token in doc:
                # keeping is
                if token.pos == VERB:
                    if token.lemma_ == 'be':
                        lemma_doc.append('is')
                    else:
                        lemma_doc.append(token.lemma_)
                else:
                    lemma_doc.append(token.text)

            lemma_doc = ' '.join(lemma_doc).strip()
            lemma_corpus.append(lemma_doc)

        resolved_d[u] = lemma_corpus

In [12]:
key = 'https://en.wikipedia.org/wiki/Alibaba_Group'

print (raw_d[key][1])
print (resolved_d[key][1])

Alibaba Group Holding Limited, (also known as Alibaba Group and as Alibaba), is a Chinese multinational conglomerate holding company specializing in e-commerce, retail, Internet, and technology. Founded on 4 April 1999 in Hangzhou, Zhejiang, the company provides consumer-to-consumer (C2C), business-to-consumer (B2C), and business-to-business (B2B) sales services via web portals, as well as electronic payment services, shopping search engines and cloud computing services. It owns and operates a diverse array of businesses around the world in numerous sectors, and is named as one of the world's most admired companies by Fortune.[3][4]

Alibaba Group Holding Limited , ( also know as Alibaba Group and as Alibaba ) , is a Chinese multinational conglomerate hold company specialize in e - commerce , retail , Internet , and technology . found on 4 April 1999 in Hangzhou , Zhejiang , Alibaba provide consumer - to - consumer ( C2C ) , business - to - consumer ( B2C ) , and business - to - busine

In [14]:
# Method 1

### Trys to extract nouns and verbs with preposition phrases based on the token dependency tree from pos tagging ###
# Searches dependency tree and if a verb is found, will then traverse the tree left and right
# to determine the relationship in the sentence
# will also try and find the preopsitional phrase in the sentence if any and try to add that to the relationship as well

# before = node 1 
# after = node 2
# verb = edge from node 1 -> node 2
# prep = description of edge

def get_preps(doc):
    preps = []
    for token in doc:
        if token.pos_ == 'ADP':
            pp = ' '.join([tok.orth_ for tok in token.subtree])
            preps.append(pp)
    return preps

def get_noun_verb(doc):
    
    spans = list(doc.ents) + list(doc.noun_chunks)
    for span in spans:
        span.merge()
    
    relations = []
    for token in doc:
        if token.pos_ == 'VERB':
            
            subject = [w.text for w in token.head.lefts if w.dep_ == "nsubj"]
            after = [w.text for w in token.head.rights]
            
            if len(subject) == 0:
                subject = [token.head.text]
                after = [w.text for w in token.rights]
            
            relation_dict = {
                'before': [],
                'after': [],
                'verb': token.text,
                'prep': []
            }
            
            relation_dict['before'].extend(subject)
            relation_dict['after'].extend(after)
            
            relations.append(relation_dict)
            
    return relations

def combine_noun_verb_preps(noun_verbs, preps):
    holder = []
    
    for nv in noun_verbs:
        
        # will apply before preposition and after preposition
        holder.append(nv)
        nv['prep'].extend(preps)
        holder.append(nv)
    
    return holder

KEY = 'Method1'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # creating a separate key for Method1
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            noun_verbs = get_noun_verb(doc)
            preps = get_preps(doc)
            combined_nvp = combine_noun_verb_preps(noun_verbs, preps)
            print (combined_nvp)
            break
            total_d[u][KEY].append(combined_nvp)

[]
[{'verb': 'know', 'prep': ['as Alibaba Group and as Alibaba', 'as Alibaba', 'in e - commerce , retail , Internet , and technology'], 'before': ['is'], 'after': ['as', ')']}, {'verb': 'know', 'prep': ['as Alibaba Group and as Alibaba', 'as Alibaba', 'in e - commerce , retail , Internet , and technology'], 'before': ['is'], 'after': ['as', ')']}, {'verb': 'is', 'prep': ['as Alibaba Group and as Alibaba', 'as Alibaba', 'in e - commerce , retail , Internet , and technology'], 'before': ['is'], 'after': ['hold']}, {'verb': 'is', 'prep': ['as Alibaba Group and as Alibaba', 'as Alibaba', 'in e - commerce , retail , Internet , and technology'], 'before': ['is'], 'after': ['hold']}, {'verb': 'hold', 'prep': ['as Alibaba Group and as Alibaba', 'as Alibaba', 'in e - commerce , retail , Internet , and technology'], 'before': ['is'], 'after': ['specialize']}, {'verb': 'hold', 'prep': ['as Alibaba Group and as Alibaba', 'as Alibaba', 'in e - commerce , retail , Internet , and technology'], 'befor

[{'verb': 'announce', 'prep': ['In January 2017', 'till 2028', 'in where Alibaba would sponsor the Olympic Games'], 'before': ['Alibaba'], 'after': ['a $ 800 million deal']}, {'verb': 'announce', 'prep': ['In January 2017', 'till 2028', 'in where Alibaba would sponsor the Olympic Games'], 'before': ['Alibaba'], 'after': ['a $ 800 million deal']}, {'verb': 'would', 'prep': ['In January 2017', 'till 2028', 'in where Alibaba would sponsor the Olympic Games'], 'before': ['that'], 'after': ['till', 'in']}, {'verb': 'would', 'prep': ['In January 2017', 'till 2028', 'in where Alibaba would sponsor the Olympic Games'], 'before': ['that'], 'after': ['till', 'in']}, {'verb': 'last', 'prep': ['In January 2017', 'till 2028', 'in where Alibaba would sponsor the Olympic Games'], 'before': ['a $ 800 million deal'], 'after': ['till', 'in']}, {'verb': 'last', 'prep': ['In January 2017', 'till 2028', 'in where Alibaba would sponsor the Olympic Games'], 'before': ['a $ 800 million deal'], 'after': ['till

[{'verb': 'start', 'prep': ['In 2017', 'of supermarkets'], 'before': ['open'], 'after': []}, {'verb': 'start', 'prep': ['In 2017', 'of supermarkets'], 'before': ['open'], 'after': []}, {'verb': 'open', 'prep': ['In 2017', 'of supermarkets'], 'before': ['open'], 'after': ['a chain']}, {'verb': 'open', 'prep': ['In 2017', 'of supermarkets'], 'before': ['open'], 'after': ['a chain']}]


KeyboardInterrupt: 

In [68]:
# Method 2

### Similar to method 1 ###
# Extracts noun chunks and verbs and creates span of nouns and verbs 
# spans are nouns before a verb, verb, and then nouns after
# For example sentence may be: nouns1 + verb1 + nouns2 + verb2 + nouns3 
# the spans will be:
# - nouns1, verb1, nouns2
# - nouns2, verb2, nouns3

# theyre all placed in json and is in a similar style to method 1
# before = node 1 
# after = node 2
# verb = edge from node 1 -> node 2

def get_nouns(doc):
    nouns = []
    noun = textacy.extract.noun_chunks(doc)

    for n in noun:

        end = n.end
        n = str(n)
        temp_word = []
        for ch in n:
            if ch.isalpha() or ch.isnumeric() or ch is ' ':
                temp_word.append(ch)

        nouns.append((''.join(temp_word), end))
        
    return nouns

def get_verbs(doc):
    main_verbs = textacy.spacier.utils.get_main_verbs_of_sent(doc)
    verbs = []
    for verb in main_verbs:

        verb_index = verb.i
        verbs.append((verb.text, verb_index))

    tbd = []
    for i in range(len(verbs)):
        v = verbs[i]

        for n in nouns:
            if v[0] in n[0]:
                tbd.append(v)

    cleaned_verbs = []
    for v in verbs:
        if v in tbd:
            continue
        else:
            cleaned_verbs.append(v)

    return cleaned_verbs

def span_relation(nouns, verbs):
    relations = []

    # ensure we can collect appropriate spans by setting bounds
    before = 0
    end = float('inf')

    for i in range(len(verbs)):

        v = verbs[i]

        # helper variables to handle span creation
        if i == 0:
            before = float('-inf')
        else:
            before = i - 1

        if i == len(verbs) - 1:
            after = float('inf')
        else:
            after = i + 1

        relation_dict = {
            'before': [],
            'verb': v[0],
            'after': []
        }

        # creating spans of noun chunks and verbs
        for n in nouns:

            if before == float('-inf'):
                if n[1] <= verbs[0][1]:
                    relation_dict['before'].append(n[0])
            else:
                if n[1] < v[1] and n[1] > verbs[before][1]:
                    relation_dict['before'].append(n[0])
                elif n[1] == v[1]:
                    relation_dict['before'].append(n[0])

        for n in nouns:

            if after == float('inf'):
                if n[1] > verbs[-1][1]:
                    relation_dict['after'].append(n[0])
            else:
                if n[1] > v[1] and n[1] <= verbs[after][1]:
                    relation_dict['after'].append(n[0])

        relations.append(relation_dict)
        
    return relations

KEY = 'Method2'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # creating a separate key for Method2
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            nouns = get_nouns(doc)
            verbs = get_verbs(doc)
            sr = span_relation(nouns, verbs)
            
            total_d[u][KEY].append(sr)

In [69]:
# Attribute Method

# speciify cues (currently set as 'be' or 'is')
# and will grab those relationships

KEY = 'Attributes'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # using the raw form
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            for ent in doc.ents:
                # cue - currently set as default ('be')
                # can be modified for different verbs
                relationship = textacy.extract.semistructured_statements(doc, ent.text)
                for r in relationship:
                    
                    l = []
                    for words in r:
                        l.append(str(words))
                        
                    total_d[u][KEY].append(l)

In [70]:
# Money

# taken from spacy website
# added the rest of the spans to the beginning of the relation as well
def filter_spans(spans):
    # Filter a sequence of spans so they don't contain overlaps
    # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
        seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result

def extract_currency_relations(doc):
    # Merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    
    str_noun_chunks = []
    for n in list(doc.noun_chunks):
        str_noun_chunks.append(str(n))
        
    relations = []
    for money in filter(lambda w: w.ent_type_ == "MONEY", doc):
        if money.dep_ in ("attr", "dobj"):
            subject = [w.text for w in money.head.lefts if w.dep_ == "nsubj"]
            if subject:
                # subject = subject[0]
                relations.append((subject, money.text, list(str_noun_chunks)))
        elif money.dep_ == "pobj" and money.head.dep_ == "prep":
            relations.append((money.head.head.text, money.text))
    return relations


KEY = 'Money'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # using the raw form
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            money_data = extract_currency_relations(doc)
            if len(money_data) > 0:
                total_d[u][KEY].append(money_data)

In [71]:
# quick output of everything
import json


for k in total_d.keys():
    
    filename = '{0}.json'.format(k)
    fil = 'raw_' + filename.split('/')[-1]
    
    with open(fil, 'w') as outfile:
        json.dump(total_d[k], outfile, sort_keys=True, indent=4)
    