In [55]:
from pprint import pprint

In [1]:
import requests
from bs4 import BeautifulSoup as BS

In [38]:
# list of urls
raw_d = {}
urls = ['https://en.wikipedia.org/wiki/Alibaba_Group']

# layout of wiki page is standardized
# text content is all in p tags
for u in urls:
    
    # dictionary to to hold text information by url
    # in case we try and crawl all on a central notebook
    raw_d[u] = []
    wiki_pg = requests.get(u)
    soup = BS(wiki_pg.content, 'lxml')
    for content in soup.select("p"):
        # adding to list
        raw_d[u].append(content.text)

In [87]:
import spacy
from spacy import displacy
from spacy.symbols import nsubj, VERB

import textacy
import neuralcoref

# will need to download eng trained model via cli
# python -m spacy download en_core_web_lg

# there are 3 types of basic trained models (sm, md, lg)
# using medium as the neural coref came out funky with large
nlp = spacy.load("en_core_web_md")

neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0xb035d62e8>

In [46]:
resolved_d = {}

for u in urls:
    resolved_p = []
    
    full_text = raw_d[u]
    
    for paragraph in full_text:
        
        p = paragraph.strip()
        doc = nlp(p)
        
        # apply neural coref to resolve word references
        cluster = doc._.coref_clusters
        resolved_p.append(doc._.coref_resolved)
        
    resolved_d[u] = resolved_p

In [57]:
# dictionary that will contain relationships from all methods
# key = url 
# value = list of json of relationship described in each method

total_d = {}
for u in resolved_d.keys():
    total_d[u] = {}

In [61]:
# RUN THIS CELL TO LEMMA THE SENTENCES
# SOME SENTENCES ARE BETTER LEMMA-ED OTHERS ARE NOT

LEMMA = False

if LEMMA:
    for u in resolved_d.keys():

        corpus = resolved_d[u]
        lemma_corpus = []

        for p in corpus:
            doc = nlp(p)

            lemma_doc = []
            for token in doc:
                # keeping is
                if token.pos == VERB:
                    if token.lemma_ == 'be':
                        lemma_doc.append('is')
                    else:
                        lemma_doc.append(token.lemma_)
                else:
                    lemma_doc.append(token.text)

            lemma_doc = ' '.join(lemma_doc).strip()
            lemma_corpus.append(lemma_doc)

        resolved_d[u] = lemma_corpus

In [64]:
# Method 1

### Trys to extract nouns and verbs with preposition phrases based on the token dependency tree from pos tagging ###
# Searches dependency tree and if a verb is found, will then traverse the tree left and right
# to determine the relationship in the sentence
# will also try and find the preopsitional phrase in the sentence if any and try to add that to the relationship as well

# before = node 1 
# after = node 2
# verb = edge from node 1 -> node 2
# prep = description of edge

def get_preps(doc):
    preps = []
    for token in doc:
        if token.pos_ == 'ADP':
            pp = ' '.join([tok.orth_ for tok in token.subtree])
            preps.append(pp)
    return preps

def get_noun_verb(doc):
    
    spans = list(doc.ents) + list(doc.noun_chunks)
    for span in spans:
        span.merge()
    
    relations = []
    for token in doc:
        if token.pos_ == 'VERB':
            
            subject = [w for w in token.head.lefts if w.dep_ == "nsubj"]
            after = [w for w in token.head.rights]
            
            if len(subject) == 0:
                subject = [token.head]
                after = [w for w in token.rights]
            
            relation_dict = {
                'before': [],
                'after': [],
                'verb': token,
                'prep': []
            }
            
            relation_dict['before'].extend(subject)
            relation_dict['after'].extend(after)
            
            relations.append(relation_dict)
            
    return relations

def combine_noun_verb_preps(noun_verbs, preps):
    holder = []
    
    for nv in noun_verbs:
        
        # will apply before preposition and after preposition
        holder.append(nv)
        nv['prep'].extend(preps)
        holder.append(nv)
    
    return holder

KEY = 'Method1'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # creating a separate key for Method1
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            noun_verbs = get_noun_verb(doc)
            preps = get_preps(doc)
            combined_nvp = combine_noun_verb_preps(noun_verbs, preps)
            
            total_d[u][KEY].append(combined_nvp)

In [70]:
# Method 2

### Similar to method 1 ###
# Extracts noun chunks and verbs and creates span of nouns and verbs 
# spans are nouns before a verb, verb, and then nouns after
# For example sentence may be: nouns1 + verb1 + nouns2 + verb2 + nouns3 
# the spans will be:
# - nouns1, verb1, nouns2
# - nouns2, verb2, nouns3

# theyre all placed in json and is in a similar style to method 1
# before = node 1 
# after = node 2
# verb = edge from node 1 -> node 2

def get_nouns(doc):
    nouns = []
    noun = textacy.extract.noun_chunks(doc)

    for n in noun:

        end = n.end
        n = str(n)
        temp_word = []
        for ch in n:
            if ch.isalpha() or ch.isnumeric() or ch is ' ':
                temp_word.append(ch)

        nouns.append((''.join(temp_word), end))
        
    return nouns

def get_verbs(doc):
    main_verbs = textacy.spacier.utils.get_main_verbs_of_sent(doc)
    verbs = []
    for verb in main_verbs:

        verb_index = verb.i
        verbs.append((verb.text, verb_index))

    tbd = []
    for i in range(len(verbs)):
        v = verbs[i]

        for n in nouns:
            if v[0] in n[0]:
                tbd.append(v)

    cleaned_verbs = []
    for v in verbs:
        if v in tbd:
            continue
        else:
            cleaned_verbs.append(v)

    return cleaned_verbs

def span_relation(nouns, verbs):
    relations = []

    # ensure we can collect appropriate spans by setting bounds
    before = 0
    end = float('inf')

    for i in range(len(verbs)):

        v = verbs[i]

        # helper variables to handle span creation
        if i == 0:
            before = float('-inf')
        else:
            before = i - 1

        if i == len(verbs) - 1:
            after = float('inf')
        else:
            after = i + 1

        relation_dict = {
            'before': [],
            'verb': v[0],
            'after': []
        }

        # creating spans of noun chunks and verbs
        for n in nouns:

            if before == float('-inf'):
                if n[1] <= verbs[0][1]:
                    relation_dict['before'].append(n[0])
            else:
                if n[1] < v[1] and n[1] > verbs[before][1]:
                    relation_dict['before'].append(n[0])
                elif n[1] == v[1]:
                    relation_dict['before'].append(n[0])

        for n in nouns:

            if after == float('inf'):
                if n[1] > verbs[-1][1]:
                    relation_dict['after'].append(n[0])
            else:
                if n[1] > v[1] and n[1] <= verbs[after][1]:
                    relation_dict['after'].append(n[0])

        relations.append(relation_dict)
        
    return relations

KEY = 'Method2'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # creating a separate key for Method2
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            nouns = get_nouns(doc)
            verbs = get_verbs(doc)
            sr = span_relation(nouns, verbs)
            
            total_d[u][KEY].append(sr)

In [115]:
# Attribute Method

# speciify cues (currently set as 'be' or 'is')
# and will grab those relationships

KEY = 'Attributes'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # using the raw form
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            for ent in doc.ents:
                # cue - currently set as default ('be')
                # can be modified for different verbs
                relationship = textacy.extract.semistructured_statements(doc, ent.text)
                for r in relationship:
                    total_d[u][KEY].append(r)

In [112]:
# Money

# taken from spacy website
# added the rest of the spans to the beginning of the relation as well
def filter_spans(spans):
    # Filter a sequence of spans so they don't contain overlaps
    # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
        seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result

def extract_currency_relations(doc):
    # Merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    
    relations = []
    for money in filter(lambda w: w.ent_type_ == "MONEY", doc):
        if money.dep_ in ("attr", "dobj"):
            subject = [w for w in money.head.lefts if w.dep_ == "nsubj"]
            if subject:
                # subject = subject[0]
                relations.append((subject, money, list(doc.noun_chunks)))
        elif money.dep_ == "pobj" and money.head.dep_ == "prep":
            relations.append((money.head.head, money))
    return relations


KEY = 'Money'
# EACH URL TAKES ABOUT 1 MINUTE TO RUN
for u in urls:
    
    # using the raw form
    total_d[u][KEY] = []
    article = resolved_d[u]
    
    for p in article:
        # list is broken up into paragraphs
        # will then break up the paragraph into sentences
        sentences = p.split('.')
        for s in sentences:
            doc = nlp(s)
            
            money_data = extract_currency_relations(doc)
            if len(money_data) > 0:
                total_d[u][KEY].append(money_data)

In [120]:
# quick output of everything
from pprint import pprint

for k in total_d.keys():
    
    print ('##############')
    print (k)
    
    for k2 in total_d[k].keys():
        print ('------------')
        print (k2)
        pprint (total_d[k][k2])

##############
https://en.wikipedia.org/wiki/Alibaba_Group
------------
Money
[[([Alibaba 's market value],
   US$ 231 billion,
   [closing time, IPO, history, Alibaba 's market value])],
 [(stand, 352)],
 [(buy, US$ 1 billion)],
 [(a result net, US$ 10 billion)],
 [(raise, 332)],
 [(Alibaba 's IPO price, 68),
  ([Alibaba 's IPO price], 21, [Alibaba 's IPO price])],
 [(an opening price, 92)],
 [(responsible, $ 30   million)],
 [(Taobao 's profit total sales figure, ¥ 400   billion)],
 [(a total transaction volume, 3 trillion yuan)],
 [(a total transaction volume, 3 trillion yuan)],
 [(6 billion, 6 trillion yuan)],
 [([Alibaba],
   100 billion yuan,
   [Alibaba, Alibaba, Alibaba 's commitment, China])],
 [(value, US$ 20 billion)],
 [(a valuation, US$ 150 billion)],
 [(acquire, 804)],
 [([Tencent Music], $ 10bn IPO, [Tencent Music, Alibaba, Alibaba position])],
 [(a consideration, $ 266   million)]]
------------
MD
[]
------------
Attributes
[(Alibaba, is, a good name),
 (Alibaba, is, a 

 [{'after': [profitable, after],
   'before': [com],
   'prep': ['three years after launch'],
   'verb': become},
  {'after': [profitable, after],
   'before': [com],
   'prep': ['three years after launch'],
   'verb': become}],
 [{'after': [improve, ,, launch],
   'before': [ Jack Ma],
   'prep': ['from 2003 onward'],
   'verb': want},
  {'after': [improve, ,, launch],
   'before': [ Jack Ma],
   'prep': ['from 2003 onward'],
   'verb': want},
  {'after': [improve, ,, launch],
   'before': [ Jack Ma],
   'prep': ['from 2003 onward'],
   'verb': improve},
  {'after': [improve, ,, launch],
   'before': [ Jack Ma],
   'prep': ['from 2003 onward'],
   'verb': improve}],
 [],
 [],
 [{'after': [eBay, as, and, reject],
   'before': [Ma],
   'prep': ['into China',
            'in 2003',
            'as a foreign competitor',
            'of Alibaba subsidiary Taobao'],
   'verb': announce},
  {'after': [eBay, as, and, reject],
   'before': [Ma],
   'prep': ['into China',
            'in 2003'

   'verb': sponsor}],
 [{'after': [step],
   'before': [Jack Ma],
   'prep': ['In September 2018',
            'of Alibaba',
            'that',
            'as chairman',
            "in a year 's time",
            'so',
            'on philanthropy'],
   'verb': announce},
  {'after': [step],
   'before': [Jack Ma],
   'prep': ['In September 2018',
            'of Alibaba',
            'that',
            'as chairman',
            "in a year 's time",
            'so',
            'on philanthropy'],
   'verb': announce},
  {'after': [down, as, in, focus],
   'before': [he],
   'prep': ['In September 2018',
            'of Alibaba',
            'that',
            'as chairman',
            "in a year 's time",
            'so',
            'on philanthropy'],
   'verb': would},
  {'after': [down, as, in, focus],
   'before': [he],
   'prep': ['In September 2018',
            'of Alibaba',
            'that',
            'as chairman',
            "in a year 's time",
            '

   'prep': ['in China',
            'to customers all over the world',
            'all over the world',
            'in a wide variety of products',
            'of products'],
   'verb': allow},
  {'after': [sell, ,, result],
   'before': [It],
   'prep': ['in China',
            'to customers all over the world',
            'all over the world',
            'in a wide variety of products',
            'of products'],
   'verb': sell},
  {'after': [sell, ,, result],
   'before': [It],
   'prep': ['in China',
            'to customers all over the world',
            'all over the world',
            'in a wide variety of products',
            'of products'],
   'verb': sell},
  {'after': [sell, ,, result],
   'before': [It],
   'prep': ['in China',
            'to customers all over the world',
            'all over the world',
            'in a wide variety of products',
            'of products'],
   'verb': result},
  {'after': [sell, ,, result],
   'before': [It],
   'prep': ['

   'before': [ Thailand],
   'prep': ['in March 2012',
            'with a business model of sell inventory to customers from '
            'Thailand',
            'of sell inventory',
            'to customers',
            'from Thailand'],
   'verb': launch}],
 [{'after': [a marketplace model],
   'before': [Vietnam],
   'prep': ['  In 2013 Thailand ,', "through Lazada 's site"],
   'verb': add},
  {'after': [a marketplace model],
   'before': [Vietnam],
   'prep': ['  In 2013 Thailand ,', "through Lazada 's site"],
   'verb': add},
  {'after': [sell],
   'before': [a marketplace model],
   'prep': ['  In 2013 Thailand ,', "through Lazada 's site"],
   'verb': allow},
  {'after': [sell],
   'before': [a marketplace model],
   'prep': ['  In 2013 Thailand ,', "through Lazada 's site"],
   'verb': allow},
  {'after': [sell],
   'before': [that],
   'prep': ['  In 2013 Thailand ,', "through Lazada 's site"],
   'verb': sell},
  {'after': [sell],
   'before': [that],
   'prep': ['  In 2

            'in February 2014'],
   'verb': have}],
 [{'after': [a financial product platform call Yu'ebao ( 余额宝],
   'before': [Alipay],
   'prep': ['In 2013'],
   'verb': launch},
  {'after': [a financial product platform call Yu'ebao ( 余额宝],
   'before': [Alipay],
   'prep': ['In 2013'],
   'verb': launch}],
 [{'after': [introduce],
   'before': [Alibaba],
   'prep': ['In 2015', 'that', "by recognize the owner 's face"],
   'verb': announce},
  {'after': [introduce],
   'before': [Alibaba],
   'prep': ['In 2015', 'that', "by recognize the owner 's face"],
   'verb': announce},
  {'after': [a system],
   'before': [Alibaba],
   'prep': ['In 2015', 'that', "by recognize the owner 's face"],
   'verb': will},
  {'after': [a system],
   'before': [Alibaba],
   'prep': ['In 2015', 'that', "by recognize the owner 's face"],
   'verb': will},
  {'after': [introduce],
   'before': [Alibaba],
   'prep': ['In 2015', 'that', "by recognize the owner 's face"],
   'verb': introduce},
  {'after':

   'prep': ['In October 2005',
            'with Yahoo',
            'on 24 September 1999 that focus on Internet services like news , '
            'email , and search',
            'on Internet services',
            'like news , email , and search'],
   'verb': launch}],
 [{'after': [that, ,, as, !],
   'before': [Alibaba Group],
   'prep': ['In April 2013',
            'that',
            'as part of the agreement to buy back the Yahoo',
            'of the agreement to buy back the Yahoo',
            'that',
            'for the Yahoo',
            'of Yahoo'],
   'verb': announce},
  {'after': [that, ,, as, !],
   'before': [Alibaba Group],
   'prep': ['In April 2013',
            'that',
            'as part of the agreement to buy back the Yahoo',
            'of the agreement to buy back the Yahoo',
            'that',
            'for the Yahoo',
            'of Yahoo'],
   'verb': announce},
  {'after': [back, the Yahoo],
   'before': [the agreement],
   'prep': ['In April 

            'since 2015'],
   'verb': succeed},
  {'after': [Ma, on],
   'before': [Daniel Zhang],
   'prep': ['of the Alibaba Group',
            'from its creation',
            'to 10 September 2019',
            'on 10 September 2019',
            'since 2015'],
   'verb': succeed},
  {'after': [Daniel Zhang, ,, and, is],
   'before': [ the former executive chairman],
   'prep': ['of the Alibaba Group',
            'from its creation',
            'to 10 September 2019',
            'on 10 September 2019',
            'since 2015'],
   'verb': is},
  {'after': [Daniel Zhang, ,, and, is],
   'before': [ the former executive chairman],
   'prep': ['of the Alibaba Group',
            'from its creation',
            'to 10 September 2019',
            'on 10 September 2019',
            'since 2015'],
   'verb': is}],
 [{'after': [Alibaba 's executive vice - chairman, since],
   'before': [ Joseph Tsai],
   'prep': ['since 2013'],
   'verb': is},
  {'after': [Alibaba 's executive vice

   'prep': ['As',
            'that',
            'to quality and integrity',
            'with other scandal - associate Chinese business sectors',
            'versus a damage control view',
            'that',
            'by diminished trust in " China Gold Supplier " program '
            'endorsement system',
            'in " China Gold Supplier " program endorsement system',
            'for',
            'as global buyers business - to - business service',
            'to - business',
            'through impact on Alibaba brand and capabilities',
            'on Alibaba brand and capabilities',
            'via the " defenestration of senior people "',
            'of senior people'],
   'verb': endanger},
  {'after': [Alibaba, through, (, latter],
   'before': [remove],
   'prep': ['As',
            'that',
            'to quality and integrity',
            'with other scandal - associate Chinese business sectors',
            'versus a damage control view',
            'th

   'verb': 'want'},
  {'after': ['us2324'], 'before': [], 'verb': 'marry'}],
 [{'after': ['Alibaba'],
   'before': ['4 April', 'Jack Ma and Jack Ma team', '17 friends', 'students'],
   'verb': 'found'}],
 [],
 [{'after': ['US 25 million investment', 'Goldman Sachs', 'SoftBank'],
   'before': ['October', 'Alibaba'],
   'verb': 'receive'}],
 [],
 [{'after': [], 'before': ['com'], 'verb': 'expect'},
  {'after': ['domestic e  commerce market'], 'before': [], 'verb': 'improve'},
  {'after': ['e  commerce platform',
             'Chinese enterprises',
             'especially small and medium  sized enterprises',
             'SMEs'],
   'before': ['domestic e  commerce market'],
   'verb': 'perfect'},
  {'after': [],
   'before': ['e  commerce platform',
              'Chinese enterprises',
              'especially small and medium  sized enterprises',
              'SMEs'],
   'verb': 'help'},
  {'after': ['Chinese products',
             'global market',
             'address World Trade

             'control',
             'just under half',
             'China s online payment market',
             'February'],
   'before': ['analyst research report', 'Alipay'],
   'verb': 'have'}],
 [{'after': ['financial product platform call Yuebao  余额宝'],
   'before': ['Alipay'],
   'verb': 'launch'}],
 [{'after': ['Alibaba'], 'before': ['Alibaba'], 'verb': 'announce'},
  {'after': ['system'], 'before': ['Alibaba'], 'verb': 'introduce'},
  {'after': [], 'before': ['system'], 'verb': 'pay'},
  {'after': ['owner s face'], 'before': [], 'verb': 'recognize'}],
 [{'after': ['re  brand', 'Ant Financial Services'],
   'before': ['16 October', 'Alibaba'],
   'verb': 'is'}],
 [],
 [{'after': ['Alibaba s quick and reliable payment system', 'Alibaba'],
   'before': ['factors', 'Alibaba s success', 'platform'],
   'verb': 'is'},
  {'after': ['several types',
             'payment systems',
             'credit card',
             'debit card',
             'Alipay',
             'Quick  pay'