In [2]:
import requests
from bs4 import BeautifulSoup as BS

In [3]:
# list of urls
d = {}
urls = ['https://en.wikipedia.org/wiki/Alibaba_Group']

# layout of wiki page is standardized
# text content is all in p tags
for u in urls:
    
    # dictionary to to hold text information by url
    # in case we try and crawl all on a central notebook
    d[u] = []
    wiki_pg = requests.get(u)
    soup = BS(wiki_pg.content, 'lxml')
    for content in soup.select("p"):
        # adding to list
        d[u].append(content.text)

In [4]:
for paragraph in d[urls[0]]:
    print (paragraph)


Coordinates: 30°11′23″N 120°11′25″E﻿ / ﻿30.189602°N 120.190371°E﻿ / 30.189602; 120.190371

Alibaba Group Holding Limited, (also known as Alibaba Group and as Alibaba), is a Chinese multinational conglomerate holding company specializing in e-commerce, retail, Internet, and technology. Founded on 4 April 1999 in Hangzhou, Zhejiang, the company provides consumer-to-consumer (C2C), business-to-consumer (B2C), and business-to-business (B2B) sales services via web portals, as well as electronic payment services, shopping search engines and cloud computing services. It owns and operates a diverse array of businesses around the world in numerous sectors, and is named as one of the world's most admired companies by Fortune.[3][4]

At closing time on the date of its initial public offering (IPO) – US$25 billion – the world's highest in history, 19 September 2014, Alibaba's market value was US$231 billion.[5] As of 19 December 2018[update], Alibaba's market cap stood at US$352.28 billion.[6] It

In [5]:
import spacy
import textacy
from spacy import displacy

# will need to download eng trained model via cli
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [6]:
# verbs should be expanded to accomodate additional verbs
# can do a few approaches
# - curated list
# - getting all verbs and then cleaning
verbs_to_search = []

In [7]:
# using textacy to determine verb relationships
# this is being applied to every text document within the wiki
for item in d[urls[0]][1:]:
    doc = nlp(item)
    tup = textacy.extract.subject_verb_object_triples(doc)
    for x in tup:
        print (x)
    
# each noun can be compared to spacy result and see if it matches proper noun
# and can just be dropped or something?

(Group Holding Limited, is, company)
(company, provides, C2C)
(market value, was, 231)
(It, is, one)
(Alibaba, became, company)
(Alibaba, has, brand)
(Alibaba, is, retailer)
(Alibaba, is, commerce company)
(company, hosts, Alibaba.com)
(sales, surpassed, US retailers)
(profits, surpassed, US retailers)
(It, sets, record)
(Alibaba, is, name)
(she, said, Open)
(this, is, name)
(you, know, Alibaba)
(Alibaba, is, business person)
(he, helped, village)
(Alibaba, opens, sesame)
(We, registered, name)
(someone, wants, to marry)
(Jack Ma, founded, Alibaba.com)
(team, founded, Alibaba.com)
(Alibaba, received, investment)
(Alibaba.com, was expected, to improve)
(Alibaba.com, was expected, perfect)
(Ma, wanted, to improve)
(Alibaba, launched, Marketplace)
(eBay, announced, expansion)
(Ma, viewed, company)
(subsidiaries, outperformed, eBay)
(subsidiary Taobao, force, eBay)
(eBay, closing, Web unit)
(This, would, net)
(Alibaba, purchased, % stake)
(founder, agreed, to pay)
(Alibaba, led, D financin

In [8]:
# example node/edge graph
node_graph = {}

In [24]:
# using the first paragraph
# split by period
paragraph = d[urls[0]][1].split('.')

# it would be good to take the entire array of sentences and parse each sentence
# then from each sentence, determine if the ent matches what we are looking for 
# if so examine the sentence for verb and action
for p in paragraph:
    
    # apply nlp model to sentence
    print (p)
    doc = nlp(p)
    
    # get entities
    for ent in doc.ents:
        
        # should do some processing here ("lemma" proper nouns to prevent dupes)
        if ent.text not in node_graph:
            node_graph[ent.text] = []
    
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
    
    break
    
    

Alibaba Group Holding Limited, (also known as Alibaba Group and as Alibaba), is a Chinese multinational conglomerate holding company specializing in e-commerce, retail, Internet, and technology
Alibaba Alibaba PROPN NNP compound Xxxxx True False
Group Group PROPN NNP compound Xxxxx True False
Holding Holding PROPN NNP compound Xxxxx True False
Limited Limited PROPN NNP nsubj Xxxxx True False
, , PUNCT , punct , False False
( ( PUNCT -LRB- punct ( False False
also also ADV RB advmod xxxx True True
known know VERB VBN acl xxxx True False
as as ADP IN prep xx True True
Alibaba Alibaba PROPN NNP compound Xxxxx True False
Group Group PROPN NNP pobj Xxxxx True False
and and CCONJ CC cc xxx True True
as as ADP IN conj xx True True
Alibaba Alibaba PROPN NNP pobj Xxxxx True False
) ) PUNCT -RRB- punct ) False False
, , PUNCT , punct , False False
is be VERB VBZ ROOT xx True True
a a DET DT det x True True
Chinese chinese ADJ JJ amod Xxxxx True False
multinational multinational ADJ JJ amod xxxx 

In [23]:
from pathlib import Path

svg = displacy.render(doc, style="dep", jupyter=False)
output_path = Path("./images/sentence.svg")
output_path.open("w", encoding="utf-8").write(svg)

TypeError: write() argument must be str, not None