<a href="https://colab.research.google.com/github/javier-jaime/Tool-Crib/blob/master/Colab/Knowledge_Graph_from_Wikipedia_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies and Scrape data

In [43]:
!pip install wikipedia-api spacy networkx scipy

import wikipediaapi  # pip install wikipedia-api
import pandas as pd
import concurrent.futures
import requests
from tqdm import tqdm
import spacy
from spacy import displacy

# import en_core_web_sm
!python -m spacy download en 
nlp = spacy.load('en_core_web_sm')
 
from spacy.tokens import Span
from spacy.matcher import Matcher
 
import matplotlib.pyplot as plot
import networkx as ntx
 
%matplotlib inline

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [47]:
# Function that fetches the Wikipedia article for the given topic, plus every article it links to

def scrape_wikipedia(name_topic, verbose=True):
    """Fetch a Wikipedia article and every article it links to.

    Parameters
    ----------
    name_topic : str
        Title of the seed article.
    verbose : bool, default True
        Show a tqdm progress bar while the linked pages are fetched.

    Returns
    -------
    pandas.DataFrame or None
        One row per kept page with the columns ``page``, ``text``, ``link``,
        ``categories`` and ``topic``. Rows whose text is 20 characters or
        shorter, and rows whose title starts with one of the namespace
        prefixes listed below, are dropped. The index is not reset after
        filtering. ``None`` is returned (after printing a message) when the
        seed article does not exist.
    """
    def link_to_wikipedia(link):
        # Best-effort fetch of one linked page: any failure simply skips the
        # link. Catch Exception rather than everything so that
        # KeyboardInterrupt / SystemExit still stop the scrape.
        try:
            page = api_wikipedia.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
        except Exception:
            pass
        return None

    # NOTE(review): newer wikipedia-api releases reportedly require a
    # `user_agent` argument here -- confirm against the installed version.
    api_wikipedia = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)
    name_of_page = api_wikipedia.page(name_topic)
    if not name_of_page.exists():
        print('Page {} is not present'.format(name_of_page))
        return None

    links_to_page = list(name_of_page.links.keys())
    origin = [{'page': name_topic, 'text': name_of_page.text, 'link': name_of_page.fullurl,
               'categories': list(name_of_page.categories.keys())}]

    procceed = tqdm(desc='Scraped links', unit='', total=len(links_to_page)) if verbose else None
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            links_future = [executor.submit(link_to_wikipedia, link) for link in links_to_page]
            for future in concurrent.futures.as_completed(links_future):
                info = future.result()
                if info:
                    origin.append(info)
                if procceed is not None:
                    procceed.update(1)
    finally:
        # Close the bar even if the scrape is interrupted.
        if procceed is not None:
            procceed.close()

    namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                  'Template', 'Help', 'User', 'Category talk', 'Portal talk')
    origin = pd.DataFrame(origin)

    # Per-row text length. The previous `len(origin['text']) > 20` compared the
    # number of rows, so the filter kept either every row or none of them.
    long_enough = origin['text'].str.len() > 20
    in_namespace = origin['page'].str.startswith(namespaces, na=True)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered one.
    origin = origin[long_enough & ~in_namespace].copy()

    # Drop the first 9 characters of each category name -- the length of the
    # 'Category:' prefix (presumably every key carries it; verify if needed).
    prefix_len = len('Category:')
    origin['categories'] = origin['categories'].apply(
        lambda names: [name[prefix_len:] for name in names])

    origin['topic'] = name_topic
    print('Scraped pages', len(origin))

    return origin

In [70]:
# Try the scraper on one topic. This fetches the topic page and every page it
# links to, so the cell issues many requests and can take a while.
# scrape_wikipedia returns None when the topic page does not exist.

topic = 'COVID-19'

data = scrape_wikipedia(topic)

Scraped links: 100%|██████████| 2358/2358 [01:13<00:00, 32.04/s]

Scraped pages 211





In [71]:
# Save the scraped data to CSV (relative path, so it lands in the working directory)
data.to_csv('scraped_data.csv')

In [72]:
# Show one sample article. Use positional .iloc: the scraper filters rows
# without resetting the index, so the label 10 is not guaranteed to exist.
data['text'].iloc[10]

'On 20 February 2020, stock markets across the world suddenly crashed after growing instability due to the COVID-19 pandemic. It ended on 7 April 2020.\nBeginning on 13 May 2019, the yield curve on U.S. Treasury securities inverted, and remained so until 11 October 2019, when it reverted to normal. Through 2019, while some economists (including Campbell Harvey and former New York Federal Reserve economist Arturo Estrella) argued that a recession in the following year was likely, other economists (including the managing director of Wells Fargo Securities Michael Schumacher and San Francisco Federal Reserve President Mary C. Daly) argued that inverted yield curves may no longer be a reliable recession predictor. The yield curve on U.S. Treasuries would not invert again until 30 January 2020 when the World Health Organization declared the COVID-19 outbreak to be a Public Health Emergency of International Concern, four weeks after local health commission officials in Wuhan, China announced

## Segment Sentences

In [73]:
# Parse a short excerpt from one of the scraped articles and list the
# dependency label spaCy assigns to each token.
docu = nlp('''The AbC-19 rapid antibody test is an immunological test for COVID-19 exposure developed by
the UK Rapid Test Consortium and manufactured by Abingdon Health. It uses a lateral flow test to determine
whether a person has IgG antibodies to the SARS-CoV-2 virus that causes COVID-19. The test uses a single
drop of blood obtained from a finger prick and yields results in 20 minutes.\n\nSee also\nCOVID-19 rapid
antigen test''')

for tokn in docu:
    # One "text --- label" line per token.
    print(" ".join((tokn.text, "---", tokn.dep_)))

The --- det
AbC-19 --- compound
rapid --- amod
antibody --- compound
test --- nsubj
is --- ROOT
an --- det
immunological --- amod
test --- attr
for --- prep
COVID-19 --- nummod
exposure --- pobj
developed --- acl
by --- agent

 --- 
the --- det
UK --- compound
Rapid --- compound
Test --- compound
Consortium --- pobj
and --- cc
manufactured --- conj
by --- agent
Abingdon --- compound
Health --- pobj
. --- punct
It --- nsubj
uses --- ROOT
a --- det
lateral --- amod
flow --- compound
test --- dobj
to --- aux
determine --- xcomp

 --- 
whether --- mark
a --- det
person --- nsubj
has --- ccomp
IgG --- compound
antibodies --- dobj
to --- prep
the --- det
SARS --- compound
- --- punct
CoV-2 --- compound
virus --- pobj
that --- nsubj
causes --- relcl
COVID-19 --- dobj
. --- punct
The --- det
test --- nsubj
uses --- ROOT
a --- det
single --- amod

 --- 
drop --- dobj
of --- prep
blood --- pobj
obtained --- acl
from --- prep
a --- det
finger --- compound
prick --- pobj
and --- cc
yields --- comp

## Extract Entities

In [74]:
def extract_entities(sents):
    """Extract a [subject, object] phrase pair from a piece of text.

    Walks the tokens produced by the module-level ``nlp`` pipeline, tracking
    the most recent compound word (``prefix``) and modifier (``modifier``).
    When a token whose dependency label contains "subj" or "obj" is reached,
    the tracked words are joined with that token to form the entity phrase.
    If several subjects or objects occur, the last one wins.

    Parameters
    ----------
    sents : str
        Text to parse.

    Returns
    -------
    list of str
        ``[subject_phrase, object_phrase]``; either item is ``""`` when no
        matching token was found.
    """
    def _join_parts(*parts):
        # Join only the non-empty pieces so a missing modifier or prefix does
        # not leave a double space inside the phrase.
        return " ".join(part for part in parts if part)

    enti_one = ""
    enti_two = ""

    dep_prev_token = ""  # dependency tag of the previous non-punctuation token
    txt_prev_token = ""  # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tokn in nlp(sents):
        # Punctuation never contributes to an entity and does not count as
        # the "previous token".
        if tokn.dep_ == "punct":
            continue

        # Compound word: remember it, glued to the previous token when that
        # one was a compound too.
        if tokn.dep_ == "compound":
            if dep_prev_token == "compound":
                prefix = _join_parts(txt_prev_token, tokn.text)
            else:
                prefix = tokn.text

        # Modifier (amod, nummod, advmod, ...): same treatment.
        if tokn.dep_.endswith("mod"):
            if dep_prev_token == "compound":
                modifier = _join_parts(txt_prev_token, tokn.text)
            else:
                modifier = tokn.text

        # Subject found: emit it and start collecting afresh for the object.
        # (`"subj" in ...` replaces `.find("subj") == True`, which compared
        # the match index to 1 and only worked by accident.)
        if "subj" in tokn.dep_:
            enti_one = _join_parts(modifier, prefix, tokn.text)
            prefix = ""
            modifier = ""

        # Object found: emit it; prefix and modifier are deliberately kept.
        if "obj" in tokn.dep_:
            enti_two = _join_parts(modifier, prefix, tokn.text)

        dep_prev_token = tokn.dep_
        txt_prev_token = tokn.text

    return [enti_one.strip(), enti_two.strip()]

In [75]:
# Sanity-check the extractor on a single sentence; the cell's value is the
# [subject, object] pair it returns.
extract_entities("The AbC-19 rapid antibody test is an immunological test for COVID-19 exposure developed by the UK Rapid Test")

['AbC-19 rapid antibody test', 'COVID-19 UK Rapid Test']

In [76]:
# Run the extractor over at most the first 800 scraped texts. Each item is a
# whole scraped text (not a single sentence) and yields one [subject, object] pair.
pairs_of_entities = [extract_entities(text) for text in tqdm(data['text'][:800])]

100%|██████████| 211/211 [01:23<00:00,  2.54it/s]


In [77]:
# Peek at four of the extracted [subject, object] pairs (positions 40-43);
# each pair comes from one scraped text.
pairs_of_entities[40:44]

[['Global Aerosol Works', 'External aerosol sampling'],
 ['respectively Africa CDC', 'National public World Health Africa'],
 ['specific  patient', 'specific  tastes'],
 ['also  she', 'original Whistle Giver']]

## Relations extraction

In [78]:
# Function that captures the relation (predicate) linking the entities in a piece of text
def obtain_relation(sent):
    """Return the relation (predicate) phrase found in ``sent``.

    The text is parsed with the module-level ``nlp`` pipeline and matched
    against one rule: the ROOT token, optionally followed by a preposition,
    an agent word and an adjective. The text of the last match is returned.

    Parameters
    ----------
    sent : str
        Text to parse.

    Returns
    -------
    str
        Text of the last matched span, or ``""`` when nothing matches
        (for example, empty input).
    """
    doc = nlp(sent)

    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    matcher = Matcher(nlp.vocab)
    # Pass the patterns as a list: this signature is accepted from spaCy 2.2.2
    # onwards and is the only one spaCy v3 supports; the older
    # add(name, None, pattern) form fails on v3.
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # Nothing matched; indexing the empty match list would raise IndexError.
        return ""

    # Each match is (match_id, start, end); keep the last one.
    _, start, end = matches[-1]
    return doc[start:end].text

In [None]:
# One relation string per scraped text, over the same (at most 800) texts used
# for the entity pairs so the two lists line up row by row.
relations = list(map(obtain_relation, tqdm(data['text'][:800])))

 40%|████      | 90/225 [00:29<01:06,  2.03it/s]

In [None]:
# The ten most frequent extracted relations
pd.Series(relations).value_counts().head(10)

## Build a knowledge graph

In [None]:
# Edge list for the graph: subject -> object, labelled with the extracted relation
source = [pair[0] for pair in pairs_of_entities]  # subjects
target = [pair[1] for pair in pairs_of_entities]  # objects

data_kgf = pd.DataFrame(dict(source=source, target=target, edge=relations))

# Directed multigraph built from the frame; edge_attr=True attaches the
# remaining column ('edge') to every edge
graph = ntx.from_pandas_edgelist(
    data_kgf,
    source="source",
    target="target",
    edge_attr=True,
    create_using=ntx.MultiDiGraph(),
)

In [None]:
# Draw the whole knowledge graph using a spring layout
posn = ntx.spring_layout(graph)

plot.figure(figsize=(20, 20))
ntx.draw(
    graph,
    pos=posn,
    with_labels=True,
    node_color='blue',
    edge_cmap=plot.cm.Blues,
)
plot.show()

In [None]:
# Pick one relation and visualize only the edges that carry it

relation = "links"

sub_graph = ntx.from_pandas_edgelist(data_kgf[data_kgf['edge']==relation], "source", "target",
                         edge_attr=True, create_using=ntx.MultiDiGraph())

plot.figure(figsize=(20,20))
pos = ntx.spring_layout(sub_graph, k = 1) # k regulates the distance between nodes
# Draw with the layout computed for this sub-graph. The earlier code passed
# `posn`, the full graph's layout from the previous cell, so `pos` (and k)
# had no effect and the cell depended on that cell having been run.
ntx.draw(sub_graph, with_labels=True, node_color='green', node_size=1400, edge_cmap=plot.cm.Blues, pos = pos)
plot.show()