In [6]:
!pip install py2neo
!pip install wikipedia
!pip install spacy
!pip install pandas
!pip install numpy
!pip install tqdm
!pip install jupyterlab
!pip install neo4j
!pip install pywikibot
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp39-cp39-win_amd64.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 11.8 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.1.0
  Downloading scipy-1.8.0-cp39-cp39-win_amd64.whl (36.9 MB)
     ---------------------------------------- 36.9/36.9 MB 5.3 MB/s eta 0:00:00
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
     -------------------------------------- 307.0/307.0 KB 4.8 MB/s eta 0:00:00
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn, sklearn
  Running setup.py install for sklearn: started
  Running setup.py install for sklea

In [None]:
!python -m spacy download en_core_web_lg

In [22]:
import json
import re
import urllib
from pprint import pprint
import time
from tqdm import tqdm

from py2neo import Node, Graph, Relationship, NodeMatcher
from py2neo.bulk import merge_nodes

import numpy as np
import pandas as pd
import wikipedia
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token

print(spacy.__version__)

3.2.4


In [23]:
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
VERBS = ['ROOT', 'advcl']
OBJECTS = ["dobj", "dative", "attr", "oprd", 'pobj']
ENTITY_LABELS = ['PERSON', 'NORP', 'GPE', 'ORG', 'FAC', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']

api_key = open('.api_key').read()

non_nc = spacy.load('en_core_web_md')

nlp = spacy.load('en_core_web_md')
nlp.add_pipe('merge_noun_chunks')

print(non_nc.pipe_names)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'merge_noun_chunks']


In [24]:
def remove_special_characters(text):
    
    regex = re.compile(r'[\n\r\t]')
    clean_text = regex.sub(" ", text)
    
    return clean_text


def remove_stop_words_and_punct(text, print_text=False):
    
    result_ls = []
    rsw_doc = non_nc(text)
    
    for token in rsw_doc:
        if print_text:
            print(token, token.is_stop)
            print('--------------')
        if not token.is_stop and not token.is_punct:
            result_ls.append(str(token))
    
    result_str = ' '.join(result_ls)

    return result_str


def create_svo_lists(doc, print_lists):
    
    subject_ls = []
    verb_ls = []
    object_ls = []

    for token in doc:
        if token.dep_ in SUBJECTS:
            subject_ls.append((token.lower_, token.idx))
        elif token.dep_ in VERBS:
            verb_ls.append((token.lemma_, token.idx))
        elif token.dep_ in OBJECTS:
            object_ls.append((token.lower_, token.idx))

    if print_lists:
        print('SUBJECTS: ', subject_ls)
        print('VERBS: ', verb_ls)
        print('OBJECTS: ', object_ls)
    
    return subject_ls, verb_ls, object_ls


def remove_duplicates(tup, tup_posn):
    
    check_val = set()
    result = []
    
    for i in tup:
        if i[tup_posn] not in check_val:
            result.append(i)
            check_val.add(i[tup_posn])
            
    return result


def remove_dates(tup_ls):
    
    clean_tup_ls = []
    for entry in tup_ls:
        if not entry[2].isdigit():
            clean_tup_ls.append(entry)
    return clean_tup_ls


def create_svo_triples(text, print_lists=False):
    
    clean_text = remove_special_characters(text)
    doc = nlp(clean_text)
    subject_ls, verb_ls, object_ls = create_svo_lists(doc, print_lists=print_lists)
    
    graph_tup_ls = []
    dedup_tup_ls = []
    clean_tup_ls = []
    
    for subj in subject_ls: 
        for obj in object_ls:
            
            dist_ls = []
            
            for v in verb_ls:
                
                # Assemble a list of distances between each object and each verb
                dist_ls.append(abs(obj[1] - v[1]))
                
            # Get the index of the verb with the smallest distance to the object 
            # and return that verb
            index_min = min(range(len(dist_ls)), key=dist_ls.__getitem__)
            
            # Remve stop words from subjects and object.  Note that we do this a bit
            # later down in the process to allow for proper sentence recognition.

            no_sw_subj = remove_stop_words_and_punct(subj[0])
            no_sw_obj = remove_stop_words_and_punct(obj[0])
            
            # Add entries to the graph iff neither subject nor object is blank
            if no_sw_subj and no_sw_obj:
                tup = (no_sw_subj, verb_ls[index_min][0], no_sw_obj)
                graph_tup_ls.append(tup)
        
        #clean_tup_ls = remove_dates(graph_tup_ls)
    
    dedup_tup_ls = remove_duplicates(graph_tup_ls, 2)
    clean_tup_ls = remove_dates(dedup_tup_ls)
    
    return clean_tup_ls

In [27]:
def get_obj_properties(tup_ls):
    
    init_obj_tup_ls = []
    
    for tup in tup_ls:

        try:
            text, node_label_ls, url = query_google(tup[2], api_key, limit=1)
            new_tup = (tup[0], tup[1], tup[2], text[0], node_label_ls[0], url[0])
        except:
            new_tup = (tup[0], tup[1], tup[2], [], [], [])
        
        init_obj_tup_ls.append(new_tup)
        
    return init_obj_tup_ls


def add_layer(tup_ls):

    svo_tup_ls = []
    
    for tup in tup_ls:
        
        if tup[3]:
            svo_tup = create_svo_triples(tup[3])
            svo_tup_ls.extend(svo_tup)
        else:
            continue
    
    return get_obj_properties(svo_tup_ls)
        

def subj_equals_obj(tup_ls):
    
    new_tup_ls = []
    
    for tup in tup_ls:
        if tup[0] != tup[2]:
            new_tup_ls.append((tup[0], tup[1], tup[2], tup[3], tup[4], tup[5]))
            
    return new_tup_ls


def check_for_string_labels(tup_ls):
    # This is for an edge case where the object does not get fully populated
    # resulting in the node labels being assigned to string instead of list.
    # This may not be strictly necessary and the lines using it are commnted out
    # below.  Run this function if you come across this case.
    
    clean_tup_ls = []
    
    for el in tup_ls:
        if isinstance(el[2], list):
            clean_tup_ls.append(el)
            
    return clean_tup_ls


def create_word_vectors(tup_ls):

    new_tup_ls = []
    
    for tup in tup_ls:
        if tup[3]:
            doc = nlp(tup[3])
            new_tup = (tup[0], tup[1], tup[2], tup[3], tup[4], tup[5], doc.vector)
        else:
            new_tup = (tup[0], tup[1], tup[2], tup[3], tup[4], tup[5], np.random.uniform(low=-1.0, high=1.0, size=(300,)))
        new_tup_ls.append(new_tup)
        
    return new_tup_ls

In [None]:
text= wikipedia.page('the starry night').content
text

In [28]:
# %%time
initial_tup_ls = create_svo_triples(text, print_lists=False)

In [29]:
initial_tup_ls

[('starry night', 'be', 'canvas'),
 ('starry night', 'be', 'painting'),
 ('starry night', 'be', 'dutch post impressionist painter'),
 ('starry night', 'paint', 'june'),
 ('starry night', 'depict', 'view'),
 ('starry night', 'depict', 'east facing window'),
 ('starry night', 'depict', 'asylum room'),
 ('starry night', 'depict', 'provence'),
 ('starry night', 'be', 'sunrise'),
 ('starry night', 'be', 'addition'),
 ('starry night', 'be', 'imaginary village'),
 ('starry night', 'be', 'permanent collection'),
 ('starry night', 'be', 'museum'),
 ('starry night', 'be', 'modern art'),
 ('starry night', 'be', 'new york city'),
 ('starry night', 'regard', 'lillie p. bliss bequest'),
 ('starry night', 'regard', 'van gogh magnum opus'),
 ('starry night', 'be', 'recognized paintings'),
 ('starry night', 'be', 'western art'),
 ('starry night', 'be', 'aftermath'),
 ('starry night', 'be', '23 december 1888 breakdown'),
 ('starry night', 'admit', 'self mutilation'),
 ('starry night', 'admit', 'left ear

In [30]:
# %%time
init_obj_tup_ls = get_obj_properties(initial_tup_ls)
new_layer_ls = add_layer(init_obj_tup_ls)
starter_edge_ls = init_obj_tup_ls + new_layer_ls
edge_ls = subj_equals_obj(starter_edge_ls)
# clean_edge_ls = check_for_string_labels(edge_ls)
# clean_edge_ls[0:3]
clean_edge_ls = edge_ls

In [31]:
edges_word_vec_ls = create_word_vectors(edge_ls)

In [33]:
edge_ls

[('starry night', 'be', 'canvas', [], [], []),
 ('starry night', 'be', 'painting', [], [], []),
 ('starry night', 'be', 'dutch post impressionist painter', [], [], []),
 ('starry night', 'paint', 'june', [], [], []),
 ('starry night', 'depict', 'view', [], [], []),
 ('starry night', 'depict', 'east facing window', [], [], []),
 ('starry night', 'depict', 'asylum room', [], [], []),
 ('starry night', 'depict', 'provence', [], [], []),
 ('starry night', 'be', 'sunrise', [], [], []),
 ('starry night', 'be', 'addition', [], [], []),
 ('starry night', 'be', 'imaginary village', [], [], []),
 ('starry night', 'be', 'permanent collection', [], [], []),
 ('starry night', 'be', 'museum', [], [], []),
 ('starry night', 'be', 'modern art', [], [], []),
 ('starry night', 'be', 'new york city', [], [], []),
 ('starry night', 'regard', 'lillie p. bliss bequest', [], [], []),
 ('starry night', 'regard', 'van gogh magnum opus', [], [], []),
 ('starry night', 'be', 'recognized paintings', [], [], []),


In [1]:
import wikipediaapi  # pip install wikipedia-api
import pandas as pd
import concurrent.futures
from tqdm import tqdm

In [5]:
def wiki_scrape(name_topic, verbose=True):
   def link_to_wikipedia(link):
       try:
           page = api_wikipedia.page(link)
           if page.exists():
               return {'page': link, 'text': page.text, 'link': page.fullurl, 'categories': list(page.categories.keys())}
       except:
           return None
      
   api_wikipedia = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)
   name_of_page = api_wikipedia.page(name_topic)
   if not name_of_page.exists():
       print('Page {} is not present'.format(name_of_page))
       return
  
   links_to_page = list(name_of_page.links.keys())
   procceed = tqdm(desc='Scraped links', unit='', total=len(links_to_page)) if verbose else None
   origin = [{'page': name_topic, 'text': name_of_page.text, 'link': name_of_page.fullurl, 'categories': list(name_of_page.categories.keys())}]
  
   with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
       links_future = {executor.submit(link_to_wikipedia, link): link for link in links_to_page}
       for future in concurrent.futures.as_completed(links_future):
           info = future.result()
           origin.append(info) if info else None
           procceed.update(1) if verbose else None
   procceed.close() if verbose else None
  
   namespaces = ('Wikipedia', 'Special', 'Talk', 'LyricWiki', 'File', 'MediaWiki',
                 'Template', 'Help', 'User', 'Category talk', 'Portal talk')
   origin = pd.DataFrame(origin)
   origin = origin[(len(origin['text']) > 20)
                     & ~(origin['page'].str.startswith(namespaces, na=True))]
   origin['categories'] = origin.categories.apply(lambda a: [b[9:] for b in a])

   origin['topic'] = name_topic
   print('Scraped pages', len(origin))
  
   return origin

In [49]:
wiki_data = wiki_scrape('mozart')

Scraped links: 100%|██████████| 500/500 [01:57<00:00,  4.26/s]

Scraped pages 472





In [50]:
from spacy import displacy

In [56]:
text= wikipedia.page('Mozart').content
text
article = nlp(text)
sentences = [x for x in article.sents]

In [60]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='dep')