### Notebook to study the best way to architect a database that stores Wikipedia articles in multiple languages

In [1]:
import py2neo #Libs to connect to neo4j

def connectDb():
    """Authenticate against the local neo4j HTTP endpoint and return a py2neo Graph."""
    host_port = "localhost:7474"
    py2neo.authenticate(host_port, "neo4j", "lucas")

    # Hand the Graph object straight back to the caller.
    return py2neo.Graph("http://localhost:7474/db/data/")

In [2]:
from wikipydia import wikipedia, wikilinks

In [3]:
#art = wikipedia.get_article_by_href("C%2b%2b")
#wikilinks.get_article_links_score(art)

In [4]:
#"Lu'cas".replace("'", "\\'")

In [5]:
from neo4j.v1 import GraphDatabase, basic_auth

from urllib.parse import unquote

class Wiki4Neo():
    """Small wrapper around a neo4j bolt driver used to store Wikipedia articles.

    Graph layout written by this class:
      * (:WikiArticle {title, pageId, lang})
      * (:Wikiurl {url_lang, url, lang, articleId}) -[:RedirectsTo]-> (:WikiArticle)
      * (:WikiArticle) -[:LinksTo {score}]-> (:Wikiurl)
    """

    def __init__(self, user="neo4j", password="lucas", uri="bolt://localhost:7687"):
        """Open a bolt driver to the neo4j database.

        Args:
            user: neo4j user name.
            password: neo4j password.
                NOTE(review): the default is a hard-coded development
                credential; pass the real secret explicitly.
            uri: bolt URI of the server (defaults to the local instance).
        """
        self.driver = GraphDatabase.driver(uri, auth=basic_auth(user, password))

    @staticmethod
    def _as_number(value):
        """Return ``value`` unchanged when it is an int/float, else ``float(value)``.

        The query used to receive the score as a spliced numeric literal; as a
        query parameter it must be a plain number.
        """
        if isinstance(value, (int, float)):
            return value
        return float(value)

    def save_article(self, article, links, url, lang):
        """Store an article, the url that leads to it and its outgoing links.

        All values are sent as Cypher parameters (legacy ``{name}`` syntax,
        matching the ``CREATE UNIQUE`` era Cypher used here), so titles or
        hrefs containing quotes or backslashes can neither break nor alter
        the query.

        Args:
            article: object exposing ``title()`` and ``page_id()``.
            links: iterable of ``(href, score)`` pairs; each href is
                URL-unquoted before being stored.
            url: href under which the article was requested.
            lang: language code of the article.

        Returns:
            The ``id(article)`` value of the first returned record, or None
            when the query yields no record.

        Raises:
            TypeError, ValueError: if ``article.page_id()`` is not int-like.
        """
        # The page id used to be spliced in as a numeric literal; keep it numeric.
        page_id = int(article.page_id())
        params = {
            "title": article.title(),
            "page_id": page_id,
            "lang": lang,
            "url": url,
            "url_lang": url + "_" + lang,
        }
        clauses = [
            "MERGE (article:WikiArticle{title:{title}, pageId:{page_id}})",
            "ON CREATE SET article.lang = {lang}",
            "MERGE (articleUrl:Wikiurl{url_lang:{url_lang}})",
            "ON CREATE SET articleUrl.url = {url}, articleUrl.lang = {lang}",
            "SET articleUrl.articleId = {page_id}",
            "CREATE UNIQUE (articleUrl)-[:RedirectsTo]->(article)",
        ]

        # One MERGE / CREATE UNIQUE group per outgoing link. Only the loop
        # index is formatted into the text; every value travels as a parameter.
        for i, (link_href, link_score) in enumerate(links):
            link_href = unquote(link_href)  # remove url quoting from the href
            params["link_url_%d" % i] = link_href
            params["link_url_lang_%d" % i] = link_href + "_" + lang
            params["link_score_%d" % i] = self._as_number(link_score)
            clauses.append("")  # blank line between groups, for readability
            clauses.extend([
                "MERGE (articleLink%d:Wikiurl{url_lang: {link_url_lang_%d}})" % (i, i),
                "ON CREATE SET articleLink%d.url = {link_url_%d}, articleLink%d.lang = {lang}" % (i, i, i),
                "CREATE UNIQUE (article)-[:LinksTo{score:{link_score_%d}}]->(articleLink%d)" % (i, i),
            ])

        # NOTE: an experimental step that added direct (article)-[:ConnectsTo]->
        # (article) relations was permanently disabled (`if False`) and matched
        # a label `:Article` that this class never creates, so it was removed.

        clauses.append("RETURN id(article) as article_id")
        neo_query = "\n".join(clauses)

        session = self.driver.session()
        try:
            # Read the record while the session is still open.
            for record in session.run(neo_query, params):
                return record["article_id"]
            return None
        finally:
            # Always release the session, even when the query fails.
            session.close()

In [6]:
# Shared Wiki4Neo client (default connection settings) used by the helper functions in the cells below.
w4n = Wiki4Neo()

In [7]:
def save_article_and_links_scores_by_href(href, lang):
    """Fetch the article for href/lang, score its links, store both and print the id returned by the store call."""
    article = wikipedia.get_article_by_href(href, lang)
    stored_id = w4n.save_article(
        article,
        wikilinks.get_article_links_score(article),
        href,
        lang,
    )
    print(stored_id)

In [8]:
#save_article_and_links_scores_by_href("c%2b%2b", "en")

In [9]:
#from urllib.parse import quote
#for lang, article in wikipedia.get_article_langlinks("MQTT"):
    #article = quote(article) #quote article title to use as href
    #save_article_and_links_scores_by_href(article, lang)

In [30]:
def _first_pagedata(neo_query, params):
    """Run a query on the shared driver and return (title, pageid, id) of its first record, or None."""
    session = w4n.driver.session()
    try:
        # Read the record while the session is still open.
        for record in session.run(neo_query, params):
            return record['title'], record['pageid'], record['id']
        return None
    finally:
        # Release the session even when the query fails.
        session.close()


def get_or_download_pagedata_by_href(href, lang):
    """
    Get article data from the database, downloading whatever is missing.

    Lookup order:
      1. A Wikiurl node with this href/lang that already redirects to a
         WikiArticle.
      2. Otherwise download the article, register the href as a Wikiurl and
         attach it to an already stored WikiArticle with the same page id.
      3. Otherwise store the downloaded article together with its links.

    All values are passed as Cypher parameters (legacy ``{name}`` syntax) so
    hrefs containing quotes or backslashes cannot break or alter the query.

    Args:
        href: article href (as used in Wikipedia urls).
        lang: language code of the article.

    Returns:
        Tuple ``(title, page_id, node_id)``.
    """
    #1. Get the wikiarticle data from the db by the href
    found = _first_pagedata(
        "\n".join([
            'MATCH (n:Wikiurl{url:{href}, lang:{lang}})-[:RedirectsTo]->(a:WikiArticle)',
            'RETURN a.title as title, a.pageId as pageid, id(a) as id'
        ]),
        {"href": href, "lang": lang},
    )
    if found is not None:
        return found

    #2. The href is unknown: download the article, register the href and
    #   try to link it to an article that is already stored.
    art = wikipedia.get_article_by_href(href, lang)
    # The page id used to be spliced in as a numeric literal; keep it numeric.
    page_id = int(art.page_id())

    found = _first_pagedata(
        "\n".join([
            'MERGE (u:Wikiurl{url_lang:{url_lang}})',
            'ON CREATE SET u.url = {href}, u.lang = {lang}, u.articleId = {page_id}',
            'WITH u MATCH (a:WikiArticle{pageId:{page_id}})',
            'CREATE UNIQUE (u)-[:RedirectsTo]->(a)',
            'RETURN a.title as title, a.pageId as pageid, id(a) as id'
        ]),
        {
            "url_lang": href + "_" + lang,
            "href": href,
            "lang": lang,
            "page_id": page_id,
        },
    )
    if found is not None:
        return found

    #3. Nothing matched, so the article does not exist yet: register it
    links = wikilinks.get_article_links_score(art)
    new_art_id = w4n.save_article(art, links, href, lang)

    return art.title(), art.page_id(), new_art_id

In [29]:
#print(get_or_download_pagedata_by_href("c%2b%2b", "en"))

In [39]:
from urllib.parse import quote
import time

def download_and_save_global_articles(href_en):
    """
    Store an English article plus its other-language versions and attach
    them all to one GlobalArticle node.

    Steps:
      1. Get (or download) the English article for ``href_en``.
      2. Fetch its language links.
      3. Get (or download) every linked article and collect the node ids.
      4. MERGE a GlobalArticle keyed by the English title/page id and create
         a DerivesFrom relation from every collected article to it.

    Query values are sent as Cypher parameters (legacy ``{name}`` syntax) so
    titles containing quotes cannot break the query.

    Args:
        href_en: href of the English article.
    """
    start_time = time.time()

    #1. Get or download english article data
    print("Getting english article data...")
    art_en_title, art_en_page_id, art_en_id = get_or_download_pagedata_by_href(href_en, "en")

    #2. Get global links by english article title
    print("Getting global links...")
    global_links = wikipedia.get_article_langlinks(art_en_title)

    #3. Register hrefs from the global links and collect their node ids
    global_pageids = list()
    print("Getting global links articles data...")
    for i, (art_lang, art_title) in enumerate(global_links):
        print("{}/{}".format(i+1, len(global_links)))
        art_href = quote(art_title) #get href by quoting title
        _, _, art_id = get_or_download_pagedata_by_href(art_href, art_lang)
        global_pageids.append(art_id)

    print(global_pageids)

    #4. Point every article to the global article of its kind.
    # The English article id is included as well: it was fetched above but
    # previously never linked to its own GlobalArticle.
    # Missing ids (None) are dropped so they cannot end up in the query.
    article_ids = [
        node_id for node_id in [art_en_id] + global_pageids if node_id is not None
    ]

    print("Saving global articles...")
    neo_query = "\n".join([
        'MERGE (art:GlobalArticle{title:{title}, en_page_id:{en_page_id}})',
        "WITH art MATCH (target_art:WikiArticle) WHERE ID(target_art) IN {article_ids}",
        "CREATE UNIQUE (art)<-[:DerivesFrom]-(target_art)"
    ])
    params = {
        "title": art_en_title,
        # The page id used to be spliced in as a numeric literal; keep it numeric.
        "en_page_id": int(art_en_page_id),
        "article_ids": article_ids,
    }

    session = w4n.driver.session()
    try:
        session.run(neo_query, params)
    finally:
        # Release the session even when the query fails.
        session.close()
    print("Done ({} seconds.)".format(time.time() - start_time))

In [40]:
#import time

#start_time = time.time()
#get_or_download_pagedata_by_href("MQTT", "pt")
#print(time.time() - start_time)

In [38]:
# Run the whole pipeline for the English article with href "MQTT".
download_and_save_global_articles("MQTT")

Getting english article data...
Getting global links...
Getting global links articles data...
1/9
2/9
3/9
4/9
5/9
6/9
7/9
8/9
9/9
[37, 65, 89, 97, 107, 115, 120, 127, 134]
Saving global articles...
Done
12.043689012527466


In [28]:
#wikipedia.get_article_langlinks("Artificial_neural_network")