### Notebook to study the best way to architect a db to store wikipedia articles multiple languages

In [1]:
import py2neo #Libs to connect to neo4j

def connectDb():
    """Function to connect to neo4j database."""

    py2neo.authenticate("localhost:7474", "neo4j", "lucas")
    dbConnection = py2neo.Graph("http://localhost:7474/db/data/")

    return dbConnection

In [2]:
from wikipydia import wikipedia, wikilinks

In [3]:
#art = wikipedia.get_article_by_href("C%2b%2b")
#wikilinks.get_article_links_score(art)

In [4]:
#"Lu'cas".replace("'", "\\'")

In [96]:
from neo4j.v1 import GraphDatabase, basic_auth

from urllib.parse import unquote

class Wiki4Neo():
    def __init__(self, user="neo4j", password="lucas"):
        """Connect to neo4j db"""
        self.driver = GraphDatabase.driver("bolt://localhost:7687", auth=basic_auth(user, password))
        
        #session = driver.session()
        #session.run("CREATE (a:Person {name: {name}, title: {title}})",{"name": "Arthur", "title": "King"})
        #result = session.run("MATCH (a:Person) WHERE a.name = {name} RETURN a.name AS name, a.title AS title",{"name": "Arthur"})
        #for record in result:
        #    print("%s %s" % (record["title"], record["name"]))
        #session.close()

    def save_article(self, article, links, url, lang):
        #Ensure to escape quotes from the title and url before query
        #"Lu'cas".replace("'", "\\'")
        #pageInfo.title = pageInfo.title.replace(/['\\]/g, "\\$&");
        #pageInfo.url = pageInfo.url.replace(/['\\]/g, "\\$&");
        
        article_title = escp(article.title())
        url = escp(url)

        #Construct query
        neo_query = "\n".join([
            'MERGE (article:WikiArticle{{title:"{}", pageId:{}}})'.format(article_title, article.page_id()), 
            'ON CREATE SET article.lang = "{}"'.format(lang), 
            'MERGE (articleUrl:Wikiurl{{url_lang:"{}"}})'.format(url + "_" + lang),
            'ON CREATE SET articleUrl.url = "{}", articleUrl.lang = "{}"'.format(url, lang),
            'SET articleUrl.articleId = {}'.format(article.page_id()),
            "CREATE UNIQUE (articleUrl)-[:RedirectsTo]->(article)"
        ])
        
        #Create queries for links to the article
        for i, (link_href, link_score) in enumerate(links):
            #Ensure to escape quotes from the link before query
            #var link = pageInfo.links[i].replace(/['\\]/g, "\\$&");
            link_href = escp(unquote(link_href)) #Remove url quotes from the href

            neo_query += "\n\n" + "\n".join([
                'MERGE (articleLink{}:Wikiurl{{url_lang: "{}"}})'.format(i, link_href+"_"+lang),
                'ON CREATE SET articleLink{}.url = "{}", articleLink{}.lang = "{}"'.format(i,link_href,i,lang),
                'CREATE UNIQUE (article)-[:LinksTo{{score:{}}}]->(articleLink{})'.format(link_score, i)
            ])
            
        #Create query to create direct connection between articles
        if False:
            neo_query += "\n"

            #Add in/out ConnectsTo relation to this article 
            neo_query += "\n".join([
                "WITH article MATCH (article)-[:LinksTo]->(:Wikiurl)-[:RedirectsTo]->(targetArticle:Article)",
                "CREATE UNIQUE (article)-[:ConnectsTo]->(targetArticle)",
                "WITH article MATCH (targetArticle:Article)-[:LinksTo]->(:Wikiurl)-[:RedirectsTo]->(article)", 
                " CREATE UNIQUE (targetArticle)-[:ConnectsTo]->(article)"
            ])
            
        neo_query += "\nRETURN id(article) as article_id"

        session = self.driver.session()
        results = session.run(neo_query)
        session.close()
        
        for record in results:
            return record["article_id"]

In [120]:
def escp(text):
    return text
    """Function to escape special characters."""
    new_text = text
    new_text = new_text.replace('"', '\\"') #Two bars because one escape the python code and the other the query str
    new_text = new_text.replace("'", "\\'")
    
    return new_text
    
print(escp('test"'))
print(escp("test'"))

test"
test'


In [121]:
w4n = Wiki4Neo()

In [122]:
def save_article_and_links_scores_by_href(href, lang):
    art = wikipedia.get_article_by_href(href, lang)
    links = wikilinks.get_article_links_score(art)
    art_id = w4n.save_article(art, links, href, lang)
    print(art_id)

In [123]:
#save_article_and_links_scores_by_href("c%2b%2b", "en")

In [124]:
#from urllib.parse import quote
#for lang, article in wikipedia.get_article_langlinks("MQTT"):
    #article = quote(article) #quote article title to use as href
    #save_article_and_links_scores_by_href(article, lang)

In [125]:
def get_or_download_pagedata_by_href(href, lang):
    """
    Get data from database, downloading missing data.
    """
    
    #start_time = time.time()
    
    #1. Get the wikiarticle data from the db by the href 
    neo_query = "\n".join([
        'MATCH (n:Wikiurl{{url:"{}", lang:"{}"}})-[:RedirectsTo]->(a:WikiArticle)',
        'RETURN a.title as title, a.pageId as pageid, id(a) as id'
    ]).format(escp(href), lang)
    
    session = w4n.driver.session()
    results = session.run(neo_query)
    session.close()
    
    #print("Time of the 1st try to locate the data:{}".format(time.time() - start_time))
    
    for r in results:
        return r['title'], r['pageid'], r['id']
    
    #start_time = time.time()
    #2. If the href doesn't return an article, download this href
    #Try to get the article
    #If suceed, save the href and try to return the article and create a reference from the href
    art = wikipedia.get_article_by_href(href, lang)
    
    neo_query = "\n".join([
        'MERGE (u:Wikiurl{{url_lang:"{}"}})'.format(escp(unquote(href)) + "_" + lang),
        'ON CREATE SET u.url = "{}", u.lang = "{}", u.articleId = {}'.format(escp(unquote(href)), lang, art.page_id()),
        'WITH u MATCH (a:WikiArticle{{pageId:{}}})'.format(art.page_id()),
        'CREATE UNIQUE (u)-[:RedirectsTo]->(a)',
        'RETURN a.title as title, a.pageId as pageid, id(a) as id'
    ])
    
    session = w4n.driver.session()
    results = session.run(neo_query)
    session.close()
    #print("Time of the 2nd try to locate the data:{}".format(time.time() - start_time))
    
    for r in results:
        return r['title'], r['pageid'], r['id']
    
    #start_time = time.time()
    #3. If nothing has been returned till now, so the article doesn't exists, lets register it
    links = wikilinks.get_article_links_score(art)
    new_art_id = w4n.save_article(art, links, href, lang)
    #print("Time of the 3rd try to locate the data:{}".format(time.time() - start_time))
    
    return art.title(), art.page_id(), new_art_id

In [126]:
#print(get_or_download_pagedata_by_href("c%2b%2b", "en"))

In [161]:
from urllib.parse import quote
import time

def download_and_save_global_articles(href_en):
    start_time = time.time()
    
    #global_articles = list()

    #href_en = "MQTT"
    
    #1. Download english article and get links scores
    #art_en = wikipedia.get_article_by_href(href_en, "en")
    #links_scores = wikilinks.get_article_links_score(art_en)
    #global_articles.append((art_en, links_scores, href_en, "en"))
    #global_article_title = art_en.title()
    #global_article_pageid = art_en.page_id()
    
    #1.1 Get or download english article data
    print("Getting english article data...")
    art_en_title, art_en_page_id, art_en_id = get_or_download_pagedata_by_href(href_en, "en")
    
    global_article_title = art_en_title
    global_article_pageid = art_en_page_id
    
    #2. Get global links by english article title
    print("Getting global links...")
    #global_links = wikipedia.get_article_langlinks(art_en.title())
    #Quote title for correct results for cases of "C++" etc
    global_links = wikipedia.get_article_langlinks(quote(art_en_title)) 
    

    #3. Download articles and generate links scores fot the global links
    #for art_lang, art_title in global_links:
        #art_href = quote(art_title)
        #art = wikipedia.get_article_by_href(art_href, art_lang)
        #art_links_scores = wikilinks.get_article_links_score(art)
        #global_articles.append((art, art_links_scores, art_href, art_lang))
        
    global_pageids = list()
    global_pageids.append(art_en_id)
    print("Getting global links articles data...")
    #3.1 Register hrefs from the global links and get their register ids
    for i, (art_lang, art_title) in enumerate(global_links):
        art_href = quote(art_title) #get href by quoting title
        print("{} {} - {}/{}".format(art_title, art_lang, i+1, len(global_links)))
        try:
            art_title, art_page_id, art_id = get_or_download_pagedata_by_href(art_href, art_lang)
            global_pageids.append(art_id)
        except:
            print("ERROR. Skiping.")
        
    print(global_pageids)

    #4. Save them into the database and get their ids
    #global_pageids = list()
    #for art in global_articles:
        #new_page_ids = w4n.save_article(art[0], art[1], art[2], art[3])
        #global_pageids.append(new_page_ids)
    
    
    #5. Points every article to the global article of its kind
    print("Saving global articles...")
    neo_query = "\n".join([
        'MERGE (art:GlobalArticle{{title:"{}", en_page_id:{}}})'.format(global_article_title, global_article_pageid),
        "WITH art MATCH (target_art:WikiArticle) WHERE ID(target_art) IN {}".format(global_pageids),
        "CREATE UNIQUE (art)<-[:DerivesFrom]-(target_art)"
    ])    

    session = w4n.driver.session()
    session.run(neo_query)
    session.close()
    print("Done ({} seconds.)".format(time.time() - start_time))

In [162]:
#import time

#start_time = time.time()
#get_or_download_pagedata_by_href("MQTT", "pt")
#print(time.time() - start_time)

In [163]:
download_and_save_global_articles("c%2b%2b")

Getting english article data...
Getting global links...
Getting global links articles data...
C++ af - 1/99
C++ an - 2/99
سي++ ar - 3/99
سى++ arz - 4/99
C++ az - 5/99
سی++ azb - 6/99
C++ bat-smg - 7/99
C++ be - 8/99
C++ be-x-old - 9/99
C++ bg - 10/99
সি++ bn - 11/99
Areg C++ br - 12/99
C++ bs - 13/99
C++ bug - 14/99
C++ ca - 15/99
C++ cs - 16/99
C++ cv - 17/99
C++ da - 18/99
C++ de - 19/99
C++ el - 20/99
C++ eo - 21/99
C++ es - 22/99
C++ et - 23/99
C++ eu - 24/99
سی پلاس‌پلاس fa - 25/99
C++ fi - 26/99
C++ fr - 27/99
C++ ga - 28/99
C++ gan - 29/99
C++ gl - 30/99
C++(પ્રોગ્રામિંગ ભાષા) gu - 31/99
C++ he - 32/99
ERROR. Skiping.
सी++ hi - 33/99
C++ hr - 34/99
C++ hu - 35/99
C++ hy - 36/99
C++ ia - 37/99
C++ id - 38/99
C++ is - 39/99
C++ it - 40/99
C++ ja - 41/99
C++ jv - 42/99
C++ ka - 43/99
C++ kaa - 44/99
C++ kk - 45/99
ಸಿ++ kn - 46/99
C++ ko - 47/99
C++ ky - 48/99
C++ la - 49/99
C++ lb - 50/99
C++ lmo - 51/99
C++ lt - 52/99
C++ lv - 53/99
C++ mk - 54/99
സി++ ml - 55/99
C++ mn - 56/99
सी

In [164]:
query = "\n".join(['MATCH (n:GlobalArticle)<-[:DerivesFrom]-(:WikiArticle)-[l:LinksTo]->(u:Wikiurl)',
                   'WHERE n.title = "C++" RETURN l.score as score,u.url as url, u.lang as lang'
                  ])

session = w4n.driver.session()
result = session.run(query)
session.close()

In [165]:
from collections import defaultdict, Counter

lang_links = defaultdict(list)

for r in result:
    lang_links[r['lang']].append([r['url'], r['score']])

In [166]:
#Normalize scores for every lang
for lang in lang_links.keys():
    score_sum = 0
    for url, score in lang_links[lang]:
        score_sum += score
    
    for i,(url, score) in enumerate(lang_links[lang]):
        lang_links[lang][i].append(score/score_sum)

In [168]:
#Accumulate everything (assuming urls share href among languages)
#Maybe weight things by the number urls they have
#And get only things that the english version links to
result_score = Counter()

for lang in lang_links.keys():
    for url, score, score_norm in lang_links[lang]:
        result_score[url] += score_norm

In [170]:
result_score.most_common()[:10]

[('C', 3.9255560752546277),
 ('International_Standard_Book_Number', 2.504589083441126),
 ('Programming_language', 1.1419850885368126),
 ('Bjarne_Stroustrup', 1.005180843981163),
 ('Vikipediya', 1.0),
 ('C语言', 1.0),
 ('સંદર્ભ_આપો', 1.0),
 ('Ризница_(Викимедија)', 1.0),
 ('Бримкулов,_Улан_Нургазиевич', 1.0),
 ('ब्यार्न_स्त्राऊस्त्रुप', 1.0)]

In [171]:
eng_counter = Counter()
for url, score, score_norm in lang_links["en"]:
    eng_counter[url] += score_norm
    
eng_counter.most_common()

[('C_(programming_language)', 0.2324324324324324),
 ('Standardization', 0.05945945945945945),
 ('International_Organization_for_Standardization', 0.03378378378378378),
 ('Library_(computing)', 0.03378378378378378),
 ('Feature_(software_design)', 0.03243243243243243),
 ('Generic_programming', 0.03175675675675675),
 ('Operator_(programming)', 0.024324324324324322),
 ('Exception_handling', 0.02027027027027027),
 ('Class_(computer_programming)', 0.018018018018018025),
 ('C++14', 0.014864864864864867),
 ('Compile-time', 0.014864864864864862),
 ('Template_(programming)', 0.012837837837837835),
 ('C++11', 0.012162162162162168),
 ('Base_class', 0.012162162162162161),
 ('Variable_(programming)', 0.01081081081081081),
 ('Inheritance_(computer_science)', 0.01081081081081081),
 ('Inheritance_(object-oriented_programming)', 0.01081081081081081),
 ('Data', 0.01081081081081081),
 ('Performance_(software)', 0.009459459459459458),
 ('Compiler', 0.009459459459459458),
 ('Operator_overloading', 0.0094594

In [28]:
#wikipedia.get_article_langlinks("Artificial_neural_network")

In [134]:
#save_article_and_links_scores_by_href("c%2b%2b", "he")

In [131]:
#"ב".encode(encoding='ascii',errors='strict').decode("utf-8", errors="strict") 

In [132]:
#wikipedia.get_article_langlinks("C%2b%2b")

In [133]:
#MATCH (n:GlobalArticle)<-[:DerivesFrom]-(:WikiArticle)-[l:LinksTo]->(u:Wikiurl)-[:RedirectsTo]->(:WikiArticle)-[:DerivesFrom]->(o:GlobalArticle) WHERE n.title = "MQTT" RETURN n,u,l,o