In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re
import gzip

In [2]:
PATH_WIKI_XML = 'd:/MMD- Project - Power2TheWiki/data/uk/'
PATH_WIKI_OUT = 'd:/MGontar/Storages/Google Drive/20. Data Science/0.UCU Study/Mining Massive Datasets/Project/out/'

In [3]:
WIKI_FILENAMES = []
for file in os.listdir(PATH_WIKI_XML):
    if file.endswith(".xml"):
        WIKI_FILENAMES.append(file)

In [4]:
ENCODING = "utf-8"


# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t


totalCount = 0
articleCount = 0
redirectCount = 0
withUkLinkCount = 0
withRuLinkCount = 0
withUkRuLinkCount = 0
title = None

In [5]:
start_time = time.time()
regex_links = re.compile(r"\[\[(?P<article>(?!.*?\:).+?)(?:\|(?P<text>.+?))?\]\]")
regex_lang = re.compile(r"\[\[en\:(?P<article>.+?)\]\]")
regex_lang_ru = re.compile(r"\[\[ru\:(?P<article>.+?)\]\]")
for WikiXML in WIKI_FILENAMES:
    pathWikiXML = os.path.join(PATH_WIKI_XML, WikiXML)
    pathArticles = os.path.join(PATH_WIKI_OUT, WikiXML+"_art.csv")
    pathArticlesRedirect = os.path.join(PATH_WIKI_OUT, WikiXML+"_red.csv")
    with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
    codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH:
        articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
        articlesWriter.writerow(['id', 'title', 'en', 'link_pos', 'link_val', 'link_txt']) 
        redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
        redirectWriter.writerow(['id', 'title', 'redirect'])
        for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
            tname = strip_tag_name(elem.tag)

            if event == 'start':
                if tname == 'page':
                    title = ''
                    id = -1
                    redirect = ''
                    inrevision = False
                    ns = 0
                    txtlen = 0
                    uk_link = ''
                    ru_link = ''
                    links = []
                elif tname == 'revision':
                    # Do not pick up on revision id's
                    inrevision = True
            else:
                if tname == 'title':
                    title = elem.text
                elif tname == 'id' and not inrevision:
                    id = int(elem.text)
                elif tname == 'redirect':
                    redirect = elem.attrib['title']
                elif tname == 'ns':
                    ns = int(elem.text)
                elif tname == 'page' and ns == 0:
                    totalCount += 1
                    if len(uk_link) > 0:
                        withUkLinkCount += 1
                    if len(ru_link) > 0:
                        withRuLinkCount += 1  
                    if len(uk_link) > 0 and len(ru_link) > 0:
                        withUkRuLinkCount += 1
                        
                    if len(redirect) == 0:
                        articleCount += 1
                        if len(links) == 0:
                            articlesWriter.writerow([id, title, uk_link, "", "", ""])
                        for link in links:
                            articlesWriter.writerow([id, title, uk_link, link[0], link[1], link[2]])
                    else:
                        redirectCount += 1
                        redirectWriter.writerow([id, title, redirect])

                    if totalCount > 1 and (totalCount % 100000) == 0:
                        print("{:,}".format(totalCount))
                elif tname == 'text' and elem.text != None:
                    txtlen = len(elem.text)
                    for match in regex_lang.finditer(elem.text):    
                        uk_link = match.group("article")
                    for match in regex_links.finditer(elem.text):    
                        link_pos = match.start()
                        link_title = match.group("article")
                        link_text = match.group("text")
                        links.append((link_pos,link_title,link_text))


                elem.clear()
    
    fn_in = pathArticles
    with open(fn_in, 'rb') as f_in, gzip.open(fn_in+'.gz', 'wb') as f_out:
        f_out.writelines(f_in)
    os.remove(fn_in)
    
    fn_in = pathArticlesRedirect
    with open(fn_in, 'rb') as f_in, gzip.open(fn_in+'.gz', 'wb') as f_out:
        f_out.writelines(f_in)
    os.remove(fn_in)
    
    elapsed_time = time.time() - start_time
    print("File processed: {}".format(WikiXML))
    print("Total pages: {:,}".format(totalCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
    print("Pages with UK link: {:,}".format(withUkLinkCount))
    print("Pages with RU link: {:,}".format(withRuLinkCount))
    print("Pages with UK and RU link: {:,}".format(withUkRuLinkCount))
    print("Elapsed time: {}".format(hms_string(elapsed_time)))

elapsed_time = time.time() - start_time

print("Total pages: {:,}".format(totalCount))
print("Article pages: {:,}".format(articleCount))
print("Redirect pages: {:,}".format(redirectCount))
print("Pages with UK link: {:,}".format(withUkLinkCount))
print("Pages with RU link: {:,}".format(withRuLinkCount))
print("Pages with UK and RU link: {:,}".format(withUkRuLinkCount))
print("Elapsed time: {}".format(hms_string(elapsed_time)))

100,000
200,000
300,000
400,000
500,000
600,000
700,000
800,000
900,000
1,000,000
1,100,000
1,200,000
File processed: ukwiki-20180620-pages-meta-current.xml
Total pages: 1,239,608
Article pages: 796,714
Redirect pages: 442,894
Pages with UK link: 805
Pages with RU link: 0
Pages with UK and RU link: 0
Elapsed time: 0:19:52.25
Total pages: 1,239,608
Article pages: 796,714
Redirect pages: 442,894
Pages with UK link: 805
Pages with RU link: 0
Pages with UK and RU link: 0
Elapsed time: 0:19:52.25
