In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
import re
import gzip

In [2]:
# Directory that is scanned for the input Wikipedia ``.xml`` dump files.
PATH_WIKI_XML = 'd:/MMD- Project - Power2TheWiki/data/en/'
# Directory that receives the generated ``*_art.csv`` / ``*_red.csv`` files
# (which are gzip-compressed once each dump has been processed).
# NOTE(review): both paths are machine-specific absolute paths; adjust before
# running on another machine.
PATH_WIKI_OUT = 'd:/MGontar/Storages/Google Drive/20. Data Science/0.UCU Study/Mining Massive Datasets/Project/out/'

In [4]:
# Names (not full paths) of every ``.xml`` dump found in PATH_WIKI_XML.
# os.listdir() returns entries in arbitrary order, so the list is sorted to
# make the processing order -- and therefore the cumulative per-file log --
# reproducible across platforms and runs.
WIKI_FILENAMES = sorted(
    name for name in os.listdir(PATH_WIKI_XML) if name.endswith(".xml")
)

In [5]:
# Text encoding used when opening the output CSV files for writing.
ENCODING = "utf-8"

def hms_string(sec_elapsed):
    """Format a duration in seconds as a ``H:MM:SS.ss`` string.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        A string with un-padded hours, two-digit minutes and seconds
        rendered with two decimals, e.g. ``"1:01:01.50"``.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / seconds_per_minute)
    seconds = sec_elapsed % seconds_per_minute
    template = "{}:{:>02}:{:>05.2f}"
    return template.format(hours, minutes, seconds)


def strip_tag_name(t):
    """Return an element tag without its ``{namespace}`` prefix.

    Args:
        t: Tag string, e.g. ``"{http://example.org/ns}page"`` or ``"page"``.

    Returns:
        Everything after the last ``"}"`` in ``t``; ``t`` unchanged when it
        contains no ``"}"``.
    """
    # Operate on the argument itself. The previous version overwrote ``t``
    # with the global ``elem.tag``, so the function ignored its parameter and
    # failed with NameError whenever no global ``elem`` existed.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t


# Running totals, accumulated over every processed dump file:
# ns-0 pages seen, non-redirect articles, redirects, and summed article text length.
totalCount = articleCount = redirectCount = total_article_text_len = 0

In [7]:
# Wall-clock reference point for the "Elapsed time" reports.
start_time = time.time()

# Internal wiki links: ``[[article]]`` or ``[[article|text]]``.
# A link is skipped when a colon occurs anywhere between its ``[[`` and its
# closing ``]]`` (e.g. ``[[File:...]]``, ``[[Category:...]]``).
# The colon check is bounded at the closing ``]]``: the previous unbounded
# lookahead ``(?!.*?\:)`` scanned to the end of the line, so a colon anywhere
# later on the same line caused an otherwise valid link to be dropped.
regex_links = re.compile(
    r"\[\[(?P<article>(?!(?:(?!\]\]).)*?\:).*?)"
    r"(?:\|(?P<text>(?!(?:(?!\]\]).)*?\:).*?))?\]\]"
)
# Interlanguage links of the form ``[[uk:...]]`` and ``[[ru:...]]``.
regex_lang = re.compile(r"\[\[uk\:(?P<article>.+?)\]\]")
regex_lang_ru = re.compile(r"\[\[ru\:(?P<article>.+?)\]\]")
# Start the running totals from zero inside this cell, so that re-running it
# does not add to whatever an earlier run left behind in the kernel.
totalCount = 0
articleCount = 0
redirectCount = 0
total_article_text_len = 0

# Escapes normalised in link targets, applied in order: (escaped, replacement).
_LINK_TITLE_REPLACEMENTS = (
    ("&nbsp;", " "),
    ("&ndash;", "-"),
    ("&mdash;", "—"),
    ("%20", " "),
)


def _gzip_and_remove(path):
    """Compress ``path`` into ``path + '.gz'`` and delete the uncompressed file."""
    with open(path, 'rb') as f_in, gzip.open(path + '.gz', 'wb') as f_out:
        f_out.writelines(f_in)
    os.remove(path)


def _print_totals(elapsed):
    """Print the current running counters followed by the elapsed time."""
    print("Total pages: {:,}".format(totalCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
    print("Total article lenght: {:,}".format(total_article_text_len))
    print("Elapsed time: {}".format(hms_string(elapsed)))


# For every dump file: stream-parse the XML, write one CSV of article links
# and one CSV of redirects, then gzip both CSVs and report running totals.
for WikiXML in WIKI_FILENAMES:
    pathWikiXML = os.path.join(PATH_WIKI_XML, WikiXML)
    pathArticles = os.path.join(PATH_WIKI_OUT, WikiXML + "_art.csv")
    pathArticlesRedirect = os.path.join(PATH_WIKI_OUT, WikiXML + "_red.csv")
    with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
            codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH:
        articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
        articlesWriter.writerow(['id', 'title', 'text_len', 'link_pos', 'link_val', 'link_txt'])
        redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
        redirectWriter.writerow(['id', 'title', 'redirect'])

        for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
            tname = strip_tag_name(elem.tag)

            if event == 'start':
                if tname == 'page':
                    # A new page begins: reset the per-page state.
                    title = ''
                    page_id = -1
                    redirect = ''
                    inrevision = False
                    ns = 0
                    article_text_len = 0
                    links = []
                elif tname == 'revision':
                    # Do not pick up on revision id's
                    inrevision = True
                continue

            # From here on the event is 'end': the element is fully parsed.
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                page_id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.attrib['title']
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'page' and ns == 0:
                # Only pages whose <ns> is 0 are counted and written out.
                totalCount += 1
                total_article_text_len += article_text_len
                if len(redirect) == 0:
                    articleCount += 1
                    if not links:
                        # Keep link-less articles in the output with a placeholder row.
                        articlesWriter.writerow([page_id, title, article_text_len, 0, "", ""])
                    for link_pos, link_title, link_text in links:
                        articlesWriter.writerow(
                            [page_id, title, article_text_len, link_pos, link_title, link_text])
                else:
                    redirectCount += 1
                    redirectWriter.writerow([page_id, title, redirect])

                if totalCount > 1 and (totalCount % 100000) == 0:
                    print("{:,}".format(totalCount))
            elif tname == 'text' and elem.text is not None:
                article_text_len = len(elem.text)
                # The former scan with regex_lang was dropped here: its result
                # was never used, so it only cost time on every article.
                for match in regex_links.finditer(elem.text):
                    link_pos = match.start()
                    link_title = match.group("article")
                    for escaped, replacement in _LINK_TITLE_REPLACEMENTS:
                        link_title = link_title.replace(escaped, replacement)
                    link_text = match.group("text")
                    links.append((link_pos, link_title, link_text))

            # Release the finished element's content to keep memory bounded.
            elem.clear()

    # Both CSVs are closed at this point; keep only their compressed copies.
    _gzip_and_remove(pathArticles)
    _gzip_and_remove(pathArticlesRedirect)

    elapsed_time = time.time() - start_time
    print("File processed: {}".format(WikiXML))
    _print_totals(elapsed_time)

elapsed_time = time.time() - start_time

_print_totals(elapsed_time)

File processed: enwiki-20180620-pages-meta-current01-p10p30303.xml
Total pages: 49,652
Article pages: 38,291
Redirect pages: 11,361
Total article lenght: 584,538,815
Elapsed time: 0:00:58.31
File processed: enwiki-20180620-pages-meta-current02-p30304p88444.xml
Total pages: 88,197
Article pages: 65,129
Redirect pages: 23,068
Total article lenght: 1,276,094,074
Elapsed time: 0:02:21.91
100,000
File processed: enwiki-20180620-pages-meta-current03-p88445p200507.xml
Total pages: 170,115
Article pages: 130,653
Redirect pages: 39,462
Total article lenght: 2,392,075,343
Elapsed time: 0:04:49.92
200,000
File processed: enwiki-20180620-pages-meta-current04-p200511p352689.xml
Total pages: 253,573
Article pages: 179,768
Redirect pages: 73,805
Total article lenght: 3,282,443,066
Elapsed time: 0:06:59.37
300,000
File processed: enwiki-20180620-pages-meta-current05-p352690p565313.xml
Total pages: 373,022
Article pages: 244,761
Redirect pages: 128,261
Total article lenght: 4,243,772,308
Elapsed time: 

8,700,000
File processed: enwiki-20180620-pages-meta-current24-p33503451p33952815.xml
Total pages: 8,780,946
Article pages: 3,736,996
Redirect pages: 5,043,950
Total article lenght: 27,929,477,395
Elapsed time: 1:53:42.58
8,800,000
8,900,000
9,000,000
File processed: enwiki-20180620-pages-meta-current25-p33952816p35452816.xml
Total pages: 9,097,352
Article pages: 3,850,478
Redirect pages: 5,246,874
Total article lenght: 28,612,133,364
Elapsed time: 1:57:16.95
9,100,000
9,200,000
9,300,000
9,400,000
File processed: enwiki-20180620-pages-meta-current25-p35452816p36952816.xml
Total pages: 9,462,085
Article pages: 4,001,675
Redirect pages: 5,460,410
Total article lenght: 29,457,358,030
Elapsed time: 2:01:08.93
9,500,000
9,600,000
9,700,000
File processed: enwiki-20180620-pages-meta-current25-p36952816p38067202.xml
Total pages: 9,717,927
Article pages: 4,097,291
Redirect pages: 5,620,636
Total article lenght: 30,032,740,010
Elapsed time: 2:03:58.09
9,800,000
9,900,000
10,000,000
File proces