In [None]:
import xml.etree.ElementTree as etree
import multiprocessing as mp
import bz2
import codecs
import csv
import time
import os,glob
import traceback

# Base directory for the Wikipedia data; the CSV output paths are built from it.
WIKIPEDIA_ROOT = "/media/jeff/Data/data/wikipedia"
# Sub-directory that is searched for the downloaded *.bz2 dump files.
WIKIPEDIA_DL = os.path.join(WIKIPEDIA_ROOT, "dl")
# A worker sends a progress message each time it has parsed this many pages.
WORKER_REPORT = 1_000
# Granularity (in pages) of the progress lines printed by the main process.
ENTIRE_TASK_REPORT = 100_000

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are not padded; minutes are zero-padded to two digits and seconds
    to two integer digits plus two decimals.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

def strip_tag_name(t):
    """Return the part of tag *t* that follows its last ``}``.

    ElementTree qualifies tags as ``{namespace-uri}local``; this keeps only
    the local name. A tag without ``}`` is returned unchanged.
    """
    # rpartition yields ('', '', t) when no '}' is present, so the last
    # field is the right answer in both cases.
    _, _, local_name = t.rpartition("}")
    return local_name

class ExtractWikipedia:
    """Stream a bz2-compressed XML dump and hand each page to a worker.

    The worker object must provide ``process_template(id, title)``,
    ``process_article(id, title)``, ``process_redirect(id, title, redirect)``
    and ``report_progress(count)``.

    Attributes:
        totalCount: Pages parsed since the last progress report (it is reset
            to 0 every time a report is sent to the worker).
        articleCount: Namespace-0 pages that are not redirects.
        redirectCount: Namespace-0 pages that are redirects.
        templateCount: Namespace-10 pages.
        worker: The callback object described above.
    """

    def __init__(self, worker):
        self.totalCount = 0
        self.articleCount = 0
        self.redirectCount = 0
        self.templateCount = 0
        self.worker = worker

    def extract_file(self, path):
        """Parse the dump at *path* and dispatch every page to the worker.

        Pages in namespace 10 go to ``process_template``; pages in
        namespace 0 go to ``process_redirect`` when they carry a
        ``<redirect>`` element and to ``process_article`` otherwise. Pages
        in any other namespace are only counted. Progress is reported as a
        delta every ``WORKER_REPORT`` pages, plus once at the end for the
        remainder.

        Args:
            path: Path of a bz2-compressed XML file.
        """
        # Per-page state. Initialised up front so an element seen before
        # the first <page> start cannot hit an unbound local.
        title = None
        redirect = ""
        page_id = -1
        ns = 0
        inrevision = False

        with bz2.BZ2File(path, "r") as fp:
            for event, elem in etree.iterparse(fp, events=('start', 'end')):
                tname = strip_tag_name(elem.tag)

                if event == 'start':
                    if tname == 'page':
                        title = ''
                        page_id = -1
                        redirect = ''
                        inrevision = False
                        ns = 0
                    elif tname == 'revision':
                        # Do not pick up on revision id's
                        inrevision = True
                    continue

                # 'end' event: the element's text and attributes are complete.
                if tname == 'title':
                    title = elem.text
                elif tname == 'id' and not inrevision:
                    page_id = int(elem.text)
                elif tname == 'redirect':
                    redirect = elem.attrib['title']
                elif tname == 'ns':
                    ns = int(elem.text)
                elif tname == 'page':
                    self.totalCount += 1

                    if ns == 10:
                        self.templateCount += 1
                        self.worker.process_template(page_id, title)
                    elif ns == 0:
                        if len(redirect) > 0:
                            self.redirectCount += 1
                            self.worker.process_redirect(page_id, title, redirect)
                        else:
                            self.articleCount += 1
                            self.worker.process_article(page_id, title)

                    title = ""
                    redirect = ""
                    ns = -100
                    if self.totalCount > 1 and (self.totalCount % WORKER_REPORT) == 0:
                        self.worker.report_progress(self.totalCount)
                        self.totalCount = 0

                    # Everything needed from this page has been read; drop
                    # its children and text so memory does not grow with
                    # the size of the dump.
                    elem.clear()

        # Report whatever is left since the last full WORKER_REPORT batch.
        self.worker.report_progress(self.totalCount)
        
class ProcessPages:
    """Forward page events to an output queue as single-key dict messages.

    Each message has exactly one key naming its kind (``'template'``,
    ``'article'``, ``'redirect'`` or ``"completed"``) mapped to its payload.
    """

    def __init__(self, outputQueue):
        self.templates = []
        self.articles = []
        self.redirects = []
        self.outputQueue = outputQueue

    def _send(self, kind, payload):
        # Single place where messages are placed on the queue.
        self.outputQueue.put({kind: payload})

    def process_template(self, id, title):
        """Queue a ``{'template': [id, title]}`` message."""
        self._send('template', [id, title])

    def process_article(self, id, title):
        """Queue an ``{'article': [id, title]}`` message."""
        self._send('article', [id, title])

    def process_redirect(self, id, title, redirect):
        """Queue a ``{'redirect': [id, title, redirect]}`` message."""
        self._send('redirect', [id, title, redirect])

    def report_progress(self, completed):
        """Queue a ``{"completed": completed}`` message."""
        self._send("completed", completed)

def worker2(inputQueue, outputQueue, config):
    """Worker loop: parse dump files named on *inputQueue* until told to stop.

    Each item taken from *inputQueue* is a file path, which is parsed with
    ``ExtractWikipedia`` using a ``ProcessPages`` that writes to
    *outputQueue*. After every file, whether it parsed cleanly or raised,
    a ``{"file_complete": True}`` message is put on *outputQueue*. The
    string ``"**exit**"`` is a shutdown sentinel: the loop ends without
    treating it as a path and without emitting a ``file_complete`` message.

    Args:
        inputQueue: Queue yielding file paths or the ``"**exit**"`` sentinel.
        outputQueue: Queue receiving page, progress and completion messages.
        config: Currently unused.
    """
    while True:
        path = inputQueue.get()

        # The sentinel is not a file: stop before attempting to open it.
        if path == "**exit**":
            break

        try:
            pages = ProcessPages(outputQueue)
            extractor = ExtractWikipedia(pages)
            extractor.extract_file(path)
        except Exception as err:
            # Best effort: a bad file must not kill the worker.
            print(f"Error: {err}")
            traceback.print_exc()
        finally:
            # Always signal completion so the consumer's file count advances.
            outputQueue.put({"file_complete":True})
    
ENCODING = "utf-8"

TEST_FILE = 'enwiki-20201101-pages-meta-current9.xml-p2936261p4045402.bz2'
TEST_PATH = os.path.join(WIKIPEDIA_DL, TEST_FILE)

# Every *.bz2 file in the download directory is one unit of work.
files = glob.glob(os.path.join(WIKIPEDIA_DL, "*.bz2"))
print(f"Processing {len(files)} files")
cpus = mp.cpu_count()
print(f"Detected {cpus} cores.")
workers = cpus * 1
print(f"Using {workers} threads")

inputQueue = mp.Queue()
outputQueue = mp.Queue()
config = {}

# Keep the process handles so the workers can be shut down and joined.
processes = []
for i in range(workers):
    p = mp.Process(target=worker2, args=(inputQueue, outputQueue, config))
    p.start()
    processes.append(p)

for file in files:
    inputQueue.put(file)


pathArticles = os.path.join(WIKIPEDIA_ROOT, "articles.csv")
pathRedirect = os.path.join(WIKIPEDIA_ROOT, "redirect.csv")
pathTemplate = os.path.join(WIKIPEDIA_ROOT, "template.csv")

total_count = 0
file_count = 0
# Next page total at which a progress line is printed. A threshold is used
# instead of an exact-multiple test because workers also report remainders
# that are not multiples of the report size.
next_report = ENTIRE_TASK_REPORT
done = False
# Consume messages until one "file_complete" has arrived per input file;
# with no input files there is nothing to wait for.
while file_count < len(files):
    z = outputQueue.get()
    if "completed" in z:
        total_count += z["completed"]
        if total_count >= next_report:
            print(f"{total_count:,}, files: {file_count}")
            next_report = (total_count // ENTIRE_TASK_REPORT + 1) * ENTIRE_TASK_REPORT
    elif "file_complete" in z:
        file_count += 1
print("done")
done = True

# Send one shutdown sentinel per worker, then wait for each to exit.
for _ in processes:
    inputQueue.put("**exit**")
for p in processes:
    p.join()
        
        

#
#e = ExtractWikipedia(WIKIPEDIA_DL, p)
#e.extract_file(TEST_PATH)
#e.extract()
#p.close()


Processing 59 files
Detected 48 cores.
Using 48 threads
100,000, files: 0
200,000, files: 0
300,000, files: 0
