In [1]:
from process_wikipedia import *

#WIKIPEDIA_ROOT = "/home/jeff/data/wikipedia"
WIKIPEDIA_ROOT = "C:\\Users\\jeffh\\data\\wikipedia\\"
WIKIPEDIA_DL = os.path.join(WIKIPEDIA_ROOT, 'dl')

In [30]:
import json
import sys
import hashlib

BUF_SIZE = 65536


def sha1_file(path):
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            sha1.update(data)
    return sha1.hexdigest()

def check_file(path):

def wikidump_download(target_path, wiki_url=WIKIPEDIA_URL,wiki_lang=WIKIPEDIA_LANG, timestamp=None):
    if not timestamp:
        timestamp = get_latest_wikidump(get_wikidump_available(wiki_url,wiki_lang))
    
    dump_url = posixpath.join(wiki_url, wiki_lang, str(timestamp))
    print(dump_url)
    
    status_file = posixpath.join(dump_url, 'dumpstatus.json')
    dump_json = urllib.request.urlopen(status_file).read()
    dump_status = json.loads(dump_json)
    dump_current = dump_status['jobs']['metacurrentdump']
    job_status = dump_current['status']
    
    if job_status != 'done':
        raise ValueError(f"Current article dump status: [job_status]")
        
    files = dump_current['files']
    for file in files.keys():
        meta = files[file]
        source_url = urllib.parse.urljoin(wiki_url, meta['url'])
        target_file = os.path.join(target_path,file)
        if os.path.exists(target_file):
            sha1_local = sha1_file(target_file)
            if sha1_local != meta['sha1']:
                print(f"Corrupt: {file}")
                os.remove("file")
                should_download = True
            else:
                print(f"Exists: {file}")
                should_download = False
        else:
            print(f"Missing: {file}")
            should_download = True
            
        if should_download:
            try:
                urllib.request.urlretrieve(source_url, target_file)
                print(f"Downloaded: {file}")
            except urllib.error.URLError as e:
                try:
                    os.remove(target_file)
                finally:
                    print(f"Download Error: {file}")


wikidump_download(WIKIPEDIA_DL)

https://dumps.wikimedia.org/enwiki/20210320
Exists: enwiki-20210320-pages-meta-current1.xml-p1p41242.bz2
Exists: enwiki-20210320-pages-meta-current2.xml-p41243p151573.bz2
Exists: enwiki-20210320-pages-meta-current3.xml-p151574p311329.bz2
Exists: enwiki-20210320-pages-meta-current4.xml-p311330p558391.bz2
Exists: enwiki-20210320-pages-meta-current5.xml-p558392p958045.bz2
Exists: enwiki-20210320-pages-meta-current6.xml-p958046p1483661.bz2
Exists: enwiki-20210320-pages-meta-current7.xml-p1483662p2134111.bz2
Exists: enwiki-20210320-pages-meta-current8.xml-p2134112p2936260.bz2
Exists: enwiki-20210320-pages-meta-current9.xml-p2936261p4045402.bz2
Exists: enwiki-20210320-pages-meta-current10.xml-p4045403p5399366.bz2
Exists: enwiki-20210320-pages-meta-current11.xml-p5399367p6899366.bz2
Exists: enwiki-20210320-pages-meta-current11.xml-p6899367p7054859.bz2
Exists: enwiki-20210320-pages-meta-current12.xml-p7054860p8554859.bz2
Exists: enwiki-20210320-pages-meta-current12.xml-p8554860p9172788.bz2
Exi

# Download Wikipedia Dump Data

This part only needs to be run once, at the beginning to download Wikipedia to somewhere on your system.  Note, this part will take a long time!  It will transfer much data!

In [2]:
wikidump_download(WIKIPEDIA_DL)

https://dumps.wikimedia.org/enwiki/20210320
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current1.xml-p1p41242.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current2.xml-p41243p151573.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current3.xml-p151574p311329.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current4.xml-p311330p558391.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current5.xml-p558392p958045.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current6.xml-p958046p1483661.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current7.xml-p1483662p2134111.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-current8.xml-p2134112p2936260.bz2 already downloaded.
C:\Users\jeffh\data\wikipedia\dl\enwiki-20210320-pages-meta-cu

# Process Wikipedia Data

This can be run after the previous section has downloaded Wikipedia data.  This example creates 3 files from extracted Wikipedia data:

* article.csv - A listing of all articles on Wikipedia, with their Wikipedia ID.
* redirect.csv - A listing of all redirects of articles on Wikipedia. e.g. USA to United_States
* template.csv - A listing of all templates on Wikipedia.  These are the "types" of articles.

In [3]:
class ProcessPagesWorker():
    def __init__(self, config, outputQueue):
        self.config = config
        self.outputQueue = outputQueue
        
    def process_template(self, id, title):
        self.outputQueue.put(
            {'template': [id, title] }
        )
    
    def process_article(self, id, title):
        self.outputQueue.put(
            {'article': [id, title] }
        )
    
    def process_redirect(self, id, title, redirect):
        self.outputQueue.put(
            {'redirect': [id, title, redirect] }
        )
        
    def report_progress(self, completed):
        self.outputQueue.put({"completed": completed})
        
    def close(self):
        self.articles_fp.close()
        self.redirect_fp.close()
        self.template_fp.close()

class ProcessPages:
    def __init__(self, output_path):        
        pathArticles = os.path.join(output_path, "article.csv")
        pathRedirect = os.path.join(output_path, "redirect.csv")
        pathTemplate = os.path.join(output_path, "template.csv")
        
        self.articles_fp = codecs.open(pathArticles, "w", ENCODING)
        self.redirect_fp = codecs.open(pathRedirect, "w", ENCODING)
        self.template_fp = codecs.open(pathTemplate, "w", ENCODING)
    
        self.articlesWriter = csv.writer(self.articles_fp, quoting=csv.QUOTE_MINIMAL)
        self.redirectWriter = csv.writer(self.redirect_fp, quoting=csv.QUOTE_MINIMAL)
        self.templateWriter = csv.writer(self.template_fp, quoting=csv.QUOTE_MINIMAL)
        
        self.articlesWriter.writerow(['id', 'title'])
        self.redirectWriter.writerow(['id', 'title', 'redirect'])
        self.templateWriter.writerow(['id', 'title'])
    
    def handle_event(self, evt):

        if "article" in evt:
            self.articlesWriter.writerow(evt['article'])
        elif "template" in evt:
            self.templateWriter.writerow(evt['template'])
        elif "redirect" in evt:
            self.redirectWriter.writerow(evt['redirect'])
            
    def get_worker_class(self, outputQueue, config):
        return ProcessPagesWorker(config, outputQueue)
    

wiki = ExtractWikipedia(
    ProcessPages(WIKIPEDIA_ROOT), # where you want the extracted Wikipedia files to go
    WIKIPEDIA_DL #Location you downloaded Wikipedia to
)
wiki.process()

Processing 58 files
Detected 4 cores.
Using 4 threads


TypeError: cannot serialize '_io.BufferedWriter' object