# Get COVID-related pages from the dumps
By Diego Saez-Trumper
Sept 18, 2020

In [1]:
import pandas as pd
import glob,gzip

# Load every per-language input table found in the working directory.
languages = glob.glob('wik*clean*v3*')
pages = {}
idsToWikidata = {}
for lang in languages:
    # `lang` holds the matched file name; the language code is sliced out of
    # a fixed position near its end.
    langcode = lang[-10:-8]
    # The first CSV column becomes the index and is also exposed as a regular
    # 'Wikidata' column.
    pages[langcode] = pd.read_csv(lang, index_col=0).assign(
        Wikidata=lambda table: table.index
    )
    # Lookup table for this language: page_id -> Wikidata item.
    idsToWikidata[langcode] = (
        pages[langcode]
        .set_index('page_id')['Wikidata']
        .to_dict()
    )
    

In [2]:
def process_dump(dump, path):
    """Yield one record per revision of every tracked main-namespace page.

    Used as the per-file callback passed to ``mwxml.map``.

    The function reads two module-level names that must be set before the
    generator is iterated:

    * ``lang`` -- key selecting the current language in ``idsToWikidata``.
    * ``idsToWikidata`` -- ``{lang: {page_id: Wikidata item}}``.

    Parameters
    ----------
    dump : iterable of pages
        Each page must expose ``namespace``, ``id`` and ``title`` and be
        iterable over revisions exposing ``text``, ``id`` and ``timestamp``.
    path : str
        Path of the dump file being processed. Unused here, but part of the
        callback signature.

    Yields
    ------
    dict
        Keys ``page_id``, ``page_title``, ``text``, ``QID``, ``revision`` and
        ``time`` (the revision timestamp converted with ``str``).

    Raises
    ------
    KeyError
        If ``lang`` is not a key of ``idsToWikidata``.
    """
    for page in dump:
        # Only namespace 0 pages are considered.
        if page.namespace != 0:
            continue

        # Normalise the id once so the membership test and the lookup use the
        # same key (previously the test used int(page.id), the lookup page.id).
        page_id = int(page.id)
        qid_by_page_id = idsToWikidata[lang]
        if page_id not in qid_by_page_id:
            continue
        qid = qid_by_page_id[page_id]

        for revision in page:
            try:
                record = {'page_id': page.id, 'page_title': page.title,
                          'text': revision.text, 'QID': qid,
                          'revision': revision.id, 'time': str(revision.timestamp)}
            except Exception:
                # Best effort: skip a revision whose fields cannot be read
                # rather than aborting the whole dump file.
                continue
            # The yield is deliberately outside the try block: a bare except
            # around it swallowed GeneratorExit (and KeyboardInterrupt), so
            # closing the generator early raised
            # "RuntimeError: generator ignored GeneratorExit".
            yield record


In [4]:
import mwxml
import json
# Dump snapshot (directory name and file-name component) to read from.
date = "20200820"

# NOTE: the loop variable must be named `lang`: process_dump reads it as a
# module-level global to select the right page_id -> Wikidata mapping.
for lang in pages.keys():
    print(lang)
    paths = glob.glob('/mnt/data/xmldatadumps/public/%swiki/%s/%swiki-%s-pages-meta-current*.xml*.bz2' % (lang,date,lang,date))
    # The glob pattern also matches the single combined dump file; exclude it
    # and keep only the split part files (presumably so that pages are not
    # processed twice -- confirm). Filtering instead of list.remove() avoids a
    # ValueError when the combined file is not present.
    combined_dump = '/mnt/data/xmldatadumps/public/%swiki/%s/%swiki-%s-pages-meta-current.xml.bz2' % (lang,date,lang,date)
    paths = [dump_path for dump_path in paths if dump_path != combined_dump]
    print(paths)
    # One JSON-lines output file per language. The context manager guarantees
    # the file is flushed and closed before moving on to the next language.
    with open('covid_pages_from_dump_current_%s.jsonl' % lang,'w') as f:
        for result in mwxml.map(process_dump, paths, threads = 40):
            f.write(json.dumps(result) + '\n')



zh
['/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current6.xml-p6231440p7171101.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current5.xml-p2771087p4271086.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current5.xml-p4271087p4731439.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current4.xml-p1154618p2654617.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current6.xml-p4731440p6231439.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current2.xml-p162887p544644.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current1.xml-p1p162886.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current4.xml-p2654618p2771086.bz2', '/mnt/data/xmldatadumps/public/zhwiki/20200820/zhwiki-20200820-pages-meta-current3.xml-p544645p1154617.bz2']
ru
['/mnt/data/x

Namespace id conflict detected.  <title>=مستخدم:منصورالواقدي/مقالات, <namespace>=105, mapped_namespace=2


fr
['/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current5.xml-p6092356p7494128.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current4.xml-p2516882p4016881.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current3.xml-p2427546p2516881.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current3.xml-p927546p2427545.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current6.xml-p8994129p10494128.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current4.xml-p4016882p4592355.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current6.xml-p7494129p8994128.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current6.xml-p11994129p13494128.bz2', '/mnt/data/xmldatadumps/public/frwiki/20200820/frwiki-20200820-pages-meta-current1.xml-p1p275787.bz2', '/mnt/data/

es
['/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current4.xml-p3119673p4080860.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current6.xml-p9424816p9562485.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current5.xml-p5580861p6424815.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current1.xml-p1p143637.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current3.xml-p597335p1619672.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current4.xml-p1619673p3119672.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current5.xml-p4080861p5580860.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current6.xml-p6424816p7924815.bz2', '/mnt/data/xmldatadumps/public/eswiki/20200820/eswiki-20200820-pages-meta-current6.xml-p7924816p9424815.bz2', '/mnt/data/xml