# Extracting pages containing keywords from a dump

This notebook extracts the pages of a given dump that contain at least one keyword from a given set.

Code inspired by: https://github.com/mediawiki-utilities/python-mwxml/blob/master/ipython/labs_example.ipynb

## Define paths to visit

In [1]:
import glob
import mwxml
import mwtypes
import csv
import datetime
import re

# Glob pattern matching the compressed dump files to scan.
DUMP_GLOB = '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current*.bz2'
# Position (in sorted order) of the first dump file to process; earlier files are skipped.
# NOTE(review): presumably the skipped files were handled in an earlier run -- confirm.
FIRST_DUMP_INDEX = 28

# glob.glob() returns matches in arbitrary order, so the list is sorted before
# slicing by position; otherwise the selected subset could differ between runs.
paths = sorted(mwtypes.files.normalize_path(path) for path in glob.glob(DUMP_GLOB))
paths = paths[FIRST_DUMP_INDEX:]
paths

['/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current21.xml-p22722158p23927983.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current22.xml-p23927984p25427984.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current22.xml-p25427984p26823660.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current23.xml-p26823661p28323661.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current23.xml-p28323661p29823661.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current23.xml-p29823661p30503449.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current24.xml-p30503451p32003451.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current24.xml-p32003451p33503451.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current24.xml-p33503451p33952815.bz2',
 '/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current25

In [2]:
# Show how many dump files were selected for processing.
print(len(paths))

29


In [1]:
# Comma-separated list of the terms to search for.
keywords_raw1 = "flood,floods,flooding,flooded,inundation"
# Trim whitespace and drop empty entries: a stray space would produce a keyword
# that never matches, and an empty keyword would match every text because ''
# is a substring of any string.
keywords = [term.strip() for term in keywords_raw1.split(',') if term.strip()]
print(keywords)

['flood', 'floods', 'flooding', 'flooded', 'inundation']


## Find keywords function

Returns a boolean indicating whether any of the keywords was found in the given text.

In [2]:
def find_keywords(text):
    """Report whether `text` contains any entry of the global `keywords` list.

    Matching is a plain, case-sensitive substring test (`keyword in text`),
    so a keyword is also found when it is only part of a longer word.

    Returns:
        True as soon as one keyword is found, False if none is.
    """
    for keyword in keywords:
        if keyword in text:
            return True
    return False

In [3]:
# Quick sanity check: the sentence contains the keyword 'flooded', so this should be True.
find_keywords('This city is flooded')

True

## XML Processor on path

In [5]:
def _split_sentences(paragraph):
    """Split one paragraph into non-empty, sentence-like fragments."""
    sentences = []
    # Break after '.', '?' or a newline when the next character is a space,
    # '[', '<' or a newline.
    for fragment in re.split('(?<=[.?\n])[ [<\n]', paragraph):
        # A fragment can still span several lines; keep every non-empty line.
        sentences.extend(line for line in fragment.splitlines() if line)
    return sentences


def process_dump(dump, path):
    """Yield keyword hits found in the namespace-0 pages of one dump.

    Args:
        dump: iterable of page objects. Each page must expose `namespace`,
            `title` and `id`, and iterating it must give revision objects
            with a `text` attribute.
        path: path of the dump file; only used for the progress message.

    Yields:
        One non-empty list per paragraph containing hits. Each element is a
        dict with the keys 'page', 'page_id', 'sentence' and 'paragraph'.
        A sentence is a hit when it, or the page title, contains a keyword
        according to `find_keywords`.
    """
    print(path, datetime.datetime.now())
    for page in dump:
        if int(page.namespace) != 0:
            continue

        # Read the attributes directly. Parsing str(page) with a regex cut
        # titles at their first comma and stripped apostrophes from them.
        page_name = str(page.title)
        page_id = str(page.id)

        try:
            # Walk to the last revision. Resetting `revision` first prevents a
            # page without revisions from reusing the previous page's text.
            revision = None
            for revision in page:
                pass
            text = revision.text if revision is not None else None
        except Exception as error:
            # Best effort: report the unreadable page and move on. Only
            # `Exception` is caught and the `yield` below sits outside the
            # `try`, so GeneratorExit and KeyboardInterrupt still propagate.
            print('Skipping page', page_id, 'in', path, '-', repr(error))
            continue

        if not text:
            continue

        # The title check does not depend on the sentence: evaluate it once.
        title_matches = find_keywords(page_name)

        for paragraph in text.split('\n\n'):
            flat_paragraph = paragraph.replace('\n', '')
            output = [
                {
                    'page': page_name,
                    'page_id': page_id,
                    'sentence': sentence,
                    'paragraph': flat_paragraph,
                }
                for sentence in _split_sentences(paragraph)
                if title_matches or find_keywords(sentence)
            ]
            if output:
                yield output

We will use the function `mwxml.map()` to process the dumps in parallel and write the results to an output file.

In [6]:
# Running total of rows written to the CSV file.
count = 0
# Column order of the output file; matches the keys of the dicts yielded by process_dump.
dict_keys = ['page', 'page_id', 'sentence', 'paragraph']
filename = 'sentences_keyword.csv'

# The encoding is set explicitly: without it open() falls back to the locale's
# preferred encoding, which may be unable to represent the text being written
# and makes the output file differ between machines.
with open(filename, 'w', newline='', encoding='utf-8') as myfile:
    wr = csv.DictWriter(myfile, dict_keys)
    wr.writeheader()

    # Each item produced through mwxml.map() is the list of row dicts that
    # process_dump yielded for one paragraph.
    for rows in mwxml.map(process_dump, paths, 8):
        wr.writerows(rows)
        count += len(rows)
        


/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current22.xml-p23927984p25427984.bz2 2019-12-28 15:29:50.642495
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current21.xml-p22722158p23927983.bz2 2019-12-28 15:29:50.676202
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current22.xml-p25427984p26823660.bz2 2019-12-28 15:29:50.785303
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current23.xml-p26823661p28323661.bz2 2019-12-28 15:29:50.837174
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current23.xml-p28323661p29823661.bz2 2019-12-28 15:29:50.817941
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current23.xml-p29823661p30503449.bz2 2019-12-28 15:29:50.861558
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current24.xml-p32003451p33503451.bz2 2019-12-28 15:29:50.870490
/mnt/vmdata/bigpicture-wssc/jrando/enwiki-20190520-pages-meta-current24.xml-p30503451p32003451.bz2 2019-12-28 15:29:50