# Extract and process Wikipedia dumps

In [1]:
import requests
import os
import sys
from keras.utils import get_file
import bz2
import subprocess
import xml.sax
import mwparserfromhell
import re, html
from WikiExtractor import * #Available at: https://github.com/attardi/wikiextractor
from timeit import default_timer as timer
import gc
import json
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Pool
from itertools import chain # List of lists to single list
from functools import partial # Sending keyword arguments in map
from multiprocessing.dummy import Pool as Threadpool

Using TensorFlow backend.


In [2]:
# Patterns used by `clean`, compiled once instead of on every call.
# All of them are raw strings so that backslashes reach the regex engine
# untouched (the non-raw "\*" / "\#" forms are invalid string escapes).
_COMMENT_OR_HEADER_RE = re.compile(r'(<!--[\s\S]*?-->|==(.*)==)')
_REF_PLAIN_RE = re.compile(r'<ref>(.*?)</ref>')
_REF_WITH_ATTRS_RE = re.compile(r'<ref([^/]*?)>(.*?)</ref>')
_REF_OPEN_TAG_RE = re.compile(r'<ref(.*?)>')
_BULLET_RE = re.compile(r'\* ?')
_ENUMERATION_RE = re.compile(r'# ')
_SPACES_RE = re.compile(r' +')
_NEWLINE_SPACE_RE = re.compile(r'(\n )+')


def clean(text):
    """Remove leftover wiki/HTML markup and normalise whitespace in `text`.

    The steps are applied in this order:

    1. Repair malformed tags by dropping the space after ``<`` and before ``>``.
    2. Delete HTML comments (possibly spanning lines) and ``==...==`` section
       headers (matched within a single line).
    3. Delete ``<ref>...</ref>`` and ``<ref ...>...</ref>`` references whose
       content sits on one line, then any remaining tag starting with ``<ref``
       (e.g. ``<ref name="x" />``).
    4. Delete every ``*`` (with one following space, if present) and every
       ``"# "`` sequence, i.e. bullet and enumeration markers.
    5. Collapse runs of spaces into one space and runs of ``"\\n "`` into
       ``"\\n\\n"``.
    6. Delete the literal ``"& nbsp;"``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Adjust malformed tags
    text = text.replace("< ", "<").replace(" >", ">")

    # Remove HTML comments and section headers
    text = _COMMENT_OR_HEADER_RE.sub('', text)

    # Remove references: <ref>...</ref>, then <ref name="...">...</ref>,
    # then whatever tag starting with "<ref" is still left (self-closing refs)
    text = _REF_PLAIN_RE.sub('', text)
    text = _REF_WITH_ATTRS_RE.sub('', text)
    text = _REF_OPEN_TAG_RE.sub('', text)

    # Remove * as bullet points
    text = _BULLET_RE.sub('', text)

    # Remove # as enumerated points
    text = _ENUMERATION_RE.sub('', text)

    # Remove too many whitespaces
    text = _SPACES_RE.sub(' ', text)

    # Remove too many new lines
    text = _NEWLINE_SPACE_RE.sub('\n\n', text)

    # Remove '& nbsp;'
    text = text.replace("& nbsp;", "")

    return text

def process_article(title, text, timestamp, template = 'Infobox book'):
    """Return the cleaned content of a Wikipedia article if it uses `template`.

    Args:
        title: Article title; copied into the result unchanged.
        text: Raw wikitext of the article.
        timestamp: Revision timestamp; copied into the result unchanged.
        template: Passed as `matches` to mwparserfromhell's `filter_templates`
            to decide whether the article qualifies.

    Returns:
        A dict with the keys "title", "parsed_text", "timestamp" and
        "text_length" when at least one matching template is found, otherwise
        None. "text_length" is the character count (including spaces) of the
        stripped text *before* `clean` is applied to it.
    """
    text = replaceInternalLinks(text)  # method from WikiExtractor
    text = replaceExternalLinks(text)  # method from WikiExtractor

    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)

    # Search through templates for the template
    if not wikicode.filter_templates(matches = template):
        return None

    # default: strip_code(normalize=True, collapse=True, keep_template_params=False)
    # Strip the markup once and reuse the result for both the cleaned text
    # and the length, instead of running strip_code() twice.
    stripped_text = wikicode.strip_code().strip()

    return {
        "title": title,
        "parsed_text": clean(stripped_text),
        "timestamp": timestamp,
        "text_length": len(stripped_text),
    }

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects medical-condition articles.

    The text of every <title>, <text> and <timestamp> element is recorded in
    `_values`. When a <page> element closes, the recorded values are passed to
    `process_article` with the template 'Infobox medical condition'; a truthy
    result is appended to `_medical_conditions`.
    """

    def __init__(self):
        super().__init__()
        self._buffer = None              # text chunks of the element being read
        self._values = {}                # last seen text per tracked tag name
        self._current_tag = None         # tracked tag currently open, if any
        self._medical_conditions = []    # results returned by process_article
        self._article_count = 0          # number of closed <page> elements
        self._non_matches = []           # not populated anywhere in this class

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            # NOTE(review): the chunks are joined with a space, which inserts a
            # space wherever the parser split the text. `clean` compensates for
            # this (e.g. "< ", " >", "& nbsp;"), so it is kept as is -- confirm
            # before switching to ''.join.
            self._values[name] = ' '.join(self._buffer)
            # Stop collecting: without this reset, character data of every
            # following untracked element was appended to a buffer nobody reads.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._article_count += 1
            # Search through the page to see if the page is about a medical condition
            try:
                medical_condition = process_article(**self._values, template = 'Infobox medical condition')
            except mwparserfromhell.parser.ParserError as err:
                # Handle the failure here so that one unparsable article does
                # not propagate out of the SAX callback and interrupt the feed.
                print(f"Skipping article {self._values.get('title')!r}: {err}")
                return
            # Append to the list of medical conditions
            if medical_condition:
                self._medical_conditions.append(medical_condition)

def find_med_conditions(data_path, limit = None, save = True,
                        partition_dir = '/home/workstation/Desktop/linh_thesis/datasources/wikipedia/extracted/articles/med_condition_partitions_V4/'):
    """Find all the articles about a medical condition in a compressed wikipedia XML dump.

    The dump is decompressed with the external `bzcat` program and fed line by
    line to a SAX parser driven by `WikiXmlHandler`.

    Args:
        data_path: Path of the .bz2 dump partition. The output file name is the
            part of the last '-'-separated component before its final
            extension (e.g. '...xml-p10p30302.bz2' -> 'p10p30302').
        limit: Optional number of articles; once that many have been found
            they are returned immediately and nothing is saved.
        save: If true, the articles are written to
            `<partition_dir>/<partition name>.json`, one JSON object per line.
        partition_dir: Directory holding the per-partition result files. A
            partition whose result file already exists is skipped.

    Returns:
        The list of found articles when `limit` is reached, otherwise None.
    """
    # Create file name based on partition name
    p_str = data_path.split('-')[-1].split('.')[-2]
    out_path = os.path.join(partition_dir, f'{p_str}.json')

    if os.path.exists(out_path):
        print(f"File '{data_path}' already processed.")
        return None

    print("Processing file: ", data_path)

    # Object for handling xml
    handler = WikiXmlHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    failed_lines = 0

    # Iterate through compressed file. Both the input file and the bzcat
    # process are context-managed so they are closed/reaped on every exit path,
    # including the early return below.
    with open(data_path, 'rb') as compressed, \
            subprocess.Popen(['bzcat'],
                             stdin = compressed,
                             stdout = subprocess.PIPE) as bzcat:
        for line in bzcat.stdout:
            try:
                parser.feed(line)
            except mwparserfromhell.parser.ParserError as e:
                print(e)
            except Exception as e:
                # Best effort: keep going, but do not hide the failure.
                # Only the first error is printed to avoid flooding the output.
                failed_lines += 1
                if failed_lines == 1:
                    print(f"Error while parsing '{data_path}': {e!r}")

            # Optional limit
            if limit is not None and len(handler._medical_conditions) >= limit:
                return handler._medical_conditions

    if failed_lines:
        print(f"{failed_lines} line(s) of '{data_path}' raised an error while being parsed.")

    medical_conditions = handler._medical_conditions
    print(f'\nSearched through {handler._article_count} articles.\nFound {len(medical_conditions)} medical conditions.')

    if save:
        os.makedirs(partition_dir, exist_ok = True)

        # Write one JSON object per line
        with open(out_path, 'w') as fout:
            fout.writelines(json.dumps(med_condition) + '\n'
                            for med_condition in medical_conditions)

        print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')

    # Memory management
    del handler
    del parser
    gc.collect() # in case of multithreading
    return None

def read_data(file_path):
    """Load a JSON-lines file and return its records as a list.

    Every line of `file_path` (read as UTF-8) is decoded with `json.loads`.
    The second line of the file, if there is one, is echoed to stdout below a
    separator line.
    """
    records = []

    with open(file_path, mode='rt', encoding='utf8') as handle:
        for index, raw_line in enumerate(handle):
            records.append(json.loads(raw_line))

            # Only the second line (index 1) is echoed
            if index != 1:
                continue
            print("========================================")
            print(raw_line)

    return records


## Search for all pages about a medical condition on Wikipedia

In [3]:
import pprint

# Directory that holds the downloaded dump partitions
keras_home = '/home/workstation/.keras/datasets/'

# Full paths of all partition files (names containing 'xml-p'), sorted by name
partitions = [
    os.path.join(keras_home, name)
    for name in sorted(os.listdir(keras_home))
    if 'xml-p' in name
]

print(len(partitions))
pprint.pprint(partitions)

58
['/home/workstation/.keras/datasets/enwiki-20200201-pages-articles1.xml-p10p30302.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles10.xml-p2336425p3046511.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles11.xml-p3046517p3926861.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles12.xml-p3926864p5040435.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles13.xml-p5040438p6197593.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles14.xml-p6197599p7697599.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles14.xml-p7697599p7744799.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles15.xml-p7744803p9244803.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles15.xml-p9244803p9518046.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles16.xml-p11018059p11539266.bz2',
 '/home/workstation/.keras/datasets/enwiki-

In [4]:
# For test run: select only the last four partition paths and show them.
# NOTE(review): the multiprocessing cell in this notebook iterates over
# `partitions`, not `data_paths` -- confirm which one is intended.
data_paths = partitions[-4:]
pprint.pprint(data_paths)

['/home/workstation/.keras/datasets/enwiki-20200201-pages-articles6.xml-p565314p892912.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles7.xml-p892914p1268691.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles8.xml-p1268693p1791079.bz2',
 '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles9.xml-p1791081p2336422.bz2']


In [4]:
# Show the number of CPUs reported by the OS (used to choose the pool size)
import os
os.cpu_count()

16

In [4]:
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement
from tqdm.notebook import tqdm as notebook_tqdm

# Create a pool of workers to execute processes
pool = Pool(processes = 14)

start = timer()

# Run partitions in parallel and collect the results as they finish
# (measured duration: 685-693s).
# Alternative that returns all results at once (measured duration: 679-687s):
#     results = pool.map(find_med_conditions, partitions)
try:
    results = list(notebook_tqdm(pool.imap_unordered(find_med_conditions, partitions),
                                 total = len(partitions)))
finally:
    # Always release the worker processes, even if a task raised
    pool.close()
    pool.join()

end = timer()
print(f'{end - start} seconds elapsed.')


File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles1.xml-p10p30302.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles12.xml-p3926864p5040435.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles13.xml-p5040438p6197593.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles11.xml-p3046517p3926861.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles10.xml-p2336425p3046511.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles18.xml-p15193075p16120541.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles19.xml-p17620560p18754723.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pages-articles17.xml-p11539268p13039268.bz2' already processed.
File '/home/workstation/.keras/datasets/enwiki-20200201-pa

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


HBox(children=(FloatProgress(value=0.0, max=58.0), HTML(value='')))


Searched through 479567 articles.
Found 105 medical conditions.
58 files processed.
1714.6890515350005 seconds elapsed.


__Isolated before:__

enwiki-20200201-pages-articles19.xml-p17620560p18754723.bz2

(Due to the following mwparserfromhell parser error: \
This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.)


__Processed PART 1:__

    p10p30302.json           p30304p88444.json        p50163464p51663464.json
    p11018059p11539266.json  p3046517p3926861.json    p5040438p6197593.json
    p11539268p13039268.json  p33503454p33952815.json  p51663464p53163464.json
    p1268693p1791079.json    p33952817p35452817.json  p53163464p54663464.json
    p13039268p13693066.json  p352690p565312.json      p565314p892912.json
    p13693075p15193075.json  p35452817p36952817.json  p57663464p59163464.json
    p15193075p16120541.json  p36952817p38067198.json  p59163464p60663464.json
    p16120560p17620560.json  p38067204p39567204.json  p60663464p62163464.json
    p1791081p2336422.json    p3926864p5040435.json    p6197599p7697599.json
    p18754736p20254736.json  p39567204p41067204.json  p62163464p63000741.json
    p200511p352689.json      p41067204p42567204.json  p7697599p7744799.json
    p20254736p21222156.json  p42567204p42663461.json  p7744803p9244803.json
    p21222161p22722161.json  p42663464p44163464.json  p88445p200507.json
    p22722161p23927980.json  p44163464p45663464.json  p892914p1268691.json
    p2336425p3046511.json    p45663464p47163464.json  p9244803p9518046.json
    p23927984p25427984.json  p47163464p48663464.json  p9518059p11018059.json
    p25427984p26823658.json  p48663464p50163464.json
    
__Processed PART 2:__

    p10p30302.json           p28323661p29823661.json  p48663464p50163464.json
    p11018059p11539266.json  p29823661p30503448.json  p50163464p51663464.json
    p11539268p13039268.json  p30304p88444.json        p5040438p6197593.json
    p1268693p1791079.json    p3046517p3926861.json    p51663464p53163464.json
    p13039268p13693066.json  p32003454p33503454.json  p53163464p54663464.json
    p13693075p15193075.json  p33503454p33952815.json  p54663464p56163464.json
    p15193075p16120541.json  p33952817p35452817.json  p56163464p57663464.json
    p16120560p17620560.json  p352690p565312.json      p565314p892912.json
    p17620560p18754723.json  p35452817p36952817.json  p57663464p59163464.json
    p1791081p2336422.json    p36952817p38067198.json  p59163464p60663464.json
    p18754736p20254736.json  p38067204p39567204.json  p60663464p62163464.json
    p200511p352689.json      p3926864p5040435.json    p6197599p7697599.json
    p20254736p21222156.json  p39567204p41067204.json  p62163464p63000741.json
    p21222161p22722161.json  p41067204p42567204.json  p7697599p7744799.json
    p22722161p23927980.json  p42567204p42663461.json  p7744803p9244803.json
    p2336425p3046511.json    p42663464p44163464.json  p88445p200507.json
    p23927984p25427984.json  p44163464p45663464.json  p892914p1268691.json
    p25427984p26823658.json  p45663464p47163464.json  p9244803p9518046.json
    p26823661p28323661.json  p47163464p48663464.json  p9518059p11018059.json

In [7]:
# Display the results collected during the test run
results

[[{'title': 'Autism',
   'parsed_text': ' \n\nAutism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior. Parents often notice signs during the first three years of their child\'s life. These signs often develop gradually, though some children with autism experience worsening in their communication and social skills after reaching developmental milestones at a normal pace. \n\nAutism is associated with a combination of genetic and environmental factors. Risk factors during pregnancy include certain infections, such as rubella, toxins including valproic acid, alcohol, cocaine, pesticides, lead, and air pollution, fetal growth restriction, and autoimmune diseases. Controversies surround other proposed environmental causes; for example, the vaccine hypothesis, which has been disproven. Autism affects information processing in the brain and how nerve cells and their synapses connect and organize; how

In [10]:
# Display the collected results
results

[[{'title': 'Exploding head syndrome',
   'parsed_text': ' \n\nExploding head syndrome (EHS) is a condition in which a person experiences unreal noises that are loud and of short duration when falling asleep or waking up. The noise may be frightening, typically occurs only occasionally, and is not a serious health concern. \n\nThe cause is unknown. Potential explanations include ear problems, temporal lobe seizure, nerve dysfunction, or specific genetic changes. \n\nThere is no high quality evidence to support treatment. \n\nIndividuals with exploding head syndrome hear or experience loud imagined noises as they are falling asleep or waking up, have a strong, often frightened emotional reaction to the sound, and do not report significant pain; around 10% of people also experience visual disturbances like perceiving visual static, lightning, or flashes of light. Some people may also experience heat, strange feelings in their torso, or a feeling of electrical tinglings that ascends to th

## Join data together

In [6]:
start = timer()

# Locations of the per-partition results and of the combined output file
articles_dir = '/home/workstation/Desktop/linh_thesis/datasources/wikipedia/extracted/articles/'
partition_dir = articles_dir + 'med_condition_partitions_V4/'
combined_path = articles_dir + 'medical_conditions_V4.json'

# List of files to read in
saved_files = [partition_dir + x for x in sorted(os.listdir(partition_dir)) if x.endswith('.json')]

# Read in the files as a list of lists, using a threadpool that is shut down
# again as soon as all files have been read
with Threadpool(processes = 16) as threadpool:
    results = threadpool.map(read_data, saved_files)

# Flatten the list of lists to a single list
med_condition_list = list(chain.from_iterable(results))

end = timer()

print(f'\nFound {len(med_condition_list)} medical conditions in {round(end - start)} seconds.')

if not os.path.exists(combined_path):
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # is fixed to utf8 (the encoding read_data uses) instead of relying on the
    # platform default.
    with open(combined_path, 'wt', encoding='utf8') as fout:
        for med_condition in med_condition_list:
            fout.write(json.dumps(med_condition, ensure_ascii=False) + '\n')
    print('File saved.')
else:
    print('File already saved.')

{"title": "Acute motor axonal neuropathy", "parsed_text": "Acute motor axonal neuropathy (AMAN) is a variant of Guillain\u2013Barr\u00e9 syndrome. It is characterized by acute paralysis and loss of reflexes without sensory loss. Pathologically, there is motor axonal degeneration with antibody-mediated attacks of motor nerves and nodes of Ranvier. \n\nA link to Campylobacter jejuni was suspected when a young girl was admitted to Second Teaching Hospital. She had become ill after feeding the family chickens. She developed acute paralysis and respiratory failure. Investigators discovered that several of the chickens in the home displayed similar symptoms and C. jejuni was found in their droppings. Several of the paralysis patients were found to have antibodies to C. jejuni and anti-GD1a antibodies, suggesting a link between the pathogen and the disease. In 2015, Zika virus was linked to AMAN. \n\nThe syndrome typically presents as a progressive flaccid symmetric paralysis with areflexia, 

{"title": "Wikipedia:WikiProject Medicine/Translation task force/RTT/Simple Tennis elbow", "parsed_text": " \n\nTennis elbow, also known as lateral epicondylitis, is a condition in which the outer part of the elbow becomes painful and tender. The pain may also extend into the back of the forearm and grip strength may be weak. Onset of symptoms is generally gradual. Golfer's elbow is a similar condition that affects the inside of the elbow. \n\nIt is due to excessive use of the muscles of the back of the forearm. Typically this occurs as a result of work or sports classically racquet sports. The diagnosis is typically based on the symptoms with medical imaging used to rule out other potential causes. It is more likely if pain increases when a person tries to bend back their wrist well their hand is held in a neutral position. It is classified as a chronic tendinosis not a tendinitis. \n\nTreatment involves decreasing activities that bring on the symptoms together with physical therapy. 

File saved.
