# Extract  and process Simple Wikipedia dumps

## Imports

In [12]:
import sys
sys.path.insert(1, './wikipedia/')
from WikiExtractor import *

import requests
from bs4 import BeautifulSoup
import os
import sys
from keras.utils import get_file
import bz2
import subprocess
import xml.sax
import mwparserfromhell
import re, html
from timeit import default_timer as timer
import gc
import json
from tqdm import tqdm
from multiprocessing import Pool
from itertools import chain # List of lists to single list
from functools import partial # Sending keyword arguments in map
from multiprocessing.dummy import Pool as Threadpool

Using TensorFlow backend.


## Downloading Simple Wikipedia articles

In [2]:
base_url = 'https://dumps.wikimedia.org/simplewiki/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')

# Find the links that are dates of dumps
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href')]
dumps

['../',
 '20200201/',
 '20200220/',
 '20200301/',
 '20200401/',
 '20200420/',
 '20200501/',
 '20200520/',
 '20200601/',
 '20200620/',
 '20200701/',
 'latest/']

In [3]:
dump_url = base_url + '20200701/'
print(dump_url)

# Retrieve the html
dump_html = requests.get(dump_url).text
dump_html[:10]

https://dumps.wikimedia.org/simplewiki/20200701/


'<!DOCTYPE '

In [4]:
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Find li elements with the class file
soup_dump.find_all('li', {'class': 'file'}, limit = 10)[:4]

[<li class="file"><a href="/simplewiki/20200701/simplewiki-20200701-pages-articles-multistream.xml.bz2">simplewiki-20200701-pages-articles-multistream.xml.bz2</a> 183.3 MB</li>,
 <li class="file"><a href="/simplewiki/20200701/simplewiki-20200701-pages-articles-multistream-index.txt.bz2">simplewiki-20200701-pages-articles-multistream-index.txt.bz2</a> 2.7 MB</li>,
 <li class="file"><a href="/simplewiki/20200701/simplewiki-20200701-pages-meta-history.xml.7z">simplewiki-20200701-pages-meta-history.xml.7z</a> 883.6 MB</li>,
 <li class="file"><a href="/simplewiki/20200701/simplewiki-20200701-pages-meta-history.xml.bz2">simplewiki-20200701-pages-meta-history.xml.bz2</a> 1.7 GB</li>]

In [5]:
files = []

# Search through all files
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    # Select the relevant files
    if 'pages-articles' in text:
        files.append((text.split()[0], text.split()[1:]))
        
files[:5]

[('simplewiki-20200701-pages-articles-multistream.xml.bz2', ['183.3', 'MB']),
 ('simplewiki-20200701-pages-articles-multistream-index.txt.bz2',
  ['2.7', 'MB']),
 ('simplewiki-20200701-pages-articles.xml.bz2', ['167.5', 'MB'])]

In [6]:
files_to_download = [file[0] for file in files if '.xml' and not 'multistream' in file[0]]
print(len(files_to_download))
files_to_download

1


['simplewiki-20200701-pages-articles.xml.bz2']

In [7]:
keras_home = '/home/workstation/.keras/datasets/simplewiki/'

data_paths = []
file_info = []

# Iterate through each file
for file in files_to_download:
    path = keras_home + file
    
    # Check to see if the path exists (if the file is already downloaded)
    if not os.path.exists(path):
        print('Downloading')
        # If not, download the file
        data_paths.append(get_file(fname=path, origin=dump_url+file))
        # Find the file size in MB
        file_size = os.stat(path).st_size / 1e6
        
        # Find the number of articles
#         file_articles = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
#         file_info.append((file, file_size, file_articles))
        file_info.append((file, file_size))
        
    # If the file is already downloaded find some information
    else:
        data_paths.append(path)
        # Find the file size in MB
        file_size = os.stat(path).st_size / 1e6
        
        # Find the number of articles
#         file_number = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
#         file_info.append((file.split('-')[-1], file_size, file_number))
        file_info.append((file, file_size))


In [8]:
file_info

[('simplewiki-20200701-pages-articles.xml.bz2', 175.659444)]

## Parsing the articles

In [9]:
med_conditions = pd.read_json('../wikipedia/extracted/medical_conditions.json', lines=True)
med_conditions.columns

Index(['title', 'parsed_text', 'timestamp', 'text_length'], dtype='object')

In [10]:
list_of_medcon = med_conditions.title.tolist()
print(len(list_of_medcon))
list_of_medcon

8677


['Autism',
 'Motor neuron disease',
 'Acute disseminated encephalomyelitis',
 'Ataxia',
 'Abscess',
 'Arachnophobia',
 'Arthritis',
 'Aphasia',
 'Albinism in humans',
 'Alcoholism',
 'Amputation',
 'Arteriovenous malformation',
 'Brain abscess',
 'Bipolar disorder',
 'Bacterial vaginosis',
 'Plague (disease)',
 'Bipolar I disorder',
 'Catatonia',
 'Creutzfeldt–Jakob disease',
 'Coma',
 'Bradycardia',
 'Coronary artery disease',
 'Congenital iodine deficiency syndrome',
 'Chagas disease',
 'Chlamydia',
 'Candidiasis',
 'Color blindness',
 'Cholera',
 'Cerebral arteriovenous malformation',
 'Charcot–Marie–Tooth disease',
 'Central pontine myelinolysis',
 'Alcohol intoxication',
 'Down syndrome',
 'Dyslexia',
 'Major depressive disorder',
 'Endocarditis',
 'Expressive aphasia',
 'Epilepsy',
 'Essential tremor',
 'Fugue state',
 'Flatulence',
 'Foix–Alajouanine syndrome',
 'Goitre',
 'Genetic disorder',
 'Hypoxia (medical)',
 'Hypoglycemia',
 'Hyperthyroidism',
 'Hemiparesis',
 'Haemophili

In [18]:
def clean(text):
    # Adjust mall-formed tags
    text = text.replace("< ","<").replace(" >",">")
    
    # Remove well-formed tags and section headers
    # tag_re = re.compile(r'(<!--(.*)-->|<[^>].*>|==(.*)==)')
    tag_re = re.compile(r'(<!--[\s\S]*?-->|==(.*)==)')
    text = tag_re.sub('', text)
    
    # Remove references in <ref>...</ref>
    ref1_re = re.compile(r'<ref>(.*?)<\/ref>')
    text = ref1_re.sub('', text)
    # Remove references in <ref name="...">...</ref>
    ref2_re = re.compile(r'<ref([^\/]*?)>(.*?)<\/ref>')
    text = ref2_re.sub('', text)
    # Remove references in <ref name="..." />
    ref3_re = re.compile(r'<ref(.*?)>')
    text = ref3_re.sub('', text)
    
    # Remove * as bullet points
    text = re.sub("\* |\*", "", text)
    
    # Remove # as enumerated points
    text = re.sub("\# ", "", text)

    # Remove too many whitespaces
    many_whitespaces = re.compile(' +')
    text = many_whitespaces.sub(' ', text)

    # Remove too many new lines
    many_new_lines = re.compile('(\n )+')
    text = many_new_lines.sub('\n\n', text)
    
    # Remove '& nbsp;'
    text = re.sub("& nbsp;", "", text)

#     # Clean up anything else by escaping
#     text = html.escape(text)

    return text

def process_article(title, text, timestamp, template = 'Infobox medical condition'):
    """Process a wikipedia article looking for template"""
    #matches = False
    #if title in list_of_medcon:
        #matches = True
        
    text = replaceInternalLinks(text) #method from WikiExtractor
    text = replaceExternalLinks(text) #method from WikiExtractor

    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)

    # Search through templates for the template
    matches = wikicode.filter_templates(matches = template)

    if matches or (title in list_of_medcon):
        # default: strip_code(normalize=True, collapse=True, keep_template_params=False)
        parsed_text = wikicode.strip_code().strip()
        parsed_text = clean(parsed_text)
        # Find approximate length of article (character count, incl spaces)
        text_length = len(wikicode.strip_code().strip())

        return {"title": title, "parsed_text": parsed_text, "timestamp": timestamp, "text_length": text_length}

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Parse through XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._medical_conditions = []
        self._article_count = 0
        self._non_matches = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._article_count += 1
            # Search through the page to see if the page is about a medical condition
            medical_condition = process_article(**self._values, template = 'Infobox medical condition')
            # Append to the list of medical conditions
            if medical_condition:
                self._medical_conditions.append(medical_condition)

                # # If json file doesn't exist yet, create one, and append to it in the next iterations
                # with open('/Users/linh/Documents/GitHub/Thesis_Project/data/wikipedia/extracted/med_conditions/manually_save/p17620560p18754723.json', 'a') as fout:
                #     fout.write(json.dumps(medical_condition) + '\n')

                # print(f'Found medical conditions: {len(self._medical_conditions)}')

def find_med_conditions(data_path, limit = None, save = True):
    """Find all the articles about a medical condition from a compressed wikipedia XML dump.
       `limit` is an optional argument to only return a set number of articles.
        If save, articles are saved to partition directory based on file name"""
    
    partition_dir = '/home/workstation/Desktop/linh_thesis/datasources/simplewiki/extracted/'
    # Create file name based on partition name
    #p_str = data_path.split('-')[-1].split('.')[-2]
    p_str = "simplewiki_medcon"
    
    if os.path.exists(f'{partition_dir}{p_str}.json'):
        print(f"File '{data_path}' already processed.")
    else:
        print("Processing file: ", data_path)

        start = timer()
        # Object for handling xml
        handler = WikiXmlHandler()

        # Parsing object
        parser = xml.sax.make_parser()
        parser.setContentHandler(handler)

        # Iterate through compressed file
        for i, line in enumerate(tqdm_notebook(subprocess.Popen(['bzcat'],
                                 stdin = open(data_path),
                                 stdout = subprocess.PIPE).stdout)):

            if (i + 1) % 10000 == 0:
                print(f'Processed {i + 1} lines so far.', end = '\r')

            try:
                parser.feed(line)
            except mwparserfromhell.parser.ParserError as e:
                print(e)
                pass
            except Exception:
                pass
            except StopIteration:
                break

            # Optional limit
            if limit is not None and len(handler._medical_conditions) >= limit:
                return handler._medical_conditions
        
        end = timer()
        medical_conditions = handler._medical_conditions
        print(f'\nSearched through {handler._article_count} articles.\nFound {len(medical_conditions)} medical conditions.')
        
        if save:
            out_dir = f'{partition_dir}{p_str}.json'

            # Open the file
            with open(out_dir, 'w') as fout:
                # Write as json
                for med_condition in handler._medical_conditions:
                    fout.write(json.dumps(med_condition) + '\n')

            print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')

        # Memory management
        del handler
        del parser
        gc.collect() # in case of multithreading
        return None

def read_data(file_path):
    """Read in json data from `file_path`"""

    data = []

    # Open the file and load in json
    with open(file_path, 'rt', encoding='utf8') as fin:
        for i,l in enumerate(fin.readlines()):
            data.append(json.loads(l))

            if i==1:
                print("========================================")
                print(l)

    return data


## Search for all pages about a medical condition on Simple Wikipedia

In [15]:
import pprint

keras_home = '/home/workstation/.keras/datasets/simplewiki/'
partitions = [keras_home + file for file in sorted(os.listdir(keras_home)) if 'xml' in file]

print(len(partitions))
pprint.pprint(partitions)

1
['/home/workstation/.keras/datasets/simplewiki/simplewiki-20200701-pages-articles.xml.bz2']


In [18]:
# Via 'Infobox medical condition' template
start = timer()

find_med_conditions(data_path=partitions[0])

end = timer()
print(f'{end - start} seconds elapsed.')

Processing file:  /home/workstation/.keras/datasets/simplewiki/simplewiki-20200701-pages-articles.xml.bz2


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Processed 15950000 lines so far.

Searched through 312537 articles.
Found 27 medical conditions.

Found 27 medical conditions in 617 seconds.
616.6127884179996 seconds elapsed.


In [16]:
# Via list of extracted medical conditions from Wikipedia
start = timer()

find_med_conditions(data_path=partitions[0])

end = timer()
print(f'{end - start} seconds elapsed.')

Processing file:  /home/workstation/.keras/datasets/simplewiki/simplewiki-20200701-pages-articles.xml.bz2


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Processed 15950000 lines so far.

Searched through 312537 articles.
Found 813 medical conditions.
94.91557847600052 seconds elapsed.


In [20]:
# Combined
start = timer()

find_med_conditions(data_path=partitions[0])

end = timer()
print(f'{end - start} seconds elapsed.')


Processing file:  /home/workstation/.keras/datasets/simplewiki/simplewiki-20200701-pages-articles.xml.bz2


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Processed 15950000 lines so far.

Searched through 312537 articles.
Found 814 medical conditions.
656.3651013910021 seconds elapsed.


## Extract all wikipedia disease articles via DBpedia

In [4]:
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
from SPARQLWrapper import SPARQLWrapper, JSON
import urllib.request
from urllib.error import HTTPError
import traceback
from time import sleep

pd.set_option('display.max_colwidth', None)

proxy_support = urllib.request.ProxyHandler({})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

sparql = SPARQLWrapper("http://dbpedia.org/sparql", agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")

def query_to_df(query):
    attempts = 20
    while attempts > 0:
        try:
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
        except HTTPError:
            if attempts == 1:
                print(traceback.format_exc())
                print("The above error has occurred for the following query: " + "\n" + query)
            attempts -= 1
            sleep(1)
            continue
        except:
            print(traceback.format_exc())
            print("The above error has occurred for the following query: " + "\n" + query)    
        break
        
    dbpedia_diseases = []
    for item in results['results']['bindings']:
        dbpedia_diseases.append(OrderedDict({
            'disease': item['disease']['value'][28:],
            'dbpediapage': item['disease']['value'],
            'wikipage': item['wikipage']['value']
                #if 'wikipage' in item else None,
#             'locationCountry': item['locationCountry']['value'][28:] 
#                 if 'locationCountry' in item else None,
        }))

    df = pd.DataFrame(dbpedia_diseases)
    
    return df

query = """
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX prov: <http://www.w3.org/ns/prov#>

        SELECT ?disease ?wikipage WHERE {
            ?disease a dbo:Disease .
            ?disease prov:wasDerivedFrom ?wikipage.
        }
        """

diseases_df = query_to_df(query)
diseases_df


Unnamed: 0,disease,dbpediapage,wikipage
0,Meredith_effect,http://dbpedia.org/resource/Meredith_effect,http://en.wikipedia.org/wiki/Meredith_effect?oldid=732198477
1,5-alpha-reductase_deficiency,http://dbpedia.org/resource/5-alpha-reductase_deficiency,http://en.wikipedia.org/wiki/5-alpha-reductase_deficiency?oldid=741212000
2,Abdominal_obesity,http://dbpedia.org/resource/Abdominal_obesity,http://en.wikipedia.org/wiki/Abdominal_obesity?oldid=743874742
3,Achondroplasia,http://dbpedia.org/resource/Achondroplasia,http://en.wikipedia.org/wiki/Achondroplasia?oldid=742386653
4,Acrochordon,http://dbpedia.org/resource/Acrochordon,http://en.wikipedia.org/wiki/Acrochordon?oldid=730346918
...,...,...,...
9995,Chorioretinitis,http://dbpedia.org/resource/Chorioretinitis,http://en.wikipedia.org/wiki/Chorioretinitis?oldid=738589565
9996,Colorectal_cancer,http://dbpedia.org/resource/Colorectal_cancer,http://en.wikipedia.org/wiki/Colorectal_cancer?oldid=743654315
9997,Colpocephaly,http://dbpedia.org/resource/Colpocephaly,http://en.wikipedia.org/wiki/Colpocephaly?oldid=708776034
9998,Comorbidity,http://dbpedia.org/resource/Comorbidity,http://en.wikipedia.org/wiki/Comorbidity?oldid=724756379
