In [1]:
import os, sys
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
sys.path.insert(0, 'scripts')
from WikiPage import WikiPage

In [2]:
# Work from inside the data directory. Only change directory when we are not
# already there, so re-running this cell does not try to enter 'data/data'.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data')
# Alternative input file: 'Wikipedia-20170717213140.xml'
xmlfile = 'Wikipedia-fish.xml'

# Parse file

Walk through the pages in the XML file and, for each one, look for:
* title
* id
* text

At each step, save the extracted values as one record.

After reading the file, the collected records are written to a CSV file.


In [3]:
# Derive the CSV name by swapping only the file extension. A plain
# str.replace("xml", "csv") would also rewrite any "xml" inside the base name,
# and would return the input name unchanged when there is no "xml" at all.
outfile = os.path.splitext(xmlfile)[0] + ".csv"
# Starts out as an empty list.
data = []

# Parsing xml using ElementTree

For a short tutorial, see: https://docs.python.org/3/library/xml.etree.elementtree.html#module-xml.etree.ElementTree

In [4]:
def get_xml_field(xml_data):
    '''Return an element's tag with its ``{namespace}`` prefix removed.

    Parameters
    ----------
    xml_data : object with a string ``tag`` attribute
        Typically an ``xml.etree.ElementTree.Element`` whose tag looks like
        ``'{http://...}page'``.

    Returns
    -------
    str
        The piece of the tag that follows the first ``}``. A tag that has no
        ``}`` (no namespace prefix) is returned unchanged instead of raising
        ``IndexError``.
    '''
    pieces = xml_data.tag.split('}')
    # Without a namespace there is no '}' and split() yields a single piece,
    # so indexing [1] unconditionally would fail.
    return pieces[1] if len(pieces) > 1 else xml_data.tag

In [5]:
# Read the whole XML document into an ElementTree, then keep a handle on its
# top-level element for the cells below.
tree = ET.parse(source=xmlfile)
root = tree.getroot()

In [6]:
# The root Element exposes a tag name and a dictionary of attributes;
# print each on its own line.
print(root.tag, root.attrib, sep='\n')

{http://www.mediawiki.org/xml/export-0.10/}mediawiki
{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd', 'version': '0.10', '{http://www.w3.org/XML/1998/namespace}lang': 'en'}


In [7]:
# Slicing an Element gives its child nodes;
# show the tags of the first 10 entries
for node in root[:10]:
    print(node.tag)

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page


In [8]:
# Alternate way to get all children from root: iterating an Element yields its
# direct children, so list(root) collects them. (Element.getchildren() was
# deprecated in Python 3.2 and removed in Python 3.9.)
# view first 10 children of root
root_children = list(root)
print(root_children[:10])

[<Element '{http://www.mediawiki.org/xml/export-0.10/}siteinfo' at 0x11539b2c8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153a5138>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153a58b8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153adb88>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153c21d8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153c27c8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153c2e08>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153c7458>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153c7a98>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x1153c7b88>]


In [9]:
# Explore a subset of root (i.e. children of root)
print(root,'\n')
# Elements can be sliced and iterated directly; Element.getchildren() no longer
# exists on Python 3.9+.
for child in root[:4]:
    # Only descend into nodes that actually have children of their own
    if len(child) > 0:
        print(child.tag)
        for gchild in child:
            print(gchild.tag, '\n', gchild.text)

<Element '{http://www.mediawiki.org/xml/export-0.10/}mediawiki' at 0x11539b278> 

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}sitename 
 Wikipedia
{http://www.mediawiki.org/xml/export-0.10/}dbname 
 enwiki
{http://www.mediawiki.org/xml/export-0.10/}base 
 https://en.wikipedia.org/wiki/Main_Page
{http://www.mediawiki.org/xml/export-0.10/}generator 
 MediaWiki 1.30.0-wmf.10
{http://www.mediawiki.org/xml/export-0.10/}case 
 first-letter
{http://www.mediawiki.org/xml/export-0.10/}namespaces 
 
      
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}title 
 Category:Lists of fishes
{http://www.mediawiki.org/xml/export-0.10/}ns 
 14
{http://www.mediawiki.org/xml/export-0.10/}id 
 12161313
{http://www.mediawiki.org/xml/export-0.10/}revision 
 
      
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}title 
 Fish
{http://www.mediawiki.org/xml/export-0.10/}ns 
 0

In [10]:
# Dive deeper into the tree by exploring a page from root (i.e. grandchildren of root)
child_index = 30
print(root[child_index],'\n')
# Children are nested, and we can access specific child nodes by index
# Fully explore branches of a page
# (Elements are indexed and iterated directly; Element.getchildren() was
# removed in Python 3.9.)
print('Branches of %s:\n' % (root[child_index].tag))
for child in root[child_index]:
    # Only nodes with children of their own are expanded
    if len(child) > 0:
        print(child.tag)
        for gchild in child:
            print(gchild.tag, '\n', gchild.text)

<Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115427728> 

Branches of {http://www.mediawiki.org/xml/export-0.10/}page:

{http://www.mediawiki.org/xml/export-0.10/}revision
{http://www.mediawiki.org/xml/export-0.10/}id 
 684775879
{http://www.mediawiki.org/xml/export-0.10/}parentid 
 684775504
{http://www.mediawiki.org/xml/export-0.10/}timestamp 
 2015-10-08T17:48:46Z
{http://www.mediawiki.org/xml/export-0.10/}contributor 
 
        
{http://www.mediawiki.org/xml/export-0.10/}minor 
 None
{http://www.mediawiki.org/xml/export-0.10/}comment 
 Reverted edits by [[Special:Contribs/208.190.90.125|208.190.90.125]] ([[User talk:208.190.90.125|talk]]) to last version by Addbot
{http://www.mediawiki.org/xml/export-0.10/}model 
 wikitext
{http://www.mediawiki.org/xml/export-0.10/}format 
 text/x-wiki
{http://www.mediawiki.org/xml/export-0.10/}text 
 [[Category:Animal products]]
[[Category:Whaling|Products]]
[[Category:Whales]]
{http://www.mediawiki.org/xml/export-0.10/}sha1 
 b

## Explore findall()

In [11]:
base = '{http://www.mediawiki.org/xml/export-0.10/}'
# Match every <text> element under a <revision> of any direct child of root.
# Run the query once and reuse the result for both the count and the preview.
rev_texts = root.findall('*/%srevision/%stext' % (base, base))
print('# of hits:', len(rev_texts), '\n')
for rev_text in rev_texts[:5]:
    # An element with no text content has .text == None; treat that as an
    # empty string so the slice does not raise TypeError.
    print((rev_text.text or '')[:500])

# of hits: 48 

[[Category:Lists of vertebrates|Fishes]]
[[Category:Fish]]
{{pp|small=yes}} 
{{for2|fish as eaten by humans|[[Fish as food]]|other uses|[[Fish (disambiguation)]]}}
{{sprotect2}}
{{Use dmy dates|date=October 2014}}
{{Paraphyletic group
|name=Fish
|fossil_range={{fossilrange|Mid Cambrian|Recent|latest=0}}
|image=Georgia Aquarium - Giant Grouper edit.jpg
|image_caption=[[Giant grouper]] swimming among [[Shoaling and schooling|schools]] of other fish
|image_width=250px
|image2=Pterois volitans Manado-e edit.jpg
|image2_caption= Head-on view of a [[red lionf
{{WPSS-cat}}
{{Stub Category|article=[[fish]]|newstub=fish-stub|category=Fish}}
{{Category TOC|numerals=no}}

[[Category:Chordate stubs| Fish]]
[[Category:WikiProject Fishes|μ]]
[[Category:Animal health]]
[[Category:Fish|Health]]
{{Commons cat|Fish anatomy}}

[[Category:Fish|Anatomy]]
[[Category:Vertebrate anatomy]]


## Test WikiPage Class

In [12]:
# Pick one child of root by position and wrap it in the project's WikiPage class
test_index = 40
test = WikiPage(root[test_index])

In [13]:
# Print a few attributes of the wrapped page, one per line, in a fixed order
for attr_name in ('website', 'id', 'text_raw', 'text_parentid'):
    print(getattr(test, attr_name))

https://en.wikipedia.org/wiki/Big_Miracle_(book)
44137874
{{Infobox book | 
| name         = Big Miracle
| author       = Tom Rose
| country      = [[United States]]
| language     = [[English language|English]]
| genre        = [[Nonfiction]]
| publisher    = St. Martin's Press
| release_date = 20 December 2011
| pages        = 336 pp
| media_type   = Print (Paperback)
| isbn         = 978-0-312-62519-1
}}
'''''Big Miracle''''' tells the true story of three gray whales trapped beneath [[Arctic]] ice in the fall of 1988, and of [[Operation Breakthrough]], the collaborative efforts to free them by oil company executives, activists, [[Inupiat]] people, the U.S. military, and [[Soviet]] [[ice-breaker|ice-breakers]].<ref name="pw">{{cite web|title=Big Miracle|url=http://www.publishersweekly.com/978-0-312-62519-1|website=[[Publishers Weekly]]|accessdate=17 October 2014}}</ref><ref name="free">{{cite news | last1=Dorfman | first1=Andrea | last2=Postman | first2=David | url=http://www.time.co

In [14]:
# Display the page's child elements (a bare last expression shows its repr)
test.children

[<Element '{http://www.mediawiki.org/xml/export-0.10/}title' at 0x1154593b8>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}ns' at 0x115459408>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}id' at 0x115459458>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}revision' at 0x1154594a8>]

In [15]:
# Check the type of the first entry in test.children
type(test.children[0])

xml.etree.ElementTree.Element

In [16]:
# Show the tabular summary held in test.data
test.data

Unnamed: 0,website,title,page_type,text_raw
44137874,https://en.wikipedia.org/wiki/Big_Miracle_(book),Big Miracle (book),article,{{Infobox book | \n| name = Big Miracl...


In [17]:
# Take the first row's website by position. The row label shown for this table
# is the page id, not 0, so plain [0] only works through pandas' deprecated
# positional fallback; .iloc makes the positional lookup explicit.
# NOTE(review): assumes test.data is a pandas DataFrame, as its display suggests.
print(test.data.website.iloc[0])

https://en.wikipedia.org/wiki/Big_Miracle_(book)


In [18]:
# Run the project helper over the export root and preview the first rows of
# the table it returns.
from tools import merge_Wiki_pages

df = merge_Wiki_pages(root)
df.head()

Unnamed: 0,website,title,page_type,text_raw
12161313,https://en.wikipedia.org/wiki/Category:Lists_o...,Category:Lists of fishes,category,[[Category:Lists of vertebrates|Fishes]]\n[[Ca...
4699587,https://en.wikipedia.org/wiki/Fish,Fish,article,{{pp|small=yes}} \n{{for2|fish as eaten by hum...
1834140,https://en.wikipedia.org/wiki/Category:Fish_stubs,Category:Fish stubs,category,{{WPSS-cat}}\n{{Stub Category|article=[[fish]]...
11037163,https://en.wikipedia.org/wiki/Category:Fish_he...,Category:Fish health,category,[[Category:Animal health]]\n[[Category:Fish|He...
7998150,https://en.wikipedia.org/wiki/Category:Fish_an...,Category:Fish anatomy,category,{{Commons cat|Fish anatomy}}\n\n[[Category:Fis...


In [19]:
# Persist the merged table as CSV at the path computed earlier
df.to_csv(path_or_buf=outfile)
