In [1]:
import os, sys
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
# Register the local ``scripts`` directory by absolute path. A relative
# sys.path entry is resolved against whatever the working directory is when
# an import happens, and this notebook calls os.chdir() in a later cell.
sys.path.insert(0, os.path.abspath('scripts'))
from WikiPage import WikiPage

In [2]:
xmlfile = 'Wikipedia-20170717213140.xml'

# Work from inside the data directory, but only change into it when the XML
# dump is not already reachable from the current directory. This keeps the
# cell idempotent: re-running it after a previous os.chdir('data') would
# otherwise try to enter a nested data/data directory and fail.
if not os.path.isfile(xmlfile):
    os.chdir('data')

# Parse file

Loop over lines in the XML file and look for appearances of:
* title
* id
* text

At each step, save the variables into a dictionary.

After reading the file, the collected data is written to a CSV file named after the XML file.


In [3]:
# Derive the CSV output name by swapping only the file extension.
# str.replace("xml", "csv") would also rewrite any other "xml" substring in
# the name, and would leave an upper-case ".XML" untouched so that the output
# path equals the input path.
outfile = os.path.splitext(xmlfile)[0] + '.csv'
# Accumulator for parsed records.
# NOTE(review): no later visible cell references this name -- confirm it is still needed.
data = []

# Parsing xml using ElementTree

For a short tutorial, see: https://docs.python.org/3/library/xml.etree.elementtree.html#module-xml.etree.ElementTree

In [4]:
def get_xml_field(xml_data):
    '''Return the local name of an element's tag, without its namespace.

    ElementTree spells namespace-qualified tags as ``{namespace-uri}name``.

    Parameters
    ----------
    xml_data : object with a string ``tag`` attribute
        Typically an ``xml.etree.ElementTree.Element``.

    Returns
    -------
    str
        The text after the closing ``}`` of the namespace, or the whole tag
        when the element carries no namespace.
    '''
    # rsplit with maxsplit=1 keeps un-namespaced tags intact instead of
    # raising IndexError the way ``split('}')[1]`` does.
    return xml_data.tag.rsplit('}', 1)[-1]

In [5]:
# Read the whole XML export into an ElementTree, then take its top-level element.
tree = ET.parse(source=xmlfile)
root = tree.getroot()

In [6]:
# The root Element exposes a qualified tag and a dictionary of attributes;
# show each on its own line.
print(root.tag, root.attrib, sep='\n')

{http://www.mediawiki.org/xml/export-0.10/}mediawiki
{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd', 'version': '0.10', '{http://www.w3.org/XML/1998/namespace}lang': 'en'}


In [7]:
# Slicing an Element yields its direct children.
# Print the tag of each of the first 10 entries.
for child in root[:10]:
    print(child.tag)

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page


In [8]:
# Alternate way to get all children from root: materialise them as a list.
# Element.getchildren() was deprecated and then removed in Python 3.9;
# list(element) is the documented replacement.
root_children = list(root)
# view first 10 children of root
print(root_children[:10])

[<Element '{http://www.mediawiki.org/xml/export-0.10/}siteinfo' at 0x10d3ec2c8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3f8138>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3f89a8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3fb048>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3fb638>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3fbcc8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3fe318>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3fe9a8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d3e3a98>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d403188>]


In [9]:
# Explore a subset of root (i.e. children of root)
print(root, '\n')
# Slice/iterate the Element directly: getchildren() no longer exists on
# Python 3.9+, and iteration yields the same direct children.
for child in root[:4]:
    if len(child) > 0:  # only descend into elements that have sub-elements
        print(child.tag)
        for gchild in child:
            print(gchild.tag, '\n', gchild.text)

<Element '{http://www.mediawiki.org/xml/export-0.10/}mediawiki' at 0x10d3ec278> 

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}sitename 
 Wikipedia
{http://www.mediawiki.org/xml/export-0.10/}dbname 
 enwiki
{http://www.mediawiki.org/xml/export-0.10/}base 
 https://en.wikipedia.org/wiki/Main_Page
{http://www.mediawiki.org/xml/export-0.10/}generator 
 MediaWiki 1.30.0-wmf.9
{http://www.mediawiki.org/xml/export-0.10/}case 
 first-letter
{http://www.mediawiki.org/xml/export-0.10/}namespaces 
 
      
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}title 
 Category:Dog shows and showing
{http://www.mediawiki.org/xml/export-0.10/}ns 
 14
{http://www.mediawiki.org/xml/export-0.10/}id 
 970284
{http://www.mediawiki.org/xml/export-0.10/}revision 
 
      
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}title 
 Category:Dog health
{http://www.mediawiki.org/xml/e

In [10]:
# Dive deeper into the tree by exploring a page from root (i.e. grandchildren of root)
child_index = 30
print(root[child_index], '\n')
# Children are nested, and we can access specific child nodes by index.
# Index/iterate Elements directly: getchildren() was removed in Python 3.9.
# Fully explore branches of a page
print('Branches of %s:\n' % (root[child_index].tag))
for child in root[child_index]:
    if len(child) > 0:  # only descend into elements that have sub-elements
        print(child.tag)
        for gchild in child:
            print(gchild.tag, '\n', gchild.text)

<Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10d4672c8> 

Branches of {http://www.mediawiki.org/xml/export-0.10/}page:

{http://www.mediawiki.org/xml/export-0.10/}revision
{http://www.mediawiki.org/xml/export-0.10/}id 
 691460390
{http://www.mediawiki.org/xml/export-0.10/}parentid 
 685069728
{http://www.mediawiki.org/xml/export-0.10/}timestamp 
 2015-11-19T23:52:50Z
{http://www.mediawiki.org/xml/export-0.10/}contributor 
 
        
{http://www.mediawiki.org/xml/export-0.10/}comment 
 Remove duplicate name from reference - it doesn't need one and it's causing error in the ref list
{http://www.mediawiki.org/xml/export-0.10/}model 
 wikitext
{http://www.mediawiki.org/xml/export-0.10/}format 
 text/x-wiki
{http://www.mediawiki.org/xml/export-0.10/}text 
 {{Infobox award
| name           = PDSA Certificate for Animal Bravery or Devotion
| current_awards = 
| image          = 
| imagesize      = 150px
| alt            = 
| caption        = The PDSA Certificate for Animal 

## Explore findall()

In [11]:
base = '{http://www.mediawiki.org/xml/export-0.10/}'
# Path: any direct child of root -> its <revision> -> that revision's <text>.
text_path = '*/%srevision/%stext' % (base, base)
# Run the search once and reuse the result for both the count and the preview.
revision_texts = root.findall(text_path)
print('# of hits:', len(revision_texts), '\n')
for rev_text in revision_texts[:5]:
    # An empty <text/> element has .text == None; fall back to '' so the
    # slice does not raise TypeError.
    print((rev_text.text or '')[:500])

# of hits: 147 

{{Cat main|Conformation show|Show dog}}
{{portal|Dogs}}
This is an automatically collected list of articles related to showing dogs for their appearance in [[conformation show]]s.

[[Category:Dogs|Shows and showing]]
[[Category:Competitions]]
[[Category:Animal shows| Dog]]
This is a collection of articles about the health and care of dogs, primarily domestic dogs. This includes medical conditions, health issues, parasites, surgical procedures, and similar topics.

{{Cat main|Dog health}}
{{Commons cat|Dog health}}

[[Category:Dogs|Health]]
[[Category:Mammal health]]
This is an automatically collected list of articles about organizations related to [[dog]]s.

[[Category:Dogs|Organizations]]
[[Category:Animal organizations]]
This is an automatically accumulated list of articles about sports involving dogs.

{{Cat main|Dog sports}}
{{Commons category|Dog sports}}

[[Category:Dogs|Sports for dogs]]
[[Category:Animals in sport]]
[[Category:Dogs|Pets]]
[[Category:Mammals as 

## Test WikiPage Class

In [12]:
# Pick one child of root by position and wrap it in the WikiPage class
# imported from the local scripts directory.
test_index = 85
test = WikiPage(root[test_index])

In [13]:
# Print a handful of WikiPage attributes, one per line, in a fixed order.
for attr_name in ('website', 'id', 'text_raw', 'text_parentid'):
    print(getattr(test, attr_name))

https://en.wikipedia.org/wiki/File:Hawaiian_Poi_Dog_from_Jack_L._Throp's_program,_c._1969.jpg
53486800
== Summary ==
{{Non-free use rationale 
| Article           = Hawaiian Poi Dog
| Description       =Female bred in Jack L. Throp's program at the Honolulu Zoo to reconstitute the early Hawaiian breed. Photo courtesy of ''The Honolulu Advertiser''. 
| Source            ={{cite journal|last1=Titcomb|first1=Margaret|last2=Pukui|first2=Mary Kawena|title=Dog and Man in the Ancient Pacific, with Special Attention to Hawaii|volume=59|year=1969|publisher=Bernice P. Bishop Museum Special Publications|location=Honolulu|oclc=925631874|page=23}}
| Portion           = Entire
| Low resolution    = Yes  
| Purpose           = The photo is only being used for informational purposes. Its inclusion in the article adds significantly to the article because the photo and its historical significance are the object of discussion in the article.
| Replaceability    = 
As the subject is deceased, and the prog

In [14]:
# Show the child elements stored on the WikiPage instance.
test.children

[<Element '{http://www.mediawiki.org/xml/export-0.10/}title' at 0x10d5da778>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}ns' at 0x10d5da7c8>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}id' at 0x10d5da818>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}revision' at 0x10d5da868>]

In [15]:
# Check the type of the first stored child.
type(test.children[0])

xml.etree.ElementTree.Element

In [16]:
# Display the tabular summary held in the instance's `data` attribute.
test.data

Unnamed: 0,website,page_type,text_raw
53486800,https://en.wikipedia.org/wiki/File:Hawaiian_Po...,file,== Summary ==\n{{Non-free use rationale \n| Ar...


In [17]:
# Take the first row's website by position. Plain `[0]` on a Series is a
# label lookup first; the positional fallback it relies on here is deprecated
# in recent pandas, so use .iloc for an explicit positional access.
# NOTE(review): assumes test.data is a pandas DataFrame, as its rendered output suggests.
print(test.data.website.iloc[0])

https://en.wikipedia.org/wiki/File:Hawaiian_Poi_Dog_from_Jack_L._Throp's_program,_c._1969.jpg


In [18]:
# NOTE(review): this import would normally sit with the others in the first cell.
from tools import merge_Wiki_pages

# Combine the pages under root into a single table (presumably one row per
# page -- confirm against tools.merge_Wiki_pages) and preview the first rows.
df = merge_Wiki_pages(root)
df.head()

Unnamed: 0,website,page_type,text_raw
970284,https://en.wikipedia.org/wiki/Category:Dog_sho...,category,{{Cat main|Conformation show|Show dog}}\n{{por...
972913,https://en.wikipedia.org/wiki/Category:Dog_health,category,This is a collection of articles about the hea...
970251,https://en.wikipedia.org/wiki/Category:Dog_org...,category,This is an automatically collected list of art...
729436,https://en.wikipedia.org/wiki/Category:Dog_sports,category,This is an automatically accumulated list of a...
978163,https://en.wikipedia.org/wiki/Category:Dogs_as...,category,[[Category:Dogs|Pets]]\n[[Category:Mammals as ...


In [19]:
# Write the merged table to the CSV path derived earlier from the XML file name.
df.to_csv(outfile)
