In [1]:
import os, sys
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
sys.path.insert(0, 'scripts')
from WikiPage import WikiPage

In [2]:
# Work from inside the data directory so the bare file names below resolve.
# The guard keeps this cell idempotent: without it, running the cell a second
# time in the same kernel would try to enter data/data and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data')

# MediaWiki XML export to be parsed by the cells below
xmlfile = 'Wikipedia-20170717213140.xml'

# Parse file

Loop over lines in the XML file and look for appearances of:
* title
* id
* text

At each step, save the variables into a dictionary.

After reading the file, the list of page dictionaries `data` is converted to a DataFrame and saved to a CSV file.


In [3]:
# Derive the CSV name by swapping only the file extension. A plain
# str.replace("xml", "csv") would also rewrite "xml" occurring elsewhere in the
# name, and would leave a name without that substring unchanged (so the CSV
# would be written over the input file).
outfile = os.path.splitext(xmlfile)[0] + ".csv"

# One dictionary per parsed page is appended to this list
data = []

In [4]:
# Line-oriented state machine over the XML export.
#   state 0 - waiting for an opening <page> tag
#   state 1 - inside a page, waiting for its <title>
#   state 2 - waiting for the first <id> after the title (the page id)
#   state 3 - waiting for the opening <text ...> tag
#   state 4 - inside a multi-line text body, accumulating until </text>
#   state 5 - text complete, waiting for </page> to store the record
#
# Tags are matched with their angle brackets: a bare substring test such as
# `"id" in line` also fires on unrelated lines (e.g. a <redirect title="Hybrid ..."/>
# line that sits between <title> and <id>).
#
# NOTE: body lines are stripped and concatenated without a separator, so line
# breaks inside the wiki text are not kept. XML entities are left escaped.

data = []      # start empty so re-running this cell does not duplicate pages
state = 0
page = {}
text = ""

# The context manager closes the file even if parsing raises, and iterating the
# handle avoids loading every line into memory first (as readlines() would).
# NOTE(review): assumes the export is UTF-8 encoded - confirm against the XML declaration.
with open(xmlfile, "r", encoding="utf-8") as f:
    for line in f:
        if state == 0:
            # Look for an opening <page> tag and jump to state=1
            if "<page>" in line:
                state = 1
        elif state == 1:
            # Look for the <title> element, store it and jump to state=2
            if "<title>" in line:
                ll = line.replace("<title>", "")
                title = ll.replace("</title>", "").strip()
                page['title'] = title
                state = 2
        elif state == 2:
            # Look for the <id> element, store it and jump to state=3
            if "<id>" in line:
                ll = line.replace("<id>", "")
                aid = ll.replace("</id>", "").strip()
                page['id'] = aid
                state = 3
        elif state == 3:
            # Look for the opening "<text" tag
            if "<text" in line:
                # Split once at the end of the opening tag. Splitting on every
                # ">" would cut the body at its first ">" and would also drop
                # the ">" of "</text>", so a single-line body was never seen as
                # closed and swallowed the following page.
                attrs, _, body = line.partition("<text")[2].partition(">")
                body = body.rstrip()
                if attrs.rstrip().endswith("/"):
                    # Self-closing <text ... /> means an empty body
                    text = ""
                    state = 5
                elif "</text>" in body:
                    # Whole body on one line: store it and jump to state=5
                    text = body.replace("</text>", "")
                    state = 5
                else:
                    text = body
                    state = 4
        elif state == 4:
            # Accumulate body lines; on </text> store the text and jump to state=5
            if "</text>" in line:
                ll = line.replace("</text>", "").strip()
                text = text + ll
                state = 5
            else:
                text = text + line.strip()
        elif state == 5:
            # On </page> add the finished page dictionary to the list
            # and return to the top (state=0)
            if "</page>" in line:
                page['text'] = text
                data.append(page)
                page = {}
                state = 0

In [5]:
# Convert the list of page dictionaries into a DataFrame
# (one row per page, one column per dictionary key)
df=pd.DataFrame(data)

# Write the table to the CSV path derived from the XML file name; the row index
# is written as well because to_csv is called with its defaults
df.to_csv(outfile)
# Show the first rows as a sanity check
df.head()

Unnamed: 0,id,text,title
0,970284,{{Cat main|Conformation show|Show dog}}{{porta...,Category:Dog shows and showing
1,972913,This is a collection of articles about the hea...,Category:Dog health
2,970251,This is an automatically collected list of art...,Category:Dog organizations
3,729436,This is an automatically accumulated list of a...,Category:Dog sports
4,978163,[[Category:Dogs|Pets]][[Category:Mammals as pe...,Category:Dogs as pets


# Parsing xml using ElementTree

For a short tutorial, see: https://docs.python.org/3/library/xml.etree.elementtree.html#module-xml.etree.ElementTree

In [6]:
def get_xml_field(xml_data):
    '''Return the local (namespace-free) name of an element's tag.

    ElementTree spells a namespaced tag as ``{namespace-uri}local``; this
    strips the ``{...}`` prefix and returns ``local``. A tag that carries no
    namespace is returned unchanged instead of raising ``IndexError``.

    Parameters
    ----------
    xml_data : object with a string ``tag`` attribute (e.g. an Element)

    Returns
    -------
    str
        The part of ``xml_data.tag`` after the last ``}``.
    '''
    # rpartition yields the whole string as its last item when '}' is absent
    return xml_data.tag.rpartition('}')[2]

In [7]:
# Parse the complete XML file into an ElementTree
tree = ET.parse(xmlfile)
# The root element is the entry point for walking the document
root = tree.getroot()

In [8]:
# Every Element carries a tag string and a dictionary of attributes;
# print both for the root, tag first
for root_property in (root.tag, root.attrib):
    print(root_property)

{http://www.mediawiki.org/xml/export-0.10/}mediawiki
{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd', 'version': '0.10', '{http://www.w3.org/XML/1998/namespace}lang': 'en'}


In [9]:
# An Element can be sliced like a list of its direct children;
# list the tags of the first 10 entries
first_entries = root[:10]
for entry in first_entries:
    print(entry.tag)

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page


In [10]:
# Alternate way to get all children from root
# view first 10 children of root
# Element.getchildren() was removed in Python 3.9; list(element) returns the
# same list of direct children on every Python 3 version.
root_children = list(root)
print(root_children[:10])

[<Element '{http://www.mediawiki.org/xml/export-0.10/}siteinfo' at 0x115fa3e08>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115e7c2c8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115fa8778>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115fa3c28>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115fa3098>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115fa8f48>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115f9f598>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115f9fc28>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115ea4368>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115ea49f8>]


In [11]:
# Explore a subset of root (i.e. children of root)
# Elements are iterated and sliced directly: Element.getchildren() was removed
# in Python 3.9.
print(root,'\n')
for child in root[:4]:
    # Only descend into children that have sub-elements of their own
    if len(child) > 0:
        print(child.tag)
        for gchild in child:
            print(gchild.tag, '\n', gchild.text)

<Element '{http://www.mediawiki.org/xml/export-0.10/}mediawiki' at 0x115fa3688> 

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}sitename 
 Wikipedia
{http://www.mediawiki.org/xml/export-0.10/}dbname 
 enwiki
{http://www.mediawiki.org/xml/export-0.10/}base 
 https://en.wikipedia.org/wiki/Main_Page
{http://www.mediawiki.org/xml/export-0.10/}generator 
 MediaWiki 1.30.0-wmf.9
{http://www.mediawiki.org/xml/export-0.10/}case 
 first-letter
{http://www.mediawiki.org/xml/export-0.10/}namespaces 
 
      
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}title 
 Category:Dog shows and showing
{http://www.mediawiki.org/xml/export-0.10/}ns 
 14
{http://www.mediawiki.org/xml/export-0.10/}id 
 970284
{http://www.mediawiki.org/xml/export-0.10/}revision 
 
      
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}title 
 Category:Dog health
{http://www.mediawiki.org/xml/e

In [12]:
# Dive deeper into the tree by exploring a page from root (i.e. grandchildren of root)
child_index = 1
print(root[child_index],'\n')
# Children are nested, and we can access specific child nodes by index
# Fully explore branches of a page
# Elements are indexed and iterated directly: Element.getchildren() was removed
# in Python 3.9.
print('Branches of %s:\n' % (root[child_index].tag))
for child in root[child_index]:
    # Only descend into children that have sub-elements of their own
    if len(child) > 0:
        print(child.tag)
        for gchild in child:
            print(gchild.tag, '\n', gchild.text)

<Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x115e7c2c8> 

Branches of {http://www.mediawiki.org/xml/export-0.10/}page:

{http://www.mediawiki.org/xml/export-0.10/}revision
{http://www.mediawiki.org/xml/export-0.10/}id 
 732620169
{http://www.mediawiki.org/xml/export-0.10/}parentid 
 696498320
{http://www.mediawiki.org/xml/export-0.10/}timestamp 
 2016-08-02T04:39:56Z
{http://www.mediawiki.org/xml/export-0.10/}contributor 
 
        
{http://www.mediawiki.org/xml/export-0.10/}comment 
 new key for [[Category:Animal shows]]: " Dog" using [[WP:HC|HotCat]]
{http://www.mediawiki.org/xml/export-0.10/}model 
 wikitext
{http://www.mediawiki.org/xml/export-0.10/}format 
 text/x-wiki
{http://www.mediawiki.org/xml/export-0.10/}text 
 {{Cat main|Conformation show|Show dog}}
{{portal|Dogs}}
This is an automatically collected list of articles related to showing dogs for their appearance in [[conformation show]]s.

[[Category:Dogs|Shows and showing]]
[[Category:Competitions]]
[[Cat

## Test WikiPage Class

In [13]:
# Position, among the children of root, of the element used for the test
test_index = 85
# Wrap that element in a WikiPage (class imported from the WikiPage module)
test = WikiPage(root[test_index])

In [14]:
# Print a selection of the WikiPage attributes, one per line, in a fixed order
for attribute_name in ('website', 'id', 'text_raw', 'text_parentid'):
    print(getattr(test, attribute_name))

https://en.wikipedia.org/wiki/File:Hawaiian_Poi_Dog_from_Jack_L._Throp's_program,_c._1969.jpg
53486800
== Summary ==
{{Non-free use rationale 
| Article           = Hawaiian Poi Dog
| Description       =Female bred in Jack L. Throp's program at the Honolulu Zoo to reconstitute the early Hawaiian breed. Photo courtesy of ''The Honolulu Advertiser''. 
| Source            ={{cite journal|last1=Titcomb|first1=Margaret|last2=Pukui|first2=Mary Kawena|title=Dog and Man in the Ancient Pacific, with Special Attention to Hawaii|volume=59|year=1969|publisher=Bernice P. Bishop Museum Special Publications|location=Honolulu|oclc=925631874|page=23}}
| Portion           = Entire
| Low resolution    = Yes  
| Purpose           = The photo is only being used for informational purposes. Its inclusion in the article adds significantly to the article because the photo and its historical significance are the object of discussion in the article.
| Replaceability    = 
As the subject is deceased, and the prog

In [15]:
# Display the `children` attribute of the WikiPage instance
test.children

[<Element '{http://www.mediawiki.org/xml/export-0.10/}title' at 0x11620f048>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}ns' at 0x11620f098>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}id' at 0x11620f0e8>,
 <Element '{http://www.mediawiki.org/xml/export-0.10/}revision' at 0x11620f138>]

In [16]:
# Check the type of the first entry in `children`
type(test.children[0])

xml.etree.ElementTree.Element

In [17]:
# Display the `data` attribute of the WikiPage instance
test.data

Unnamed: 0,id,website,text_raw
0,53486800,https://en.wikipedia.org/wiki/File:Hawaiian_Po...,== Summary ==\n{{Non-free use rationale \n| Ar...


In [18]:
# Print the untruncated `website` value stored under label 0 of `test.data`
print(test.data.website[0])

https://en.wikipedia.org/wiki/File:Hawaiian_Poi_Dog_from_Jack_L._Throp's_program,_c._1969.jpg
