In [1]:
import pandas as pd
import numpy as np
import os
# NOTE: 'data' is a relative path, so it is resolved against the current
# working directory each time this cell runs. Re-running the cell after it has
# already succeeded therefore looks for data/data rather than data.
os.chdir('data')

In [2]:
# Name of the Wikipedia XML export to parse (relative to the working directory).
xmlfile = 'Wikipedia-20170717213140.xml'
# Swap only the file extension. A plain str.replace("xml", "csv") would also
# rewrite any "xml" that happens to appear earlier in the file name.
outfile = os.path.splitext(xmlfile)[0] + '.csv'
# Accumulator for the parsed records (one dictionary per page).
data = []

# Parse file

Loop over lines in the XML file and look for appearances of:
* title
* id
* text

At each step, save the variables into a dictionary.

After reading the file, the list of page dictionaries `data` is converted to a DataFrame and saved to a CSV file.


In [3]:
# Start from an empty list so that re-running this cell does not append the
# same pages to `data` a second time.
data = []

# Line-oriented state machine over the XML dump. States:
#   0 - waiting for the opening <page> tag
#   1 - waiting for <title>; stores page['title']
#   2 - waiting for the first <id> after the title; stores page['id']
#   3 - waiting for the opening <text ...> tag
#   4 - inside a multi-line <text> element, accumulating until </text>
#   5 - text complete; waiting for </page> to emit the record
state = 0
page = {}
text = ""

# The file is iterated lazily (no readlines()) and the `with` block closes it
# even if parsing raises. The dump contains non-ASCII text, so it is decoded
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open(xmlfile, "r", encoding="utf-8") as f:
    for line in f:
        if state == 0:
            # Match the tag itself, not any line that merely contains "page".
            if "<page>" in line:
                state = 1
        elif state == 1:
            if "<title>" in line:
                title = line.replace("<title>", "").replace("</title>", "").strip()
                page['title'] = title
                state = 2
        elif state == 2:
            if "<id>" in line:
                aid = line.replace("<id>", "").replace("</id>", "").strip()
                page['id'] = aid
                state = 3
        elif state == 3:
            if "<text" in line:
                # Split only at the first ">" (the end of the opening tag).
                # Splitting at every ">" would truncate the text at any later
                # ">" and would strip the ">" from "</text>", so a one-line
                # text element would never be recognised as closed.
                attrs, _, text = line.split("<text", 1)[1].partition(">")
                text = text.rstrip()
                if attrs.rstrip().endswith("/"):
                    # Self-closing <text ... />: the page has no text and
                    # there is no closing tag to wait for.
                    text = ""
                    state = 5
                elif "</text>" in text:
                    # Opening and closing tag on the same line.
                    text = text.replace("</text>", "").rstrip()
                    state = 5
                else:
                    state = 4
        elif state == 4:
            # Lines are stripped and concatenated without a separator, so the
            # original line breaks are not kept in the stored text.
            if "</text>" in line:
                text = text + line.replace("</text>", "").strip()
                state = 5
            else:
                text = text + line.strip()
        elif state == 5:
            # End of the page: store the record and return to state 0.
            if "</page>" in line:
                page['text'] = text
                data.append(page)
                page = {}
                state = 0

In [4]:
# Build a table with one row per parsed page; the dictionary keys become columns.
df = pd.DataFrame(data=data)

# Write the table to the CSV path derived from the XML file name.
df.to_csv(path_or_buf=outfile)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,text,title
0,970284,{{Cat main|Conformation show|Show dog}}{{porta...,Category:Dog shows and showing
1,972913,This is a collection of articles about the hea...,Category:Dog health
2,970251,This is an automatically collected list of art...,Category:Dog organizations
3,729436,This is an automatically accumulated list of a...,Category:Dog sports
4,978163,[[Category:Dogs|Pets]][[Category:Mammals as pe...,Category:Dogs as pets


# Parsing xml using ElementTree

For a short tutorial, see: https://docs.python.org/3/library/xml.etree.elementtree.html#module-xml.etree.ElementTree

In [5]:
import xml.etree.ElementTree as ET

In [6]:
# Parse the whole dump into an in-memory tree, then take its top-level element.
tree = ET.parse(source=xmlfile)
root = tree.getroot()

In [7]:
# Every Element carries a tag name and a dictionary of attributes;
# show both for the root, one per line.
print(root.tag, root.attrib, sep='\n')

{http://www.mediawiki.org/xml/export-0.10/}mediawiki
{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd', 'version': '0.10', '{http://www.w3.org/XML/1998/namespace}lang': 'en'}


In [8]:
# An Element can be sliced and iterated like a list of its child nodes.
# Show the tags of the first 10 children.
first_children = root[:10]
for node in first_children:
    print(node.tag)

{http://www.mediawiki.org/xml/export-0.10/}siteinfo
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page
{http://www.mediawiki.org/xml/export-0.10/}page


In [9]:
# Alternate way to get all children from root: materialise them as a list.
# list(element) replaces Element.getchildren(), which was deprecated in
# Python 3.2 and removed in Python 3.9.
root_children = list(root)
print(root_children[:10])

[<Element '{http://www.mediawiki.org/xml/export-0.10/}siteinfo' at 0x10ff01a48>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10ff0def8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feaf7c8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feafe08>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feb4458>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feb4ae8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feb8138>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feb87c8>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10feb8b38>, <Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10febc228>]


In [10]:
# Children are nested, and we can access specific child nodes by index
test_entry = 1

# Index the element directly; Element.getchildren() was removed in Python 3.9.
print(root[test_entry].tag, '\n')

print(root[test_entry], '\n')

# Iterating an element yields its direct children in document order.
for child in root[test_entry]:
    print(child.tag, child.text)

{http://www.mediawiki.org/xml/export-0.10/}page 

<Element '{http://www.mediawiki.org/xml/export-0.10/}page' at 0x10ff0def8> 

{http://www.mediawiki.org/xml/export-0.10/}title Category:Dog shows and showing
{http://www.mediawiki.org/xml/export-0.10/}ns 14
{http://www.mediawiki.org/xml/export-0.10/}id 970284
{http://www.mediawiki.org/xml/export-0.10/}revision 
      


In [11]:
# Pick one child of root by position (Element.getchildren() was removed in
# Python 3.9; indexing the element directly is equivalent).
test = root[20]
for child in test:
    # Only descend into nodes that have children of their own.
    if len(child) > 0:
        print('child:', child.tag)
        for j, gchild in enumerate(child):
            print(j, gchild.tag, '\n', gchild.text)

child: {http://www.mediawiki.org/xml/export-0.10/}revision
0 {http://www.mediawiki.org/xml/export-0.10/}id 
 759352260
1 {http://www.mediawiki.org/xml/export-0.10/}parentid 
 753052214
2 {http://www.mediawiki.org/xml/export-0.10/}timestamp 
 2017-01-10T17:44:40Z
3 {http://www.mediawiki.org/xml/export-0.10/}contributor 
 
        
4 {http://www.mediawiki.org/xml/export-0.10/}minor 
 None
5 {http://www.mediawiki.org/xml/export-0.10/}comment 
 /* Dogs in legend */ Standardized hatnote
6 {http://www.mediawiki.org/xml/export-0.10/}model 
 wikitext
7 {http://www.mediawiki.org/xml/export-0.10/}format 
 text/x-wiki
8 {http://www.mediawiki.org/xml/export-0.10/}text 
 {{refimprove|date=December 2008}}
{{originalresearch|date=December 2008}}

'''Dogs''' (''Canis lupus familiaris''), known in [[Classical Chinese]] as ''quan'' ({{zh|c=犬|p=quǎn|w=''ch'üan''}}), played an important role in ancient Chinese society.

== Domestication ==

An examination of the genetic evidence by Carles Vila and others 

In [12]:
class WikiPage(ET.Element):
    '''Class to access and manipulate XML Wikipedia data.

    Wraps one <page> element and exposes a few of its fields as attributes.

    Parameters
    ----------
    page : xml.etree.ElementTree.Element
        The <page> element to wrap.

    Attributes
    ----------
    page : the element passed to the constructor.
    children : list of the direct child elements of ``page``.
    title : text of the <title> child, or None if there is none.
    id : text of the page-level <id> child, or None if there is none.
    raw_text : text of the <text> element inside <revision>, or None if
        there is none.
    '''

    def __init__(self, page):
        # Initialise the base element with the page's own tag and attributes
        # (the tag of an Element is expected to be a string, not an Element).
        ET.Element.__init__(self, page.tag, dict(page.attrib))

        self.page = page
        # list(element) replaces Element.getchildren(), removed in Python 3.9.
        self.children = list(page)

        # Default every field so the attributes always exist, even when the
        # page lacks the corresponding element.
        self.title = None
        self.id = None
        self.raw_text = None

        for child in self.children:
            field = self._get_xml_field(child)
            if field == 'title':
                self.title = child.text
            elif field == 'id':
                self.id = child.text
            elif field == 'revision':
                # Only the <text> child of the revision is read here; the
                # revision's own <id> does not overwrite the page id.
                for gchild in child:
                    if self._get_xml_field(gchild) == 'text':
                        self.raw_text = gchild.text

    @staticmethod
    def _get_xml_field(xml_data):
        '''Return the tag name of ``xml_data`` without its "{namespace}" prefix.

        Tags that carry no namespace prefix are returned unchanged.
        '''
        return xml_data.tag.rsplit('}', 1)[-1]

In [13]:
# Wrap the child at index 1 of root in a WikiPage. This rebinds `test`, which
# an earlier cell bound to a plain Element.
test = WikiPage(root[1])

In [14]:
# Show the three extracted fields, one per line.
print(test.title, test.id, test.raw_text, sep='\n')

Category:Dog shows and showing
970284
{{Cat main|Conformation show|Show dog}}
{{portal|Dogs}}
This is an automatically collected list of articles related to showing dogs for their appearance in [[conformation show]]s.

[[Category:Dogs|Shows and showing]]
[[Category:Competitions]]
[[Category:Animal shows| Dog]]
