In [15]:
import requests
from bs4 import BeautifulSoup

First, we make a GET request to the Wikipedia (MediaWiki) API to obtain the introductory extract of the Wikipedia article on Mars in XML format.

In [16]:
# MediaWiki API query for the page titled "Mars": `prop=extracts` asks for the article
# extract and `format=xml` selects XML output. `exintro` presumably restricts the extract
# to the intro section -- confirm against the MediaWiki API documentation.
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=xml&exintro=&titles=Mars'

In [17]:
# Fetch the raw XML. The timeout prevents the cell from hanging forever, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
xml_data = response.content

# Create a BeautifulSoup object from the xml
# NOTE: "lxml" is BeautifulSoup's HTML parser, which is why the printed tree is
# wrapped in <html><body> tags.
soup = BeautifulSoup(xml_data, "lxml")

# Prettify the BeautifulSoup object
pretty_soup = soup.prettify()

# Print the response
print(pretty_soup)

<?xml version="1.0"?>
<html>
 <body>
  <api batchcomplete="">
   <query>
    <pages>
     <page _idx="14640471" ns="0" pageid="14640471" title="Mars">
      <extract xml:space="preserve">
       &lt;p&gt;&lt;b&gt;Mars&lt;/b&gt; is the fourth planet from the Sun and the second-smallest planet in the Solar System after Mercury. In English, Mars carries a name of the Roman god of war, and is often referred to as the "&lt;b&gt;Red Planet&lt;/b&gt;" because the reddish iron oxide prevalent on its surface gives it a reddish appearance that is distinctive among the astronomical bodies visible to the naked eye. Mars is a terrestrial planet with a thin atmosphere, having surface features reminiscent both of the impact craters of the Moon and the valleys, deserts, and polar ice caps of Earth.&lt;/p&gt;
&lt;p&gt;The rotational period and seasonal cycles of Mars are likewise similar to those of Earth, as is the tilt that produces the seasons. Mars is the site of Olympus Mons, the largest volcano a

In [61]:
# with open('Mars.xml', 'w') as file:
#    file.write(pretty_soup)

We wish to extract the data above and put it into a (pandas) DataFrame.

In [18]:
import xml.etree.ElementTree as ET
import pandas as pd

In [19]:
class XML2DataFrame:
    """Flatten an XML document into a pandas DataFrame.

    Every direct child of the document root becomes one row. All attributes
    and text found anywhere beneath that child are merged into one flat
    record, so a name that occurs several times within the same child keeps
    only the value encountered last.
    """

    def __init__(self, xml_data):
        """Parse ``xml_data`` (an XML document as ``str`` or ``bytes``) and keep its root.

        Note: ``xml.etree.ElementTree`` is not hardened against maliciously
        constructed XML, so only pass data from a source you trust.
        """
        self.root = ET.XML(xml_data)

    def parse_element(self, element, parsed=None):
        """Recursively collect attributes and text of ``element`` and its descendants.

        Args:
            element: The ``xml.etree.ElementTree.Element`` to flatten.
            parsed: Dict to accumulate into; a new dict is created when omitted.

        Returns:
            ``parsed``, holding one entry per attribute name and one
            ``tag -> text`` entry for every element that carries real text.
        """
        if parsed is None:
            parsed = {}
        # Attributes go in first, so the element's own text wins when a tag and
        # an attribute happen to share a name.
        parsed.update(element.attrib)
        text = element.text
        # Pretty-printed XML puts indentation (whitespace-only text) on container
        # elements; that is layout rather than data, so it is not recorded.
        if text and text.strip():
            parsed[element.tag] = text
        for child in element:  # recurse into nested tags
            self.parse_element(child, parsed)
        return parsed

    def parse_root(self, root):
        """Return one flattened record (dict) per direct child of ``root``."""
        # Iterating an Element yields its direct children only.
        return [self.parse_element(child) for child in root]

    def process_data(self):
        """Build a DataFrame with one row per direct child of the document root."""
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

# Citation: http://www.austintaylor.io/lxml/python/pandas/xml/dataframe/2016/07/08/convert-xml-to-pandas-dataframe/

In [20]:
# Parse the raw Mars response fetched above and flatten it into a DataFrame
# (one row per direct child of the XML root).
xml2df = XML2DataFrame(xml_data)
xml_dataframe = xml2df.process_data()

In [23]:
# Select the page columns by name: the positional order of the columns depends on
# how the installed pandas version orders dict keys, so a positional slice is fragile.
xml_dataframe[['_idx', 'extract', 'ns', 'pageid', 'title']]

Unnamed: 0,_idx,extract,ns,pageid,title
0,14640471,<p><b>Mars</b> is the fourth planet from the S...,0,14640471,Mars


In [24]:
# Access intro of Wikipedia article on Mars
# (looked up by column label; the position of 'extract' depends on the pandas
# version's column ordering, so a positional index can return a different column)
xml_dataframe.loc[0, 'extract']

'<p><b>Mars</b> is the fourth planet from the Sun and the second-smallest planet in the Solar System after Mercury. In English, Mars carries a name of the Roman god of war, and is often referred to as the "<b>Red Planet</b>" because the reddish iron oxide prevalent on its surface gives it a reddish appearance that is distinctive among the astronomical bodies visible to the naked eye. Mars is a terrestrial planet with a thin atmosphere, having surface features reminiscent both of the impact craters of the Moon and the valleys, deserts, and polar ice caps of Earth.</p>\n<p>The rotational period and seasonal cycles of Mars are likewise similar to those of Earth, as is the tilt that produces the seasons. Mars is the site of Olympus Mons, the largest volcano and second-highest known mountain in the Solar System, and of Valles Marineris, one of the largest canyons in the Solar System. The smooth Borealis basin in the northern hemisphere covers 40% of the planet and may be a giant impact feat

# Multiple XML files

Now for the case of multiple XML documents, we loop through the pages, requesting and parsing the XML for each one.

In [25]:
# MediaWiki API query using the `allpages` generator in XML format. Judging by the
# parameter names, it lists up to 100 non-redirect pages whose titles start from
# "Earth" -- confirm the exact semantics against the MediaWiki API documentation.
earth_pages = 'https://en.wikipedia.org/w/api.php?action=query&generator=allpages&gaplimit=100&gapfrom=Earth&format=xml&gapfilterredir=nonredirects'

In [26]:
# Fetch the page listing. The timeout prevents the cell from hanging forever, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
earth_response = requests.get(earth_pages, timeout=30)
earth_response.raise_for_status()
earth_data = earth_response.content

# Create a BeautifulSoup object from the xml
earth_soup = BeautifulSoup(earth_data, "lxml")

In [27]:
# Display the parsed listing (the repr is the whole document, so the output is long)
earth_soup

<?xml version="1.0"?><html><body><api batchcomplete=""><continue continue="gapcontinue||" gapcontinue="Earth's_weight"></continue><query><pages><page _idx="9228" ns="0" pageid="9228" title="Earth"></page><page _idx="9940789" ns="0" pageid="9940789" title="Earth's Answer"></page><page _idx="29803921" ns="0" pageid="29803921" title="Earth's Birthday Project"></page><page _idx="1833777" ns="0" pageid="1833777" title="Earth's Children"></page><page _idx="22528186" ns="0" pageid="22528186" title="Earth's Creation"></page><page _idx="51117758" ns="0" pageid="51117758" title="Earth's Greatest Spectacles"></page><page _idx="24524199" ns="0" pageid="24524199" title="Earth's Last Citadel"></page><page _idx="24161171" ns="0" pageid="24161171" title="Earth's Man"></page><page _idx="37764175" ns="0" pageid="37764175" title="Earth's Own Food Company"></page><page _idx="48436511" ns="0" pageid="48436511" title="Earth's Quality"></page><page _idx="28004154" ns="0" pageid="28004154" title="Earth's Righ

In [28]:
# Collect every <page> tag; each one carries the page's id and title as attributes
earth_tags = earth_soup.find_all('page')

In [29]:
# Inspect the <page> tags that were found
earth_tags

[<page _idx="9228" ns="0" pageid="9228" title="Earth"></page>,
 <page _idx="9940789" ns="0" pageid="9940789" title="Earth's Answer"></page>,
 <page _idx="29803921" ns="0" pageid="29803921" title="Earth's Birthday Project"></page>,
 <page _idx="1833777" ns="0" pageid="1833777" title="Earth's Children"></page>,
 <page _idx="22528186" ns="0" pageid="22528186" title="Earth's Creation"></page>,
 <page _idx="51117758" ns="0" pageid="51117758" title="Earth's Greatest Spectacles"></page>,
 <page _idx="24524199" ns="0" pageid="24524199" title="Earth's Last Citadel"></page>,
 <page _idx="24161171" ns="0" pageid="24161171" title="Earth's Man"></page>,
 <page _idx="37764175" ns="0" pageid="37764175" title="Earth's Own Food Company"></page>,
 <page _idx="48436511" ns="0" pageid="48436511" title="Earth's Quality"></page>,
 <page _idx="28004154" ns="0" pageid="28004154" title="Earth's Rightful Ruler"></page>,
 <page _idx="39040241" ns="0" pageid="39040241" title="Earth's Skin"></page>,
 <page _idx="1

In [30]:
# Numeric page id of every <page> tag, in the order the tags were found
id_list = [int(page_tag.get('pageid')) for page_tag in earth_tags]

In [31]:
# Inspect the collected page ids
id_list

[9228,
 9940789,
 29803921,
 1833777,
 22528186,
 51117758,
 24524199,
 24161171,
 37764175,
 48436511,
 28004154,
 39040241,
 11360081,
 944638,
 11434033,
 41077022,
 14997569,
 146983,
 878461,
 4396171,
 29247528]

In [32]:
base_url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=xml&exintro=&'

# Fetch and parse the extract of every page id, collecting one DataFrame per page.
page_frames = []
for pageid in id_list:

    query = 'pageids=%i' % pageid

    # perform a GET request using the base_url and query
    # (timeout avoids hanging forever; raise_for_status() surfaces HTTP errors
    # instead of parsing an error body)
    response = requests.get(base_url + query, timeout=30)
    response.raise_for_status()
    xml_data = response.content

    xml2df = XML2DataFrame(xml_data)
    xml_dataframe_2 = xml2df.process_data()
    page_frames.append(xml_dataframe_2)

# Concatenate once after the loop rather than re-copying the growing frame on every
# iteration. join='inner' keeps only the columns shared by all frames.
# NOTE: this appends to the existing xml_dataframe, so re-running the cell adds the
# same pages again -- re-run the cell that builds xml_dataframe first.
xml_dataframe = pd.concat([xml_dataframe] + page_frames, ignore_index=True, join='inner')

In [33]:
# Select the page columns by name: the positional order of the columns depends on
# how the installed pandas version orders dict keys, so a positional slice is fragile.
xml_dataframe[['_idx', 'extract', 'ns', 'pageid', 'title']]

Unnamed: 0,_idx,extract,ns,pageid,title
0,14640471,<p><b>Mars</b> is the fourth planet from the S...,0,14640471,Mars
1,9228,<p><b>Earth</b> is the third planet from the S...,0,9228,Earth
2,9940789,<p><b>Earth's Answer</b> is a poem by William ...,0,9940789,Earth's Answer
3,29803921,"<ul><li class=""mw-empty-elt"">\n<li class=""mw-e...",0,29803921,Earth's Birthday Project
4,1833777,<p><i><b>Earth's Children</b></i> is a series ...,0,1833777,Earth's Children
5,22528186,<p><i><b>Earth's Creation</b></i> is a paintin...,0,22528186,Earth's Creation
6,51117758,<p><i><b>Earth's Greatest Spectacles</b></i> (...,0,51117758,Earth's Greatest Spectacles
7,24524199,<p><i><b>Earth's Last Citadel</b></i> is a sci...,0,24524199,Earth's Last Citadel
8,24161171,<p><i><b>Earth's Man</b></i> (Hungarian: <i><s...,0,24161171,Earth's Man
9,37764175,<p><b>Earth's Own Food Company</b> (previously...,0,37764175,Earth's Own Food Company
