# Lesson 7: Advanced Web Scraping and Data Gathering
## Topic 2: Read data from XML

In [1]:
# No installation step is required: `xml.etree.ElementTree` ships with the
# Python standard library. PyPI has no distribution named `xml`, so running
# `pip install xml` can only fail with "No matching distribution found".


In [2]:
import xml.etree.ElementTree as ET

### Exercise 11: Create some random data yourself to understand the XML data format better

In [5]:
# A small hand-written XML document kept in a plain Python string.
# <person> is the single top-level element. <phone> and <email> carry
# attributes (type="intl", hide="yes"), and their text is padded with
# whitespace/newlines, which is why later cells call .strip() on it.
data = '''
<person>
  <name>Dave</name>
  <surname>Piccardo</surname>
  <phone type="intl">
     +1 742 101 4456
   </phone>
   <email hide="yes">
   dave.p@gmail.com</email>
</person>'''

In [6]:
# Show the raw XML text exactly as it is stored in the string.
print(data)


<person>
  <name>Dave</name>
  <surname>Piccardo</surname>
  <phone type="intl">
     +1 742 101 4456
   </phone>
   <email hide="yes">
   dave.p@gmail.com</email>
</person>


In [7]:
# Confirm that `data` is an ordinary str; nothing has been parsed yet.
type(data)

str

### Exercise 12: Read the string data as an XML `Element` object 

In [8]:
# Parse the XML text. ET.fromstring returns the top-level Element (<person>
# here), not an ElementTree, despite the variable being called `tree`.
tree = ET.fromstring(data)

In [9]:
# Inspect the type of the object produced by ET.fromstring.
type(tree)

xml.etree.ElementTree.Element

### Exercise 13: Find various elements of data within the tree (element)

In [10]:
# Look up the <name> child of the parsed element and show its text content
name_elem = tree.find('name')
print('Name:', name_elem.text)

Name: Dave


In [11]:
# Same lookup for the <surname> child
surname_elem = tree.find('surname')
print('Surname:', surname_elem.text)

Surname: Piccardo


In [12]:
# The <phone> text is surrounded by whitespace in the XML, so trim it before display
phone_text = tree.find('phone').text
print('Phone:', phone_text.strip())

Phone: +1 742 101 4456


In [13]:
# Fetch <email> once, then report its `hide` attribute and its trimmed text
email_elem = tree.find('email')
print('Email hidden:', email_elem.get('hide'))
print('Email:', email_elem.text.strip())

Email hidden: yes
Email: dave.p@gmail.com


### Exercise 14: Read from a local XML file (perhaps downloaded) into an `ElementTree` object

In [14]:
# Load and parse the XML document stored in the local file 'xml1.xml'
tree2 = ET.parse('xml1.xml')

In [15]:
# ET.parse yields an ElementTree wrapper, unlike ET.fromstring, which returned an Element.
type(tree2)

xml.etree.ElementTree.ElementTree

### Exercise 15: How to 'traverse' the tree? Find the `root` and explore all `child` nodes and their `attributes`

In [16]:
# Grab the top-level element of the parsed document; the traversal cells start from it
root = tree2.getroot()

In [17]:
# List every direct child of the root with its tag and its attribute dictionary
for node in root:
    print("Child tag:", node.tag, "| Child attribute:", node.attrib)

Child tag: country | Child attribute: {'name': 'Liechtenstein'}
Child tag: country | Child attribute: {'name': 'Singapore'}
Child tag: country | Child attribute: {'name': 'Panama'}


### Exercise 16: Use the `.text` attribute to extract meaningful data

In [18]:
# Third child (index 2) of the root's first child; the output shows it is the <gdppc> element.
root[0][2]

<Element 'gdppc' at 0x000001ABFE0C6EF0>

In [19]:
# Text content of that element; note it comes back as a string, not a number.
root[0][2].text

'141100'

In [20]:
# Tag name of the same element.
root[0][2].tag

'gdppc'

In [21]:
# First child of the root element.
root[0]

<Element 'country' at 0x000001ABFE14EB80>

In [22]:
# Tag name of the root's first child.
root[0].tag

'country'

In [23]:
# Attribute dictionary of the root's first child.
root[0].attrib

{'name': 'Liechtenstein'}

### Exercise 17: Write a loop to extract and print the GDP per capita for each country

In [24]:
# Print each country's name next to its GDP per capita.
# The <gdppc> child is looked up by tag rather than by position (c[2]), so the
# loop keeps working if a country's children are reordered or one is absent.
for c in root:
    country_name = c.attrib['name']
    gdppc_text = c.findtext('gdppc')  # None when there is no <gdppc> child
    if not gdppc_text:
        # Missing or empty <gdppc>: report it instead of raising.
        print("{}: {}".format(country_name, "n/a"))
        continue
    gdppc = int(gdppc_text)  # element text is a string; convert to a number
    print("{}: {}".format(country_name, gdppc))

Liechtenstein: 141100
Singapore: 59900
Panama: 13600


### Exercise 18: Find all the neighboring countries for each country and print them
Note how to use `findall` and `attrib` together

In [25]:
# For every country, list the `name` attribute of each of its <neighbor> children
for country in root:
    neighbor_elems = country.findall('neighbor')
    print("Neighbors\n" + "-" * 25)
    for neighbor in neighbor_elems:
        print(neighbor.attrib['name'])
    print('\n')

Neighbors
-------------------------
Austria
Switzerland


Neighbors
-------------------------
Malaysia


Neighbors
-------------------------
Costa Rica
Colombia




In [30]:
# Visit every element yielded by root.iter() and print its text.
# As the output shows, some elements print only whitespace and others print None.
for node in root.iter():
    print(node.text)


    

        
1
2008
141100
None
None

        
4
2011
59900
None

        
68
2011
13600
None
None


### Exercise 19: A simple demo of using XML data obtained by web scraping

In [26]:
import urllib.request, urllib.parse, urllib.error

In [27]:
# Base URL of the API endpoint. It already ends in '?', so the URL-encoded
# query string is appended directly to it in a later cell.
serviceurl = 'http://www.recipepuppy.com/api/?'

In [29]:
# Search term sent to the API. To prompt for it interactively instead, use:
#   item = input("Enter the name of a food item: ")
item = "jalapeno"

# Build the whole query string with urlencode so every parameter is escaped
# the same way, instead of concatenating '&p=1&format=xml' by hand.
query = urllib.parse.urlencode({'q': item, 'p': 1, 'format': 'xml'})
url = serviceurl + query

# The timeout keeps this cell from blocking forever if the service never
# answers. urlopen raises urllib.error.HTTPError for HTTP error statuses and
# urllib.error.URLError for connection problems.
uh = urllib.request.urlopen(url, timeout=30)

HTTPError: HTTP Error 500: Internal Server Error

In [None]:
# Read the whole response body, then close the response so the underlying
# HTTP connection is released even if reading or decoding fails.
# NOTE: this rebinds `data`, replacing the hand-written XML string from earlier.
try:
    data = uh.read().decode()  # bytes -> str (decode() defaults to UTF-8)
finally:
    uh.close()
print('Retrieved', len(data), 'characters')
# Parse the downloaded XML text into an Element.
tree3 = ET.fromstring(data)

In [None]:
# Inspect the type of the object parsed from the downloaded XML.
type(tree3)

In [None]:
# Dump the text of every element in the downloaded document
for node in tree3.iter():
    print(node.text)

In [None]:
# Show the raw XML text that was downloaded.
print(data)

In [None]:
# Print the title and link of every element that has both a <title> and an
# <href> direct child.
for e in tree3.iter():
    h = e.find('href')
    t = e.find('title')
    # find() returns None when there is no match; test identity with
    # `is not None` (the idiomatic None check) rather than `!= None`.
    if h is not None and t is not None:
        print("Recipe Link for:", t.text)
        print(h.text)
        print("-" * 100)