# Web Scraping Example with lxml and requests

Adapted from: https://data-lessons.github.io/library-webscraping/

We will use the `lxml`, `requests` and `cssselect` packages to parse a database of UN Security Council Resolutions.

In [2]:
import requests
import lxml
import cssselect

## Download a page with Requests

In [3]:
# Fetch the 2016 resolutions index page and dump its raw HTML
page_url = 'http://www.un.org/en/sc/documents/resolutions/2016.shtml'
response = requests.get(page_url)
print(response.text)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr"><!-- InstanceBegin template="/Templates/SC_Template.dwt" codeOutsideHTMLIsLocked="false" -->
<head>
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<!-- InstanceBeginEditable name="Title/Metatags" -->
<title>Resolutions adopted by the United Nations Security Council in 2016</title>
<meta name="dc.title" property="og:title" content="Resolutions adopted by the United Nations Security Council in 2016" />
<!-- InstanceEndEditable -->
<meta property="og:image" content="http://www.un.org/en/sc/img/fbsc.jpg"/>
<meta name="dc.subject" content="security council, SC, UNSC, security, peace, sanctions, veto, resolution, president, united nations, UN, peacekeeping, peacebuilding, conflict resolution, prevention" />
<meta name=

In [4]:
import lxml.html

# Build an element tree from the downloaded page text
# (note: this line differs from the one on the original web site)
tree = lxml.html.fromstring(response.text)
print("Type of tree:", type(tree))

# xpath() and cssselect() both return a list of nodes, found relative to
# tree, that match the given expression; we keep the first node of each list
title_elem = tree.xpath('//title')[0]
title_matches = tree.cssselect('title')  # CSS equivalent of the XPath above
title_elem = title_matches[0]
print("length of cssselect call: {}".format(len(title_matches)))
print("Type of title_elem:", type(title_elem))

print("title tag:", title_elem.tag)
print("title text:", title_elem.text_content())

# serialise the element back to html/xml
title_html = lxml.html.tostring(title_elem)
print("title html:", title_html)

# walk up the node hierarchy
parent_elem = title_elem.getparent()
print("title's parent's tag:", parent_elem.tag)

# tab to find api contents
#title_elem
# shift-tab to get definition
#title_elem.getparent

Type of tree: <class 'lxml.html.HtmlElement'>
length of cssselect call: 1
Type of title_elem: <class 'lxml.html.HtmlElement'>
title tag: title
title text: Resolutions adopted by the United Nations Security Council in 2016
title html: b'<title>Resolutions adopted by the United Nations Security Council in 2016</title>\r\n'
title's parent's tag: head


## Scraping the contents of the pages
Two steps:
- First we get a list of URLs to scrape
- Then we scrape the list of URLs

### Getting list of URLs to scrape

In [None]:
def get_year_urls():
    """Return a list of (year_url, year) pairs.

    Downloads the resolutions index page, looks for the single
    ``#content > table`` and collects the links inside it whose text is a
    4-digit year. ``year_url`` is the link target resolved against the index
    URL and ``year`` is the link text (a string) with surrounding whitespace
    removed.

    Problems are reported with ``print`` rather than raised: an empty list
    is returned when the page does not contain exactly one matching table,
    and links with no ``href`` or with non-year text are skipped.
    """
    start_url = 'http://www.un.org/en/sc/documents/resolutions/'
    response = requests.get(start_url)
    tree = lxml.html.fromstring(response.text)
    tables = tree.cssselect('#content > table')
    # Check you captured something and not more than you expected
    if len(tables) != 1:
        print('Expected exactly 1 table, got {}'.format(len(tables)))
        return []

    table = tables[0]
    links = table.cssselect('a')

    out = []
    for link in links:
        # An anchor without an href cannot be followed; skip it instead of
        # failing with a KeyError on the attribute lookup.
        href = link.get('href')
        if href is None:
            print("Link '{}' has no href".format(link.text_content()))
            continue

        # Strip first so whitespace in the markup around the year does not
        # make an otherwise valid link fail the 4-digit check.
        year = link.text_content().strip()
        if len(year) != 4 or not year.isdigit():
            print("Link text '{}' is not an integer".format(link.text_content()))
            continue

        year_url = requests.compat.urljoin(start_url, href)
        out.append((year_url, year))

    # Check we got something
    if not out:
        print('Expected some year URLs, got none')
    return out

In [None]:
# Scrape the index page once and keep the (year_url, year) pairs for inspection
urls = get_year_urls()

In [None]:
# Show the list of (year_url, year) pairs returned by get_year_urls()
print(urls)

In [None]:
def clean_text(element):
    """Return the element's text content with whitespace normalised.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is removed.
    """
    return ' '.join(element.text_content().split())

def get_resolutions_for_year(year_url, year):
    """Return a list of resolutions scraped from one year's page.

    Each resolution is represented as a dict like::

        {'date': ..., 'symbol': ..., 'url': ..., 'title': ...}

    ``year_url`` is the page to download. ``year`` is used as the ``date``
    for rows that have only two cells (no date column).

    Problems are reported with ``print`` rather than raised: an empty list
    is returned when the page does not contain exactly one
    ``#content > table``, and rows that do not have the expected layout are
    skipped.
    """
    response = requests.get(year_url)
    tree = lxml.html.fromstring(response.text)
    tables = tree.cssselect('#content > table')
    # Check you captured something and not more than you expected
    if len(tables) != 1:
        print('Expected exactly 1 table, got {}'.format(len(tables)))
        return []
    table = tables[0]
    out = []

    for row_elem in table.cssselect('tr'):
        resolution = {}
        children = row_elem.getchildren()
        if len(children) == 1:
            # Assume that a row with 1 element is the header
            continue

        # Validate the cell count before indexing into the row, so an empty
        # <tr> is reported and skipped instead of raising IndexError.
        if len(children) not in (2, 3):
            print('Unexpected number of children in row element: {}'.format(len(children)))
            continue

        # symbol
        resolution['symbol'] = clean_text(children[0])

        # date
        if len(children) == 3:
            # there is a date column
            resolution['date'] = clean_text(children[1])
        else:
            # adopt the year for the page
            resolution['date'] = year

        # url
        symbol_links = children[0].cssselect('a')
        if len(symbol_links) != 1:
            print('Expected 1 link in the symbol column, got {}'.format(len(symbol_links)))
            continue
        relative_url = symbol_links[0].get('href')
        if relative_url is None:
            # An anchor without an href gives us nothing to resolve
            print('Link in the symbol column has no href')
            continue
        # Resolve against the final response URL so redirects are honoured
        resolution['url'] = requests.compat.urljoin(response.url, relative_url)

        # title (always the last cell, with or without a date column)
        resolution['title'] = clean_text(children[-1])

        # Append to out
        out.append(resolution)

    # Check we got something
    if not out:
        print('Expected some resolutions for {}, got none'.format(year))
    return out


In [None]:
# Test get_resolutions_for_year on 2016
# Smoke-test get_resolutions_for_year against the 2016 page
url_2016 = "http://www.un.org/en/sc/documents/resolutions/2016.shtml"
resolutions = get_resolutions_for_year(url_2016, "2016")
for entry in resolutions:
    print(entry)

## Creating a CSV file from scraped results

In [None]:
import csv
import time

# Scrape every year page and write all resolutions to one CSV file.
# newline='' lets the csv module control line endings itself (otherwise
# blank rows appear between records on Windows); an explicit utf-8 encoding
# keeps non-ASCII characters in titles from depending on the system locale.
with open('unsc-resolutions.csv', 'w', newline='', encoding='utf-8') as out_file:
    writer = csv.DictWriter(out_file, ['date', 'symbol', 'title', 'url'])
    writer.writeheader()

    # Loop over years
    for year_url, year in get_year_urls():
        time.sleep(0.1)  # Wait a moment between requests to be polite to the server

        print('Processing:', year_url)
        year_resolutions = get_resolutions_for_year(year_url, year)

        # Each resolution dict carries exactly the four header fields
        for resolution in year_resolutions:
            writer.writerow(resolution)