In [1]:
#Load in some libraries to handle the web page requests and the web page parsing...
import requests

#You may need to install BeautifulSoup
#!pip3 install beautifulsoup4
from bs4 import BeautifulSoup

#Note - I'm in Python3
from urllib.parse import parse_qs

#The scraper will be limited to just the first results page...
def searchUNdata(q):
    ''' Run a search on the UN data website and scrape the results '''
    
    params={'q':q}
    url='http://data.un.org/Search.aspx'

    response = requests.get(url,params=params)

    soup=BeautifulSoup(response.content)

    results={}

    #Get the list of results
    searchresults=soup.findAll('div',{'class':'Result'})
    
    #For each result, parse out the name of the dataset, the datamart ID and the data filter ID
    for result in searchresults:
        h2=result.find('h2')
        #We can find everything we need in the <a> tag...
        a=h2.find('a')
        p=parse_qs(a.attrs['href'])
        results[a.text]=(p['d'][0],p['f'][0])

    return results

In [2]:
#A couple of helper functions to let us display the results

results=searchUNdata('death')

def printResults(results):
    ''' Nicely print the search results '''
    
    for result in results.keys():
        print(result)


def unDataSearch(q):
    ''' Simple function to take a search phrase, run the search on the UN data site, and print and return the results. '''
    
    results=searchUNdata(q)
    printResults(results)
    return results

printResults(results)

#q='mortality'
#unDataSearch(q)

Infant mortality rate, for both sexes combined (infant deaths per 1,000 live births)
Under-five mortality, for both sexes combined (deaths under age five per 1,000 live births)
Malaria - number of reported deaths
Number of infant deaths (thousands)
Number of neonatal deaths (thousands)
Crude death rate (per 1000 population)
Number of under-five deaths (thousands)
Deaths due to HIV/AIDS (per 100 000 population)
Civil registration coverage of cause-of-death (%)
Number of deaths due to tuberculosis, excluding HIV
Ill-defined causes in cause-of-death registration (%)
Deaths due to tuberculosis among HIV-negative people (per 100 000 population)
AIDS-related deaths
Crude death rate (CDR)
Deaths by month of death
Deaths averted due to ART
Lifetime risk of maternal death
Foetal deaths by gestational age
Number of male deaths (thousands)
Late foetal deaths by age of woman


In [3]:
#Just in case - a helper routine for working with the search results data
def search(d, substr):
    ''' Partial string match search within dict key names '''
    #via http://stackoverflow.com/a/10796050/454773
    
    result = []
    for key in d:
        if substr.lower() in key.lower():
            result.append((key, d[key])) 

    return result

In [4]:
search(results, 'Deaths by month of death')

[('Deaths by month of death', ('POP', 'tableCode:65'))]

In [5]:
#Note - I'm in Python3
from io import BytesIO

import zipfile
import pandas as pd

def getUNdata(undataSearchResults,dataset):
    ''' Download a named dataset from the UN Data website and load it into a pandas dataframe '''

    datamartID,seriesRowID=undataSearchResults[dataset]
    
    url='http://data.un.org/Handlers/DownloadHandler.ashx?DataFilter='+seriesRowID+'&DataMartId='+datamartID+'&Format=csv'

    r = requests.get(url)
    
    
    s=BytesIO(r.content)
    z = zipfile.ZipFile(s)
    
    #Show the files in the zip file
    #z.namelist()
    
    #Let's assume we just get one file per zip...
    #Drop any all blank columns
    df=pd.read_csv( BytesIO( z.read( z.namelist()[0] ) )).dropna(axis=1,how='all')
    return df

In [6]:
results=unDataSearch('Deaths by month of death')

Deaths by month of death


In [7]:
dd=getUNdata(results,'Deaths by month of death')

#Preview the last few rows
dd[:5]

Unnamed: 0,Country or Area,Year,Area,Month,Record Type,Reliability,Source Year,Value,Value Footnotes,Value Footnotes.1
0,Åland Islands,2018,Total,Total,Data tabulated by year of occurrence,"Final figure, complete",2020.0,272.0,,
1,Åland Islands,2018,Total,January,Data tabulated by year of occurrence,"Final figure, complete",2020.0,23.0,,
2,Åland Islands,2018,Total,February,Data tabulated by year of occurrence,"Final figure, complete",2020.0,29.0,,
3,Åland Islands,2018,Total,March,Data tabulated by year of occurrence,"Final figure, complete",2020.0,29.0,,
4,Åland Islands,2018,Total,April,Data tabulated by year of occurrence,"Final figure, complete",2020.0,19.0,,


In [8]:
#One thing to note is that footnotes may appear at the bottom of a dataframe
#We can spot the all empty row and drop rows from that
#We can also drop the footnote related columns
def dropFootnotes(df):
    return df[:pd.isnull(dd).all(1).nonzero()[0][0]].drop(['Value Footnotes','Value Footnotes.1'], 1)

dropFootnotes(dd)[-5:]

  """


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
#Create a function that automatically drops the footnotes and any empty rows
def getUNdata2(undataSearchResults, dataset, footnotes=False):
    df=getUNdata(undataSearchResults, dataset)
    if footnotes:
        return df
    return dropFootnotes(df)


In [None]:
getUNdata2(results,'Deaths by month of death')[5:5]

In [None]:
getUNdata2(results,'Deaths by month of death',footnotes=True)[-5:]