
# About

**Goal:** Extraction, transformation, and load data the ReliefWeb API.

**API:** http://reliefweb.int/help/api

**Situation:** 
An organization wants to explore reports from the ReliefWebAPI. Write a script to extract, transform, and load meta data from the API. Use text from popular sources as terms for search queries.

**Actions:**

- Connect to the ReliefWebAPI.
- Systematically search and download meta data.
- Preprocess data into tabular format.

**Audience:** Intermediate level Tensorflow users.

In [1]:
import pandas as pd
# import warnings
# warnings.filterwarnings("ignore")
import json
from urllib.request import urlopen
import codecs
import csv
import os
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from IPython.display import HTML

# Pandas Printing Options
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 5000)

# What is the ReliefWeb API?

In [13]:
HTML("<iframe src='http://reliefweb.int/help/api' width=1000 height=300></iframe>")

In [2]:
# set-up
stoplist = stopwords.words('english')
query = "http://api.rwlabs.org/v1/reports?query[value]={}&limit=1000&filter[field]=headline&fields[include][]=headline.summary&fields[include][]=theme.name&fields[include][]=date"

# Millennium Development Goals

In [14]:
HTML("<iframe src='https://en.wikipedia.org/wiki/Millennium_Development_Goals' width=1000 height=300 zoom=80></iframe>")

# Universal Declaration of Human Rights

In [17]:
HTML("<iframe src='http://www.un.org/en/universal-declaration-human-rights/' width=1000 height=300 zoom=80></iframe>")

# Helper Functions

In [11]:
def load_and_cleanse(path):
    assert (path[-4:] == '.txt')
    text = open(path).read()
    # tokenize into words
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


def get_request(url_query):
    response = urlopen(url_query).read().decode('utf-8')
    responseJson = json.loads(response)
    return responseJson.get("data")


def print_request(article, search_term=None, check=False):
    if check and search_term:
        print(article)
        print()

def request_from_list(search_words):
    for word in search_words:
        url_query = query.format(word)
        api_data = get_request(url_query)
        output_path = './outputs/relief_{}.csv'.format(word)
        with codecs.open(output_path, "w", encoding='utf8') as csvfile:
            reliefwriter = csv.writer(csvfile, delimiter=",")

            reliefwriter.writerow([
                "article_id", "search_term", "relief_score", "created_date",
                "headline", "title", "themes"
            ])

            for article in api_data:
                # optional request check
                print_request(article, search_term='universal', check=False)
                
                if "headline" in article[
                        "fields"].keys() and "theme" in article["fields"].keys(
                        ) and "title" in article["fields"].keys():
                    article_id = article["id"]
                    relief_score = article["score"]
                    title = article['fields']['title']
                    created_date = article['fields']['date']['created']
                    headline = article["fields"]["headline"]["summary"]
                    dummy = [x.strip('"') for x in headline.split()]
                    headline = " ".join(dummy)
                    url = article['href']
                    themes = []

                    for x in range(len(article["fields"]["theme"])):
                        theme = article["fields"]["theme"][x].get("name")
                        themes.append(theme)

                else:
                    continue

                try:
                    reliefwriter.writerow([
                        article_id, word, relief_score, created_date, headline, title, themes, url
                    ])

                except (UnicodeEncodeError, UnicodeEncodeError):
                    article_id = "Missing"
                    headline = "Missing"
                    title = article['fields']['title']
                    reliefwriter.writerow(
                        [article_id, word, relief_score, created_date, headline, title, themes, url])

        print("{} file saved to {}".format(word, output_path))
        
#
millennium_development_goals = load_and_cleanse('MDGs.txt')
decleration_of_human_rights = load_and_cleanse('Universal_Declaration_of_Human_Rights.txt')

docs = decleration_of_human_rights + millennium_development_goals
tokens = docs.split()
examples = tokens[:5]

# Build Search Term Vocabulary

In [12]:
millennium_development_goals = load_and_cleanse('MDGs.txt')
decleration_of_human_rights = load_and_cleanse('Universal_Declaration_of_Human_Rights.txt')

# Extract, Transform, and Load

In [15]:
request_from_list(examples)

universal file saved to ./outputs/relief_universal.csv
declaration file saved to ./outputs/relief_declaration.csv
human file saved to ./outputs/relief_human.csv
right file saved to ./outputs/relief_right.csv
preamble file saved to ./outputs/relief_preamble.csv


# Preview Output

In [16]:
example_output = pd.read_csv("./outputs/relief_universal.csv")
example_output

Unnamed: 0,article_id,search_term,relief_score,created_date,headline,title,themes
1431191,universal,8.872,2016-03-09T17:17:34+00:00,Amnesty International expresses lingering conc...,Sudan: Dire Human Rights Situation Continues -...,['Protection and Human Rights'],http://api.rwlabs.org/v1/reports/1431191
1786646,universal,8.872,2016-11-20T13:04:51+00:00,Universal Children’s Day is more than a day to...,UNICEF Executive Director Anthony Lake stateme...,['Protection and Human Rights'],http://api.rwlabs.org/v1/reports/1786646
1540416,universal,8.872,2016-05-25T02:47:51+00:00,While access to medical care is a growing issu...,Universal access of populations to health care...,"['Health', 'Safety and Security']",http://api.rwlabs.org/v1/reports/1540416
564606,universal,8.872,2013-03-22T12:19:27+00:00,The launch of new Health Sector Strategic Plan...,New plan to ensure universal healthcare in Som...,['Health'],http://api.rwlabs.org/v1/reports/564606
617194,universal,8.872,2013-11-20T02:52:19+00:00,Tens of thousands of Syrian children in Jordan...,Universal Children's Day: Syrian refugees rely...,"['Education', 'Protection and Human Rights']",http://api.rwlabs.org/v1/reports/617194
703416,universal,8.872,2014-10-27T19:27:14+00:00,This study outlines the steps to achieving uni...,Achieving universal sanitation: Sharing the ex...,['Water Sanitation Hygiene'],http://api.rwlabs.org/v1/reports/703416
462094,universal,8.872,2011-11-30T10:30:00+00:00,30 NOVEMBER 2011 ¦ GENEVA -- Global progress i...,Global HIV/Aids response - Epidemic update and...,"['Health', 'HIV/Aids']",http://api.rwlabs.org/v1/reports/462094
527217,universal,8.872,2012-09-27T00:31:04+00:00,26 September 2012 – Secretary-General Ban Ki-m...,UN chief unveils $1.5 billion initiative to ac...,['Education'],http://api.rwlabs.org/v1/reports/527217
