In [18]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
import os
import time
from datetime import datetime
import concurrent.futures
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [19]:
load_dotenv()
api_key = os.getenv('IATI_API_KEY')

if not api_key:
    raise ValueError("API key not found. Please make sure it is set in the .env file or update it if necessary.")

Activity/XML

In [14]:
# Function to handle retry logic for requests
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Function to fetch data in XML format
def fetch_page_xml(start):
    params = {
        'q': '(sector_code:(11250 OR 12240 OR 31110 OR 31120 OR 31130 OR 31140 OR 31150 OR 31161 OR 31162 OR 31163 OR 31164 OR 31165 OR 31166 OR 31181 OR 31182 OR 31191 OR 31192 OR 31193 OR 31194 OR 31195 OR 31210 OR 31220 OR 31261 OR 31281 OR 31282 OR 31291 OR 31310 OR 31320 OR 31381 OR 31382 OR 31391 OR 32161 OR 32162 OR 43040 OR 43071 OR 43072 OR 43073 OR 52010) OR sector_vocabulary:2 OR sector_code:(311 OR 312 OR 313)) AND (title_narrative:("food security" OR "food insecurity") AND description_narrative:("food security" OR "food insecurity")) AND tag_vocabulary:2',
        'fq': 'activity_date_iso_date:[2021-01-01T00:00:00Z TO *]',
        'rows': 1000,
        'start': start
    }
    headers = {'Ocp-Apim-Subscription-Key': api_key}
    
    response = requests_retry_session().get(xml_base_url, headers=headers, params=params, timeout=30)
    if response.status_code == 200:
        return response.content  # Return raw XML content
    else:
        response.raise_for_status()

# Function to parse the XML data and extract relevant information
def parse_xml_data(xml_data):
    root = ET.fromstring(xml_data)
    activities = []
    for activity in root.findall('.//iati-activity'):
        iati_id = activity.find('iati-identifier').text if activity.find('iati-identifier') is not None else None
        title = activity.find('.//title/narrative').text if activity.find('.//title/narrative') is not None else None
        description = activity.find('.//description/narrative').text if activity.find('.//description/narrative') is not None else None
        
        sector_element = activity.find('.//sector/code')
        sector = sector_element.attrib.get('code') if sector_element is not None else None
        
        date_element = activity.find('.//activity-date[@type="2"]')  # type "2" is the actual start date
        date = date_element.attrib.get('iso-date') if date_element is not None else None
        
        country_element = activity.find('.//recipient-country')
        country = country_element.attrib.get('code') if country_element is not None else None
        
        # Extract SDG tags
        sdg_tags = activity.findall(".//tag[@vocabulary='2']")
        sdg_codes = [tag.get('code') for tag in sdg_tags]
        
        activities.append({
            'iati_identifier': iati_id,
            'title_narrative': title,
            'description_narrative': description,
            'sector_code': sector,
            'activity_date_iso_date': date,
            'recipient_country_code': country,
            'sdg_tags': sdg_codes  # Add SDG tags to the data structure
        })
    return activities



                                     iati_identifier  \
0                   XI-IATI-EC_INTPA-2020-PCC-412348   
1                                DAC-1601-INV-041387   
2  XM-DAC-47015-22403_Window3_USA-USAID-UnitedSta...   
3  XM-DAC-47015-22404_Window3_USA-USAID-UnitedSta...   
4                                DAC-1601-INV-038770   

                                     title_narrative  \
0  Strengthening Food Security Statistics at coun...   
1  Emergency Relief to Food Insecurity in Madagascar   
3            Grant: ACUTE FOOD INSECURITY (Window 3)   
4  Emergency Response to Food Insecurity in Burki...   

                               description_narrative sector_code  \
0  <p>This Project will strengthen the capacity o...        None   
1  to provide humanitarian aid to people affected...        None   
2  A $500,000 Window 3 grant from USA - USAID-Uni...        None   
3  A $1,000,000 Window 3 grant from USA - USAID-U...        None   
4  to provide humanitarian aid to people a

In [20]:

xml_base_url = "https://api.iatistandard.org/datastore/activity/iati"  # URL for XML data
xml_data = fetch_page_xml(start=0)  # Fetching the first page as an example


parsed_activities = parse_xml_data(xml_data)

df = pd.DataFrame(parsed_activities)




HTTPError: 401 Client Error: Access Denied for url: https://api.iatistandard.org/datastore/activity/iati?q=%28sector_code%3A%2811250+OR+12240+OR+31110+OR+31120+OR+31130+OR+31140+OR+31150+OR+31161+OR+31162+OR+31163+OR+31164+OR+31165+OR+31166+OR+31181+OR+31182+OR+31191+OR+31192+OR+31193+OR+31194+OR+31195+OR+31210+OR+31220+OR+31261+OR+31281+OR+31282+OR+31291+OR+31310+OR+31320+OR+31381+OR+31382+OR+31391+OR+32161+OR+32162+OR+43040+OR+43071+OR+43072+OR+43073+OR+52010%29+OR+sector_vocabulary%3A2+AND+sector_code%3A%28311+OR+312+OR+313%29%29+OR+%28title_narrative%3A%28%22food+security%22+OR+%22food+insecurity%22%29+AND+description_narrative%3A%28%22food+security%22+OR+%22food+insecurity%22%29%29+AND+tag_vocabulary%3A2&fq=activity_date_iso_date%3A%5B2021-01-01T00%3A00%3A00Z+TO+%2A%5D&rows=1000&start=0

In [None]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def fetch_page(start):
    params = {
        'q': '(sector_code:(11250 OR 12240 OR 31110 OR 31120 OR 31130 OR 31140 OR 31150 OR 31161 OR 31162 OR 31163 OR 31164 OR 31165 OR 31166 OR 31181 OR 31182 OR 31191 OR 31192 OR 31193 OR 31194 OR 31195 OR 31210 OR 31220 OR 31261 OR 31281 OR 31282 OR 31291 OR 31310 OR 31320 OR 31381 OR 31382 OR 31391 OR 32161 OR 32162 OR 43040 OR 43071 OR 43072 OR 43073 OR 52010) OR sector_vocabulary:2 AND sector_code:(311 OR 312 OR 313)) OR (title_narrative:("food security" OR "food insecurity") OR description_narrative:("food security" OR "food insecurity"))',
        'fl': 'iati_identifier,title_narrative,description_narrative,sector_code,activity_date_iso_date,activity_date_type,recipient_country_code',
        'fq': 'activity_date_type:2 AND activity_date_iso_date:[2021-01-01T00:00:00Z TO *]',  
        'rows': 1000,
        'start': start
    }
    headers = {'Ocp-Apim-Subscription-Key': api_key}
    
    for attempt in range(5):  
        try:
            response = requests_retry_session().get(base_url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            return response.json()['response']['docs']
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                wait = 2 ** attempt  # exponential backoff
                print(f"Rate limit hit. Waiting for {wait} seconds.")
                time.sleep(wait)
            else:
                print(f"HTTP error occurred: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")
        
    print(f"Failed to fetch data for start={start} after 5 attempts")
    return []

def get_total_results():
    params = {
        'q': '(sector_code:(11250 OR 12240 OR 31110 OR 31120 OR 31130 OR 31140 OR 31150 OR 31161 OR 31162 OR 31163 OR 31164 OR 31165 OR 31166 OR 31181 OR 31182 OR 31191 OR 31192 OR 31193 OR 31194 OR 31195 OR 31210 OR 31220 OR 31261 OR 31281 OR 31282 OR 31291 OR 31310 OR 31320 OR 31381 OR 31382 OR 31391 OR 32161 OR 32162 OR 43040 OR 43071 OR 43072 OR 43073 OR 52010) OR sector_vocabulary:2 AND sector_code:(311 OR 312 OR 313)) OR (title_narrative:("food security" OR "food insecurity") OR description_narrative:("food security" OR "food insecurity"))',
        'rows': 0
    }
    headers = {'Ocp-Apim-Subscription-Key': api_key}
    response = requests_retry_session().get(base_url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json()['response']['numFound']
    else:
        print(f"Error: {response.status_code}")
        return 0

base_url = "https://api.iatistandard.org/datastore/activity/select"
total_results = get_total_results()
all_activities = []

print(f"Total results to fetch: {total_results}")

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: 
    futures = [executor.submit(fetch_page, i) for i in range(0, total_results, 1000)]
    for i, future in enumerate(concurrent.futures.as_completed(futures)):
        all_activities.extend(future.result())
        print(f"Fetched page {i+1}/{len(futures)}")
        time.sleep(1) 

df = pd.DataFrame(all_activities)
print(df.head())
print(f"Total activities fetched: {len(df)}")