### Use the World Bank API to create per-country .csv files in S3

In [5]:
import requests # the library we'll use to call the API
import pandas as pd # good ole pandas
import json # Used to deal with the json objects that return from the API
import boto3 # You know this one!
import io  # Used for some input/output functions below

In [6]:
# Make sure credentials are set.
# Build a default boto3 session and ask STS for the caller identity as a quick
# credentials check before any S3 work is attempted.
session = boto3.Session()
sts = session.client('sts')
response = sts.get_caller_identity()
# Use the second '/'-separated piece of the caller's ARN as the user name.
# NOTE(review): assumes the ARN contains at least one '/'; an ARN without one
# would raise IndexError on the [1] lookup -- confirm for the identities in use.
my_username = response['Arn'].split('/')[1]
print(my_username)
# S3 client shared by the cells below (put_object / get_object / list_objects).
s3c = session.client('s3')

kcolvin


In [7]:
# Get all the countries (per_page=500 asks for the whole list in a single page)
url = 'https://api.worldbank.org/v2/country/?format=json&per_page=500'
# Call the API. The timeout stops the cell from hanging on a stalled connection,
# and raise_for_status() surfaces an HTTP error here instead of as a confusing
# JSON/index error on the next line.
r = requests.get(url, timeout=30)
r.raise_for_status()
# The code relies on element [1] of the JSON payload holding the country records
# (element [0] is not used). Load the records into a dataframe and keep only the
# two columns the later cells need.
cl_df = pd.DataFrame(r.json()[1])
cl_df = cl_df[['id','name']]
cl_df.head()

Unnamed: 0,id,name
0,ABW,Aruba
1,AFE,Africa Eastern and Southern
2,AFG,Afghanistan
3,AFR,Africa
4,AFW,Africa Western and Central


### GDP Indicator

In [None]:
# Function to save DataFrame to S3 in the GDP folder
def save_to_s3(fn, df, folder='GDP', bucket='gse580-read-only', client=None):
    """Upload a DataFrame to S3 as a .csv file.

    The object is written to ``athena/world-bank-data/<folder>/<fn>``.

    Parameters
    ----------
    fn : str
        File name of the object, e.g. ``'NY.GDP.MKTP.CD-BEL.csv'``.
    df : pandas.DataFrame
        Data to serialize; written with ``to_csv(index=False)``.
    folder : str, optional
        Sub-folder under ``athena/world-bank-data/``. Defaults to ``'GDP'``,
        which reproduces the original hard-coded key.
    bucket : str, optional
        Destination bucket. Defaults to ``'gse580-read-only'``.
    client : optional
        Object exposing ``put_object(Bucket=..., Key=..., Body=...)``.
        Defaults to the notebook-level ``s3c`` client.

    Returns
    -------
    bool
        True when the put_object response reports HTTP status 200,
        False otherwise (a message is printed in both cases).
    """
    if client is None:
        # Resolved at call time so the function can be defined before `s3c`
        # exists and can be exercised with a stand-in client.
        client = s3c
    key = f'athena/world-bank-data/{folder}/{fn}'

    # Serialize to an in-memory text buffer; no temporary file is needed.
    with io.StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index=False)
        body = csv_buffer.getvalue()

    response = client.put_object(Bucket=bucket, Key=key, Body=body)
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")
    return status == 200

In [4]:
# Loop through all countries and save one GDP .csv per country to S3.
# The pieces of the URL that do not change are built once, outside the loop.
base_url = 'https://api.worldbank.org/v2/countries/'
ind = 'NY.GDP.MKTP.CD/?'  # indicator code used for the GDP folder
form = 'format=json'
num  = '&per_page=500'

for index, row in cl_df.iterrows():
    ctry = row['id'] + '/'

    # Build the final string
    url = base_url + ctry + 'indicators/' + ind + form + num
    print(url)
    print('Country:',row['name'])

    # Call the API, convert to df and save to S3. Everything that can fail for
    # one country (network, HTTP status, JSON shape, missing columns, upload)
    # is inside the try so a single bad country does not stop the loop.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        df = pd.DataFrame(r.json()[1])
        # Clean up the df
        df2 = df[['countryiso3code','date','value']]
        fn = ind.split('/')[0] + '-' + row['id'] + '.csv' # Build filename
        print(fn)
        # Save it to s3
        save_to_s3(fn,df2)
    except Exception as err:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working
        # during this long loop; the reason is printed so failures can be diagnosed.
        print("Something didn't work with:",row['id'],row['name'])
        print('  Reason:', repr(err))

### Total Population Indicator

In [8]:
# Function to save DataFrame to S3 in the POP folder
def save_to_s3(fn, df, folder='POP', bucket='gse580-read-only', client=None):
    """Upload a DataFrame to S3 as a .csv file.

    The object is written to ``athena/world-bank-data/<folder>/<fn>``.

    Parameters
    ----------
    fn : str
        File name of the object, e.g. ``'SP.POP.TOTL-BEL.csv'``.
    df : pandas.DataFrame
        Data to serialize; written with ``to_csv(index=False)``.
    folder : str, optional
        Sub-folder under ``athena/world-bank-data/``. Defaults to ``'POP'``,
        which reproduces the original hard-coded key.
    bucket : str, optional
        Destination bucket. Defaults to ``'gse580-read-only'``.
    client : optional
        Object exposing ``put_object(Bucket=..., Key=..., Body=...)``.
        Defaults to the notebook-level ``s3c`` client.

    Returns
    -------
    bool
        True when the put_object response reports HTTP status 200,
        False otherwise (a message is printed in both cases).
    """
    if client is None:
        # Resolved at call time so the function can be defined before `s3c`
        # exists and can be exercised with a stand-in client.
        client = s3c
    key = f'athena/world-bank-data/{folder}/{fn}'

    # Serialize to an in-memory text buffer; no temporary file is needed.
    with io.StringIO() as csv_buffer:
        df.to_csv(csv_buffer, index=False)
        body = csv_buffer.getvalue()

    response = client.put_object(Bucket=bucket, Key=key, Body=body)
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")
    return status == 200

In [None]:
# Loop through all countries and save one population .csv per country to S3.
# The pieces of the URL that do not change are built once, outside the loop.
base_url = 'https://api.worldbank.org/v2/countries/'
ind = 'SP.POP.TOTL/?'  # indicator code used for the POP folder
form = 'format=json'
num  = '&per_page=500'

for index, row in cl_df.iterrows():
    ctry = row['id'] + '/'

    # Build the final string
    url = base_url + ctry + 'indicators/' + ind + form + num
    print(url)
    print('Country:',row['name'])

    # Call the API, convert to df and save to S3. Everything that can fail for
    # one country (network, HTTP status, JSON shape, missing columns, upload)
    # is inside the try so a single bad country does not stop the loop.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        df = pd.DataFrame(r.json()[1])
        # Clean up the df
        df2 = df[['countryiso3code','date','value']]
        fn = ind.split('/')[0] + '-' + row['id'] + '.csv' # Build filename
        print(fn)
        # Save it to s3
        save_to_s3(fn,df2)
    except Exception as err:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working
        # during this long loop; the reason is printed so failures can be diagnosed.
        print("Something didn't work with:",row['id'],row['name'])
        print('  Reason:', repr(err))

In [None]:
# Check a specific file to make sure it is valid
#
# Load a .csv file from S3 straight into a pandas df
bucket = 'gse580-read-only'
key = 'athena/world-bank-data/POP/SP.POP.TOTL-BEL.csv'
#
# Call the 'get_object' function from boto3. The object comes back inside the
# response (under 'Body'), so no local file is written.
response = s3c.get_object(Bucket=bucket, Key=key)
#
# Get the HTTPStatusCode from the response
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    # If all OK, then create the DataFrame
    print(f"Successful S3 get_object response. Status - {status}")
    df = pd.read_csv(response.get('Body'))
else:
    # See what the response is and troubleshoot
    print(f"Unsuccessful S3 get_object response. Status - {status}")
    # Stop here: falling through would display a stale `df` left over from an
    # earlier cell (or raise NameError on a fresh kernel), which looks like success.
    raise RuntimeError(f"Could not load s3://{bucket}/{key} (status {status})")
#
# Show the first rows only, rather than dumping the whole frame
df.head()

In [None]:
# Verify it exists (uses `bucket` and `key` from the previous cell).
#
# Ask S3 only for keys that start with `key` and page through the results.
# A single un-prefixed list call returns at most 1000 keys, so on a bucket with
# more objects the file could be missed; and a listing with no matches has no
# 'Contents' entry, hence the .get() with an empty default.
paginator = s3c.get_paginator('list_objects_v2')
all_objects = [
    obj
    for page in paginator.paginate(Bucket=bucket, Prefix=key)
    for obj in page.get('Contents', [])
]
for obj in all_objects:
    print('It does exist:')
    print(obj['Key'])
if not all_objects:
    # Say so explicitly instead of finishing silently
    print('It does NOT exist:', key)