## JSON Mini Project Using Data From World Bank Projects
****

In [102]:
import pandas as pd

In [101]:
# Read the World Bank projects JSON file into a pandas DataFrame
projects_json_path = 'data/world_bank_projects.json'
df = pd.read_json(projects_json_path)

In [8]:
# Print a summary of the DataFrame: index range, column names,
# non-null count per column, and each column's dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 50 columns):
_id                         500 non-null object
approvalfy                  500 non-null int64
board_approval_month        500 non-null object
boardapprovaldate           500 non-null object
borrower                    485 non-null object
closingdate                 370 non-null object
country_namecode            500 non-null object
countrycode                 500 non-null object
countryname                 500 non-null object
countryshortname            500 non-null object
docty                       446 non-null object
envassesmentcategorycode    430 non-null object
grantamt                    500 non-null int64
ibrdcommamt                 500 non-null int64
id                          500 non-null object
idacommamt                  500 non-null int64
impagency                   472 non-null object
lendinginstr                495 non-null object
lendinginstrtype            495 non

****
## Top Ten Countries With Most Projects

In [14]:
# This analysis treats every row of df as one project, so the number of
# rows that carry a given 'countryname' is that country's project count.
# value_counts() tallies the rows per distinct name, most frequent first;
# head(10) then keeps the ten largest tallies.

projects_per_country = df['countryname'].value_counts()
projects_per_country.head(10)

People's Republic of China         19
Republic of Indonesia              19
Socialist Republic of Vietnam      17
Republic of India                  16
Republic of Yemen                  13
Kingdom of Morocco                 12
Nepal                              12
People's Republic of Bangladesh    12
Republic of Mozambique             11
Africa                             11
Name: countryname, dtype: int64

****
## Create New DataFrame With Missing Theme Names in the 'mjtheme_namecode' Column Filled In

In [75]:
# Build a lookup from theme code to theme name.
# Each cell of 'mjtheme_namecode' is a list of {'code': ..., 'name': ...}
# dicts; entries whose name is the empty string are skipped, and the first
# non-empty name seen for a code is the one that is kept.

codes = {}

for project_themes in df['mjtheme_namecode']:
    for entry in project_themes:
        if entry['name'] == '':
            continue
        # setdefault only stores the name when the code is not yet a key
        codes.setdefault(entry['code'], entry['name'])

In [77]:
# Load a second DataFrame from the same JSON file; the loop below edits
# the theme dicts stored in df2 in place.

df2 = pd.read_json('data/world_bank_projects.json')

# Wherever a theme entry has an empty name, look the name up in `codes`
# by the entry's code and write it back into the entry.

for project_themes in df2['mjtheme_namecode']:
    for entry in project_themes:
        if entry['name'] != '':
            continue
        entry['name'] = codes[entry['code']]

****
## Top Ten Major Project Themes

In [80]:
# Tally how many times each theme name occurs across all theme entries
# in df2['mjtheme_namecode'] (theme name -> occurrence count)

theme_freqs = {}

for project_themes in df2['mjtheme_namecode']:
    for entry in project_themes:
        theme_name = entry['name']
        # get(..., 0) starts a name at zero the first time it is seen
        theme_freqs[theme_name] = theme_freqs.get(theme_name, 0) + 1

In [100]:
# Turn the theme_freqs dictionary into a Series (theme name -> count),
# order it from the largest count to the smallest, and show the first
# ten rows, i.e. the ten most frequent themes

theme_counts = pd.Series(theme_freqs)
theme_counts_desc = theme_counts.sort_values(ascending=False)
theme_counts_desc.head(10)

Environment and natural resources management    250
Rural development                               216
Human development                               210
Public sector governance                        199
Social protection and risk management           168
Financial and private sector development        146
Social dev/gender/inclusion                     130
Trade and integration                            77
Urban development                                50
Economic management                              38
dtype: int64