# Import json, pandas

In [59]:
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize


# load data from file and convert to pandas dataframe

In [120]:
# load the json data into python variable

with open('data/world_bank_projects.json') as f:
    data = json.load(f)
    
# normalize data to get theme details from json nested key
df = json_normalize(data, 'mjtheme_namecode', ['countryshortname', 'project_name'])

# fill blank theme names with NA
df['name'] = df['name'].replace(r'^\s*$', np.nan, regex=True)

df
  

Unnamed: 0,code,name,countryshortname,project_name
0,8,Human development,Ethiopia,Ethiopia General Education Quality Improvement...
1,11,,Ethiopia,Ethiopia General Education Quality Improvement...
2,1,Economic management,Tunisia,TN: DTF Social Protection Reforms Support
3,6,Social protection and risk management,Tunisia,TN: DTF Social Protection Reforms Support
4,5,Trade and integration,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
5,2,Public sector governance,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
6,11,Environment and natural resources management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
7,6,Social protection and risk management,Tuvalu,Tuvalu Aviation Investment Project - Additiona...
8,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership
9,7,Social dev/gender/inclusion,"Yemen, Republic of",Gov't and Civil Society Organization Partnership


# get countries with major project 

In [121]:

df.countryshortname.value_counts()[:10]




Indonesia             56
India                 51
Vietnam               43
Brazil                41
Bangladesh            41
China                 40
Africa                39
Yemen, Republic of    34
Morocco               32
Mozambique            31
Name: countryshortname, dtype: int64

# Major project themes

In [122]:
def get_mapping():
    theme_lookup = {}
    theme_name = df[df.name != ''].drop_duplicates()
    for item in theme_name.to_dict('records'):
        theme_lookup.update({item['code']: item['name']})
    return theme_lookup
  
# store the dict to lookup theme name for code
mapping = get_mapping()

# update dataframe with theme_name in a new column
df['name_clean'] = df.code.apply(lambda row: mapping.get(row))

df.name_clean.value_counts()[:10]


Environment and natural resources management    250
Rural development                               216
Human development                               210
Public sector governance                        199
Financial and private sector development        146
Social dev/gender/inclusion                     130
Trade and integration                            77
Urban development                                50
Rule of law                                      15
Name: name_clean, dtype: int64