# JSON examples and exercise
****
+ get familiar with packages for dealing with JSON
+ study examples with JSON strings and files 
+ work on exercise to be completed and submitted 
****
+ reference: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
+ data source: http://jsonstudio.com/resources/
****

In [1]:
import pandas as pd

## imports for Python, Pandas

In [2]:
import json

# json_normalize has lived at the top level of pandas since 1.0; the old
# `pandas.io.json` location was deprecated then and no longer exports it in
# current releases. Fall back to the old path only for pre-1.0 installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

## JSON example, with string

+ demonstrates creation of normalized dataframes (tables) from nested JSON-style data (a Python list of dicts)
+ source: http://pandas.pydata.org/pandas-docs/stable/io.html#normalization

In [3]:
# nested JSON-style records (a list of dicts, not a string):
# one dict per state, each holding a nested 'info' dict and a list of counties
data = [
    {
        'state': 'Florida',
        'shortname': 'FL',
        'info': {'governor': 'Rick Scott'},
        'counties': [
            {'name': 'Dade', 'population': 12345},
            {'name': 'Broward', 'population': 40000},
            {'name': 'Palm Beach', 'population': 60000},
        ],
    },
    {
        'state': 'Ohio',
        'shortname': 'OH',
        'info': {'governor': 'John Kasich'},
        'counties': [
            {'name': 'Summit', 'population': 1234},
            {'name': 'Cuyahoga', 'population': 1337},
        ],
    },
]

In [4]:
# flatten the nested 'counties' records into a table, one row per county
json_normalize(data, record_path='counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [5]:
# same county rows, with the parent state's fields carried along as extra columns
json_normalize(
    data,
    record_path='counties',
    meta=['state', 'shortname', ['info', 'governor']],
)

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


****
## JSON example, with file

+ demonstrates reading in a json file as a string and as a table
+ uses small sample file containing data about projects funded by the World Bank 
+ data source: http://jsonstudio.com/resources/

In [6]:
# load the json file with the standard json module (parsed into Python objects, not a string)
# NOTE(review): left commented out, presumably to keep the parsed document out of the cell output;
# if re-enabled, open the file in a `with` block so the handle gets closed
#json.load((open('data/world_bank_projects_less.json')))

In [7]:
# parse the small sample file straight into a DataFrame and display it
sample_json_df = pd.read_json(path_or_buf='data/world_bank_projects_less.json')
sample_json_df

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{'$oid': '52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{'Name': 'Education for all', 'Percent': 100}","[{'name': 'Education for all', 'code': '65'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{'$oid': '52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{'Name': 'Other economic management', 'Percent...","[{'name': 'Other economic management', 'code':...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en


In [8]:
# (rows, columns) of the sample frame
sample_json_df.shape

(2, 50)

****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with most projects
2. Find the top 10 major project themes (using column 'mjtheme_namecode')
3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

### Load data and check missing values, duplicates

In [9]:
# read the full World Bank projects file into a DataFrame and report (rows, columns)
df = pd.read_json(path_or_buf='data/world_bank_projects.json')
df.shape

(500, 50)

In [10]:
# missing values per column, the 20 columns with the most gaps first
(
    df.isnull()
      .sum()
      .sort_values(ascending=False)
      .head(20)
)

sector4                     326
sector3                     235
project_abstract            138
closingdate                 130
sector2                     120
envassesmentcategorycode     70
projectdocs                  54
docty                        54
impagency                    28
borrower                     15
mjtheme                       9
themecode                     9
theme_namecode                9
lendinginstrtype              5
lendinginstr                  5
supplementprojectflg          2
id                            0
ibrdcommamt                   0
grantamt                      0
idacommamt                    0
dtype: int64

In [11]:
# how many distinct values countryname and project_name take (duplicates check)
print('The unique countryname is: {}.'.format(len(df['countryname'].unique())))
print('The unique project_name is: {}.'.format(len(df['project_name'].unique())))

The unique countryname is: 118.
The unique project_name is: 500.


### Q1. Find the 10 countries with most projects

In [12]:
# countryname has no missing values and project_name is unique, so each row is
# one project and the group size per countryname is that country's project count
(
    df.groupby('countryname')
      .size()
      .reset_index(name='project_name_counts')
      .sort_values(by='project_name_counts', ascending=False)
      .head(10)
)

Unnamed: 0,countryname,project_name_counts
39,People's Republic of China,19
64,Republic of Indonesia,19
107,Socialist Republic of Vietnam,17
63,Republic of India,16
97,Republic of Yemen,13
38,People's Republic of Bangladesh,12
34,Nepal,12
25,Kingdom of Morocco,12
76,Republic of Mozambique,11
0,Africa,11


### Q2. Find the top 10 major project themes (using column 'mjtheme_namecode')

In [13]:
# flatten the per-project theme lists into one list of {'code', 'name'} dicts.
# Iterating the column directly avoids hard-coding the row count (500) and the
# quadratic cost of rebuilding the list with `+` on every iteration.
mjtheme_namecode_list = [
    theme
    for project_themes in df.mjtheme_namecode
    for theme in project_themes
]
print(len(mjtheme_namecode_list))
print(mjtheme_namecode_list[:5])

1499
[{'code': '8', 'name': 'Human development'}, {'code': '11', 'name': ''}, {'code': '1', 'name': 'Economic management'}, {'code': '6', 'name': 'Social protection and risk management'}, {'code': '5', 'name': 'Trade and integration'}]


In [14]:
# build a frame from the flattened theme dicts (one row per entry) and preview it
df_mjtheme = pd.DataFrame(data=mjtheme_namecode_list)
df_mjtheme.head(5)

Unnamed: 0,code,name
0,8,Human development
1,11,
2,1,Economic management
3,6,Social protection and risk management
4,5,Trade and integration


In [15]:
# ten most frequent theme names; entries with a blank (missing) name show up as group #7
df_mjtheme.groupby('name').size().sort_values(ascending=False).head(10)

name
Environment and natural resources management    223
Rural development                               202
Human development                               197
Public sector governance                        184
Social protection and risk management           158
Financial and private sector development        130
                                                122
Social dev/gender/inclusion                     119
Trade and integration                            72
Urban development                                47
dtype: int64

### Q3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [16]:
# turn blank strings into NaN so that pandas' missing-value methods (dropna,
# isnull) recognise them; the result is rebound instead of using inplace=True,
# which keeps the step an ordinary expression that can be chained or inspected
import numpy as np
df_mjtheme = df_mjtheme.replace('', np.nan)
df_mjtheme.head()

Unnamed: 0,code,name
0,8,Human development
1,11,
2,1,Economic management
3,6,Social protection and risk management
4,5,Trade and integration


In [17]:
# code -> name lookup table: discard rows without a name, collapse repeated
# pairs, and order by code (codes are strings, so '10' sorts before '2')
df_mjtheme_unique = (
    df_mjtheme
    .dropna()
    .drop_duplicates()
    .sort_values(by='code')
)
df_mjtheme_unique

Unnamed: 0,code,name
2,1,Economic management
18,10,Rural development
6,11,Environment and natural resources management
5,2,Public sector governance
252,3,Rule of law
11,4,Financial and private sector development
4,5,Trade and integration
3,6,Social protection and risk management
8,7,Social dev/gender/inclusion
0,8,Human development


In [18]:
# look up every row's name from the code -> name table; the original
# (possibly NaN) name is suffixed '_old' by the merge and then dropped
df_mjtheme_new = pd.merge(
    df_mjtheme,
    df_mjtheme_unique,
    on=['code'],
    suffixes=('_old', ''),
    how='left',
).drop(['name_old'], axis=1)
# a code that maps to more than one name would silently add rows here and
# shift the positional slices that are later cut from this frame
assert len(df_mjtheme_new) == len(df_mjtheme), 'code -> name lookup is not unique'
df_mjtheme_new.head()

Unnamed: 0,code,name
0,8,Human development
1,11,Environment and natural resources management
2,1,Economic management
3,6,Social protection and risk management
4,5,Trade and integration


In [19]:
# count of remaining missing values per column after the fill
pd.isnull(df_mjtheme_new).sum()

code    0
name    0
dtype: int64

In [20]:
# Q2 again on the name-filled frame: ten most frequent major project themes
df_mjtheme_new.groupby('name').size().sort_values(ascending=False).head(10)

name
Environment and natural resources management    250
Rural development                               216
Human development                               210
Public sector governance                        199
Social protection and risk management           168
Financial and private sector development        146
Social dev/gender/inclusion                     130
Trade and integration                            77
Urban development                                50
Economic management                              38
dtype: int64

In [21]:
# ls_len: number of theme entries per project.
# indx: running offsets, so indx[k]:indx[k+1] delimits project k's entries in
# the flattened theme list.
# .tolist() yields plain Python ints rather than NumPy scalars, so the lists
# print as [0, 2, 4, ...] whatever scalar repr the installed NumPy uses.
ls_len = df.mjtheme_namecode.apply(len).tolist()
indx = [0] + np.cumsum(ls_len).tolist()

In [22]:
# first ten per-project lengths, then the first ten cumulative offsets
print(ls_len[:10], indx[:10], sep='\n')

[2, 2, 4, 2, 2, 2, 2, 2, 2, 3]
[0, 2, 4, 8, 10, 12, 14, 16, 18, 20]


In [23]:
# back to a flat list of {'code', 'name'} dicts, now with every name filled in
mjtheme_namecode_list_fill = df_mjtheme_new.to_dict(orient='records')

In [24]:
# snapshot of df taken before its mjtheme_namecode column is overwritten,
# so the version with missing names stays available for comparison
df0 = df.copy(deep=True)

In [25]:
# re-group the flat, name-filled records into one list per project by cutting
# the list at consecutive cumulative offsets, then write the lists back.
# The offsets must account for every filled record, otherwise the slices
# would be cut at the wrong places.
assert indx[-1] == len(mjtheme_namecode_list_fill), 'offsets do not cover the filled records'
df['mjtheme_namecode'] = pd.Series(
    [mjtheme_namecode_list_fill[start:stop] for start, stop in zip(indx, indx[1:])],
    index=df.index,  # line the lists up with df's rows by position, not by 0..n-1 labels
)

In [26]:
# project 0: themes before (df0) and after (df) filling in the missing names
i = 0
print(df0.mjtheme_namecode[i], df.mjtheme_namecode[i], sep='\n')

[{'code': '8', 'name': 'Human development'}, {'code': '11', 'name': ''}]
[{'code': '8', 'name': 'Human development'}, {'code': '11', 'name': 'Environment and natural resources management'}]


In [27]:
# project 411: themes before (df0) and after (df) filling in the missing names
i = 411
print(df0.mjtheme_namecode[i], df.mjtheme_namecode[i], sep='\n')

[{'code': '6', 'name': 'Social protection and risk management'}, {'code': '6', 'name': ''}]
[{'code': '6', 'name': 'Social protection and risk management'}, {'code': '6', 'name': 'Social protection and risk management'}]


In [28]:
# project 451: themes before (df0) and after (df) filling in the missing names
i = 451
print(df0.mjtheme_namecode[i], df.mjtheme_namecode[i], sep='\n')

[{'code': '8', 'name': 'Human development'}, {'code': '2', 'name': ''}]
[{'code': '8', 'name': 'Human development'}, {'code': '2', 'name': 'Public sector governance'}]


In [29]:
# project 18 (no missing names): themes should read the same in df0 and df
i = 18
print(df0.mjtheme_namecode[i], df.mjtheme_namecode[i], sep='\n')

[{'code': '8', 'name': 'Human development'}, {'code': '8', 'name': 'Human development'}, {'code': '2', 'name': 'Public sector governance'}, {'code': '7', 'name': 'Social dev/gender/inclusion'}, {'code': '8', 'name': 'Human development'}]
[{'code': '8', 'name': 'Human development'}, {'code': '8', 'name': 'Human development'}, {'code': '2', 'name': 'Public sector governance'}, {'code': '7', 'name': 'Social dev/gender/inclusion'}, {'code': '8', 'name': 'Human development'}]
