# Normalizing JSON files in pandas

In [1]:
# import libraries
import pandas as pd
import json

## 1. A simple json object / a dictionary

### 1.1 When JSON is a simple dict

In [2]:
# A flat record: every value is a scalar (two strings and an integer).
a_dict = dict(
    school="ABC primary school",
    location="London",
    ranking=2,
)

In [3]:
# Flatten the dict into a DataFrame and show it as the cell's output.
df = pd.json_normalize(data=a_dict)
df

Unnamed: 0,school,location,ranking
0,ABC primary school,London,2


In [4]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   school    1 non-null      object
 1   location  1 non-null      object
 2   ranking   1 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 152.0+ bytes


### 1.2 A simple list of dictionaries

In [5]:
# Two flat records that share the same three keys.
json_list = [
    {
        "class": "Year 1",
        "student number": 20,
        "room": "Yellow",
    },
    {
        "class": "Year 2",
        "student number": 25,
        "room": "Blue",
    },
]

In [6]:
# Normalize the list of flat records into a DataFrame.
pd.json_normalize(data=json_list)

Unnamed: 0,class,student number,room
0,Year 1,20,Yellow
1,Year 2,25,Blue


Sometimes a key is not present in every record. In that case the missing values are shown as NaN.

In [7]:
# Same layout as before, except the second record has no "num_of_students" key.
json_list2 = [
    {
        "class": "Year 1",
        "num_of_students": 20,
        "room": "Yellow",
    },
    {
        "class": "Year 2",
        "room": "Blue",
    },
]

In [8]:
# Normalize records whose keys are not all present in every record.
pd.json_normalize(data=json_list2)

Unnamed: 0,class,num_of_students,room
0,Year 1,20.0,Yellow
1,Year 2,,Blue


## 2. Multi leveled json

### 2.1 When the data is a multi leveled dictionary

The value of info spans multiple levels (this is known as a nested dict).

In [9]:
# "info" is a nested dict: info -> contacts -> email holds the deepest values.
multi_lev = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
    "info": {
        "president": "John Cena",
        "contacts": {
            "email": {"admission": "admission@abc.com", "general": "info@abc.com"},
            "tel": "123456789",
        },
    },
}

In [10]:
# Normalize the nested dict with the default (unlimited) depth.
pd.json_normalize(data=multi_lev)

Unnamed: 0,school,location,ranking,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,John Cena,admission@abc.com,info@abc.com,123456789


If you don't want to dig all the way down to each value, use the max_level argument. With max_level=1, the nested value contacts is kept in a single column, info.contacts.

In [11]:
# Limit normalization to one level of nesting.
pd.json_normalize(
    data=multi_lev,
    max_level=1,
)

Unnamed: 0,school,location,ranking,info.president,info.contacts
0,ABC primary school,London,2,John Cena,"{'email': {'admission': 'admission@abc.com', '..."


### 2.2 When the data is a multi leveled list of dictionaries

In [12]:
# Two class records; each carries a nested info -> teachers dict.
multi_lev_list = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {"math": "Mr. Mutindwa", "physics": "Mr.King'ang'i"},
        },
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {"math": "Mrs. Rotich", "physics": "Mr. Rugano"},
        },
    },
]

In [13]:
# Normalize every record in the list with the default depth.
pd.json_normalize(data=multi_lev_list)

Unnamed: 0,class,student count,room,info.teachers.math,info.teachers.physics
0,Year 1,20,Yellow,Mr. Mutindwa,Mr.King'ang'i
1,Year 2,25,Blue,Mrs. Rotich,Mr. Rugano


In [14]:
# Same list, but limit normalization to one level of nesting.
pd.json_normalize(
    data=multi_lev_list,
    max_level=1,
)

Unnamed: 0,class,student count,room,info.teachers
0,Year 1,20,Yellow,"{'math': 'Mr. Mutindwa', 'physics': 'Mr.King'a..."
1,Year 2,25,Blue,"{'math': 'Mrs. Rotich', 'physics': 'Mr. Rugano'}"


## 3. JSON with a Nested List

### 3.1 When the data is a dict

In [15]:
# A nested dict that also carries a list of records under "students".
nested_dict = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
    "info": {
        "president": "John Cena",
        "contacts": {
            "email": {"admission": "admission@abc.com", "general": "info@qbc.com"},
            "tel": "123456789",
        },
    },
    "students": [
        {"name": "Tom"},
        {"name": "James"},
        {"name": "Jacqueline"},
    ],
}

In [16]:
# Normalize without record_path.
pd.json_normalize(data=nested_dict)

Unnamed: 0,school,location,ranking,students,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,"[{'name': 'Tom'}, {'name': 'James'}, {'name': ...",John Cena,admission@abc.com,info@qbc.com,123456789


Note that the value of students is a list. By default the nested list is kept in a single column, students, while the other values are flattened. How can we flatten the nested list? To do that, we can set the argument record_path to ['students']:

In [17]:
# Use the "students" list as the record path.
pd.json_normalize(
    nested_dict,
    record_path=["students"],
)

Unnamed: 0,name
0,Tom
1,James
2,Jacqueline


The result looks great but doesn’t include school and tel. To include them, we can use the argument meta to specify a list of metadata we want in the result.

In [18]:
# Flatten the students and pass the school name and the nested phone number
# (info -> contacts -> tel) as meta fields.
pd.json_normalize(
    nested_dict,
    record_path=["students"],
    meta=["school", ["info", "contacts", "tel"]],
)

Unnamed: 0,name,school,info.contacts.tel
0,Tom,ABC primary school,123456789
1,James,ABC primary school,123456789
2,Jacqueline,ABC primary school,123456789


In [19]:
# The meta path info -> contacts -> email points at a dict rather than a scalar.
pd.json_normalize(
    nested_dict,
    record_path=["students"],
    meta=["school", "location", ["info", "contacts", "email"]],
)

Unnamed: 0,name,school,location,info.contacts.email
0,Tom,ABC primary school,London,"{'admission': 'admission@abc.com', 'general': ..."
1,James,ABC primary school,London,"{'admission': 'admission@abc.com', 'general': ..."
2,Jacqueline,ABC primary school,London,"{'admission': 'admission@abc.com', 'general': ..."


### 3.2 When data is a list of dicts

In [20]:
# Two class records, each with nested teacher info and a list of students.
# Only the Year 1 students carry a "grades" dict.
list_of_dicts = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {"math": "Mr. Mutindwa", "physics": "Mr. King'ang'i"},
        },
        "students": [
            {"name": "Tom", "sex": "M", "grades": {"math": 66, "physics": 77}},
            {"name": "James", "sex": "M", "grades": {"math": 80, "physics": 78}},
        ],
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {"math": "Mrs. King'ori", "physics": "Mr.Karanja"},
        },
        "students": [
            {"name": "Tony", "sex": "M"},
            {"name": "Jacqueline", "sex": "F"},
        ],
    },
]

In [21]:
# Normalize without record_path.
pd.json_normalize(data=list_of_dicts)

Unnamed: 0,class,student count,room,students,info.teachers.math,info.teachers.physics
0,Year 1,20,Yellow,"[{'name': 'Tom', 'sex': 'M', 'grades': {'math'...",Mr. Mutindwa,Mr. King'ang'i
1,Year 2,25,Blue,"[{'name': 'Tony', 'sex': 'M'}, {'name': 'Jacqu...",Mrs. King'ori,Mr.Karanja


All nested lists are kept in a single column, students, while the other values are flattened. To flatten the nested list, we can set the argument record_path to 'students' (or, equivalently, ['students']). Notice that not all students have math and physics grades; those missing values are shown as NaN.

In [22]:
# Use each record's "students" list as the record path.
pd.json_normalize(
    list_of_dicts,
    record_path=["students"],
)

Unnamed: 0,name,sex,grades.math,grades.physics
0,Tom,M,66.0,77.0
1,James,M,80.0,78.0
2,Tony,M,,
3,Jacqueline,F,,


In [23]:
# Flatten the students and pass class, room and the nested math teacher
# (info -> teachers -> math) as meta fields.
pd.json_normalize(
    list_of_dicts,
    record_path=["students"],
    meta=["class", "room", ["info", "teachers", "math"]],
)

Unnamed: 0,name,sex,grades.math,grades.physics,class,room,info.teachers.math
0,Tom,M,66.0,77.0,Year 1,Yellow,Mr. Mutindwa
1,James,M,80.0,78.0,Year 1,Yellow,Mr. Mutindwa
2,Tony,M,,,Year 2,Blue,Mrs. King'ori
3,Jacqueline,F,,,Year 2,Blue,Mrs. King'ori


In [24]:
# Flatten the students and pass class, room and the nested physics teacher
# (info -> teachers -> physics) as meta fields.
pd.json_normalize(
    list_of_dicts,
    record_path=["students"],
    meta=["class", "room", ["info", "teachers", "physics"]],
)

Unnamed: 0,name,sex,grades.math,grades.physics,class,room,info.teachers.physics
0,Tom,M,66.0,77.0,Year 1,Yellow,Mr. King'ang'i
1,James,M,80.0,78.0,Year 1,Yellow,Mr. King'ang'i
2,Tony,M,,,Year 2,Blue,Mr.Karanja
3,Jacqueline,F,,,Year 2,Blue,Mr.Karanja


## 4. The Errors argument

The errors argument defaults to 'raise' and will raise a KeyError if keys listed in meta are not always present. For example, in the data below the math teacher is missing from the second record.

In [25]:
# NOTE: this rebinds list_of_dicts, which an earlier cell already defined.
# The data is the same except that the Year 2 record has no math teacher.
list_of_dicts = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {"math": "Mr. Mutindwa", "physics": "Mr. King'ang'i"},
        },
        "students": [
            {"name": "Tom", "sex": "M", "grades": {"math": 66, "physics": 77}},
            {"name": "James", "sex": "M", "grades": {"math": 80, "physics": 78}},
        ],
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {
                # no math teacher
                "physics": "Mr.Karanja",
            },
        },
        "students": [
            {"name": "Tony", "sex": "M"},
            {"name": "Jacqueline", "sex": "F"},
        ],
    },
]

In [26]:
# With the default errors='raise', json_normalize raises a KeyError here
# because the second record has no info -> teachers -> math key.
# The exception is the point of this cell, so catch it and print its message
# instead of letting it abort a "Restart Kernel -> Run All" of the notebook.
try:
    pd.json_normalize(
        list_of_dicts,
        record_path=["students"],
        meta=["class", "room", ["info", "teachers", "math"]],
    )
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "Try running with errors='ignore' as key 'math' is not always present"

To work around it, set the argument errors to 'ignore' and those missing values are filled with NaN.

In [27]:
# Same call as before, but errors="ignore" tolerates the missing
# info -> teachers -> math key instead of raising.
pd.json_normalize(
    list_of_dicts,
    record_path=["students"],
    meta=["class", "room", ["info", "teachers", "math"]],
    errors="ignore",
)

Unnamed: 0,name,sex,grades.math,grades.physics,class,room,info.teachers.math
0,Tom,M,66.0,77.0,Year 1,Yellow,Mr. Mutindwa
1,James,M,80.0,78.0,Year 1,Yellow,Mr. Mutindwa
2,Tony,M,,,Year 2,Blue,
3,Jacqueline,F,,,Year 2,Blue,


## 5. Working with a local file

In [28]:
# Read a JSON document from disk and flatten it into a DataFrame.
# json.load parses directly from the file handle (no intermediate string),
# and the explicit encoding keeps the read independent of the platform's
# default locale encoding.
with open('1_asimple_dict.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

pd.json_normalize(data)

Unnamed: 0,school,location,ranking
0,ABC primary school,London,2
