# All Pandas json_normalize() you should know for flattening JSON

Source code for Medium's article [All Pandas json_normalize() you should know for flattening JSON](https://bit.ly/3xxkw07), written by [B. Chen](https://bindichen.medium.com/).

## Importing libraries

In [37]:
import json
import requests

import pandas as pd

## Flattening a simple JSON

### When the JSON is a simple dict

In [2]:
# A flat record: every key maps directly to a scalar value.
a_dict = dict(school="ABC primary school", location="london", ranking=2)

# Normalizing a single dict produces a one-row DataFrame.
df = pd.json_normalize(data=a_dict)

In [3]:
# The single flat dict becomes one row, with one column per key.
df

Unnamed: 0,school,location,ranking
0,ABC primary school,london,2


In [4]:
# Summarize the column dtypes and non-null counts of the flattened frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   school    1 non-null      object
 1   location  1 non-null      object
 2   ranking   1 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 152.0+ bytes


### When the data is a list of dicts

In [5]:
# Two flat records that share the same keys.
json_list = [
    {"class": "Year 1", "student number": 20, "room": "Yellow"},
    {"class": "Year 2", "student number": 25, "room": "Blue"},
]

# Each dict in the list is normalized into its own row.
df = pd.json_normalize(data=json_list)

In [6]:
# Each dict in the list becomes one row.
df

Unnamed: 0,class,student number,room
0,Year 1,20,Yellow
1,Year 2,25,Blue


In [7]:
# Same records as before, except the second one has no "student number" key.
json_list = [
    {"class": "Year 1", "student number": 20, "room": "Yellow"},
    {"class": "Year 2", "room": "Blue"},
]

df = pd.json_normalize(data=json_list)

In [8]:
# 'student number' is absent from the second record, so that cell holds a missing value (NaN).
df

Unnamed: 0,class,student number,room
0,Year 1,20.0,Yellow
1,Year 2,,Blue


## Flattening a JSON with multiple levels 

### When the data is a dict

In [9]:
# A record whose "info" value nests dicts up to three levels deep.
json_obj = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
    "info": {
        "president": "John Kasich",
        "contacts": {
            "email": {"admission": "admission@abc.com", "general": "info@abc.com"},
            "tel": "123456789",
        },
    },
}

# With no max_level given, every nesting level is flattened.
df = pd.json_normalize(data=json_obj)

In [10]:
# Every nesting level is flattened; column names join the key path with '.'.
df

Unnamed: 0,school,location,ranking,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,John Kasich,admission@abc.com,info@abc.com,123456789


In [11]:
# Limit flattening to the first nesting level; deeper dicts stay as cell values.
df = pd.json_normalize(
    data=json_obj,
    max_level=1,
)

In [12]:
# With max_level=1 only the first level is expanded; 'info.contacts' keeps its nested dict.
df

Unnamed: 0,school,location,ranking,info.president,info.contacts
0,ABC primary school,London,2,John Kasich,"{'email': {'admission': 'admission@abc.com', '..."


### When the data is a list of dicts

In [13]:
# Two class records, each nesting its teachers two levels deep under "info".
json_list = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {"math": "Rick Scott", "physics": "Elon Mask"},
        },
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {"math": "Alan Turing", "physics": "Albert Einstein"},
        },
    },
]

# With no max_level given, every nesting level is flattened.
df = pd.json_normalize(data=json_list)

In [14]:
# The nested teacher dicts are flattened into 'info.teachers.*' columns, one row per class.
df

Unnamed: 0,class,student count,room,info.teachers.math,info.teachers.physics
0,Year 1,20,Yellow,Rick Scott,Elon Mask
1,Year 2,25,Blue,Alan Turing,Albert Einstein


In [15]:
# Limit flattening to the first nesting level; deeper dicts stay as cell values.
df = pd.json_normalize(
    data=json_list,
    max_level=1,
)

In [16]:
# With max_level=1, 'info.teachers' remains a dict-valued column.
df

Unnamed: 0,class,student count,room,info.teachers
0,Year 1,20,Yellow,"{'math': 'Rick Scott', 'physics': 'Elon Mask'}"
1,Year 2,25,Blue,"{'math': 'Alan Turing', 'physics': 'Albert Ein..."


## Flattening JSON with a nested list

### When the data is a dict

In [17]:
# The nested school record again, now with a "students" list of dicts.
json_obj = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
    "info": {
        "president": "John Kasich",
        "contacts": {
            "email": {"admission": "admission@abc.com", "general": "info@abc.com"},
            "tel": "123456789",
        },
    },
    "students": [
        {"name": "Tom"},
        {"name": "James"},
        {"name": "Jacqueline"},
    ],
}

df = pd.json_normalize(data=json_obj)

In [18]:
# Nested dicts are flattened, but the 'students' list is kept as a single list-valued column.
df

Unnamed: 0,school,location,ranking,students,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,"[{'name': 'Tom'}, {'name': 'James'}, {'name': ...",John Kasich,admission@abc.com,info@abc.com,123456789


In [19]:
# Build the rows from the entries of the nested 'students' list.
df = pd.json_normalize(
    data=json_obj,
    record_path=['students'],
)

In [20]:
# record_path=['students'] yields one row per entry of the 'students' list.
df

Unnamed: 0,name
0,Tom
1,James
2,Jacqueline


In [21]:
# Expand the students and attach the school name and the nested telephone
# number to each resulting row.
df = pd.json_normalize(
    data=json_obj,
    record_path=['students'],
    meta=[
        'school',
        ['info', 'contacts', 'tel'],
    ],
)

In [22]:
# The meta fields ('school', 'info.contacts.tel') are repeated on every student row.
df

Unnamed: 0,name,school,info.contacts.tel
0,Tom,ABC primary school,123456789
1,James,ABC primary school,123456789
2,Jacqueline,ABC primary school,123456789


### When the data is a list of dicts

In [23]:
# Two class records with nested teachers and a "students" list.
# Only the Year 1 students carry a "grades" dict.
json_list = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {"math": "Rick Scott", "physics": "Elon Mask"},
        },
        "students": [
            {"name": "Tom", "sex": "M", "grades": {"math": 66, "physics": 77}},
            {"name": "James", "sex": "M", "grades": {"math": 80, "physics": 78}},
        ],
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {"math": "Alan Turing", "physics": "Albert Einstein"},
        },
        "students": [
            {"name": "Tony", "sex": "M"},
            {"name": "Jacqueline", "sex": "F"},
        ],
    },
]

df = pd.json_normalize(data=json_list)

In [24]:
# The 'students' lists are left unexpanded: one list-valued cell per class row.
df

Unnamed: 0,class,student count,room,students,info.teachers.math,info.teachers.physics
0,Year 1,20,Yellow,"[{'name': 'Tom', 'sex': 'M', 'grades': {'math'...",Rick Scott,Elon Mask
1,Year 2,25,Blue,"[{'name': 'Tony', 'sex': 'M'}, {'name': 'Jacqu...",Alan Turing,Albert Einstein


In [25]:
# Build the rows from the entries of each record's 'students' list.
df = pd.json_normalize(
    data=json_list,
    record_path=['students'],
)

In [26]:
# One row per student across both classes; students without 'grades' get NaN in the grade columns.
df

Unnamed: 0,name,sex,grades.math,grades.physics
0,Tom,M,66.0,77.0
1,James,M,80.0,78.0
2,Tony,M,,
3,Jacqueline,F,,


In [27]:
# Expand the students and attach each class's name, room and math teacher
# to the corresponding rows.
df = pd.json_normalize(
    data=json_list,
    record_path=['students'],
    meta=[
        'class',
        'room',
        ['info', 'teachers', 'math'],
    ],
)

In [28]:
# Each student row now carries its class, room and math teacher.
df

Unnamed: 0,name,sex,grades.math,grades.physics,class,room,info.teachers.math
0,Tom,M,66.0,77.0,Year 1,Yellow,Rick Scott
1,James,M,80.0,78.0,Year 1,Yellow,Rick Scott
2,Tony,M,,,Year 2,Blue,Alan Turing
3,Jacqueline,F,,,Year 2,Blue,Alan Turing


## The errors argument

In [29]:
# Same shape as the previous class records, except that the second class has
# no 'math' entry under info.teachers.
json_list = [
    {
        'class': 'Year 1',
        'student count': 20,
        'room': 'Yellow',
        'info': {
            'teachers': {
                'math': 'Rick Scott',
                'physics': 'Elon Mask'
            }
        },
        'students': [
            {
                'name': 'Tom',
                'sex': 'M'
            },
            {
                'name': 'James',
                'sex': 'M'
            }
        ]
    },
    {
        'class': 'Year 2',
        'student count': 25,
        'room': 'Blue',
        'info': {
            'teachers': {
                # No math teacher
                'physics': 'Albert Einstein'
            }
        },
        'students': [
            {
                'name': 'Tony',
                'sex': 'M',
            },
            {
                'name': 'Jacqueline',
                'sex': 'F'
            }
        ]
    }
]

# With the default errors='raise', the meta path ['info', 'teachers', 'math']
# raises KeyError for the second record. The error is the point of this cell,
# so catch and display it rather than letting it abort "Restart & Run All".
try:
    df = pd.json_normalize(
        json_list,
        record_path=['students'],
        meta=['class', 'room', ['info', 'teachers', 'math']]
    )
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: "Try running with errors='ignore' as key 'math' is not always present"

In [31]:
# Same call as before, but errors='ignore' tolerates records where a meta key
# is missing instead of raising KeyError.
df = pd.json_normalize(
    data=json_list,
    record_path=['students'],
    meta=[
        'class',
        'room',
        ['info', 'teachers', 'math'],
    ],
    errors='ignore',
)

In [32]:
# With errors='ignore' the missing math teacher is left empty (NaN) for the Year 2 rows.
df

Unnamed: 0,name,sex,class,room,info.teachers.math
0,Tom,M,Year 1,Yellow,Rick Scott
1,James,M,Year 1,Yellow,Rick Scott
2,Tony,M,Year 2,Blue,
3,Jacqueline,F,Year 2,Blue,


## Custom Separator using the sep argument

In [33]:
# Two class records with nested teachers and a "students" list.
# Only the Year 1 students carry a "grades" dict.
json_list = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {"math": "Rick Scott", "physics": "Elon Mask"},
        },
        "students": [
            {"name": "Tom", "sex": "M", "grades": {"math": 66, "physics": 77}},
            {"name": "James", "sex": "M", "grades": {"math": 80, "physics": 78}},
        ],
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {"math": "Alan Turing", "physics": "Albert Einstein"},
        },
        "students": [
            {"name": "Tony", "sex": "M"},
            {"name": "Jacqueline", "sex": "F"},
        ],
    },
]

# sep replaces the default '.' used to join nested key paths in column names.
df = pd.json_normalize(
    data=json_list,
    record_path=["students"],
    meta=[
        "class",
        "room",
        ["info", "teachers", "math"],
    ],
    errors="ignore",
    sep="->",
)

In [34]:
# Nested key paths in the column names are now joined with '->' instead of '.'.
df

Unnamed: 0,name,sex,grades->math,grades->physics,class,room,info->teachers->math
0,Tom,M,66.0,77.0,Year 1,Yellow,Rick Scott
1,James,M,80.0,78.0,Year 1,Yellow,Rick Scott
2,Tony,M,,,Year 2,Blue,Alan Turing
3,Jacqueline,F,,,Year 2,Blue,Alan Turing


## Adding prefix for meta and record data

In [35]:
# Prefix the columns so record fields and meta fields can be told apart.
df = pd.json_normalize(
    data=json_list,
    record_path=['students'],
    meta=['class'],
    record_prefix='student-',
    meta_prefix='meta-',
    errors='ignore',
)

In [36]:
# Record columns carry the 'student-' prefix and the meta column the 'meta-' prefix.
df

Unnamed: 0,student-name,student-sex,student-grades.math,student-grades.physics,meta-class
0,Tom,M,66.0,77.0,Year 1
1,James,M,80.0,78.0,Year 1
2,Tony,M,,,Year 2
3,Jacqueline,F,,,Year 2


## Working with a local file

In [None]:
# This cell needs the file data/simple.json, relative to the working directory.

# Parse the JSON straight from the file handle with json.load(). The explicit
# encoding keeps the read independent of the platform's default encoding.
with open('data/simple.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the parsed JSON into a DataFrame.
df = pd.json_normalize(data)

## Working with a URL

In [38]:
# Fetch over HTTPS so the payload cannot be tampered with in transit.
URL = 'https://raw.githubusercontent.com/BindiChen/machine-learning/master/data-analysis/027-pandas-convert-json/data/simple.json'

# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error status instead of trying to parse an error page.
response = requests.get(URL, timeout=10)
response.raise_for_status()
data = response.json()

# Flatten the parsed JSON into a DataFrame.
df = pd.json_normalize(data)

In [39]:
# One row per record in the downloaded JSON.
df

Unnamed: 0,id,name,math,physics,chemistry
0,A001,Tom,60,66,61
1,A002,James,89,76,51
2,A003,Jenny,79,90,78
