# Pandas json normalize




In [1]:
import pandas as pd

# Two records, each carrying a nested "info" dict alongside flat fields.
data = [
    {
        "id": 1,
        "name": "John",
        "info": {"age": 30, "city": "New York"},
    },
    {
        "id": 2,
        "name": "Jane",
        "info": {"age": 25, "city": "London"},
    },
]

# Nested dict keys are expanded into dotted column names
# (info.age, info.city); flat keys are kept as-is.
df = pd.json_normalize(data)
print(df)

   id  name  info.age info.city
0   1  John        30  New York
1   2  Jane        25    London


In [2]:
# A single top-level object: the school name sits beside the list of students.
data = {
    "school": "ABC High",
    "students": [
        {
            "name": "Tom",
            "scores": {"math": 90, "science": 85},
        },
        {
            "name": "Sara",
            "scores": {"math": 95, "science": 92},
        },
    ],
}

# One output row per entry in "students"; the top-level "school" value is
# attached to every row through meta.
df = pd.json_normalize(
    data,
    record_path=["students"],
    meta=["school"],
)
print(df)

   name  scores.math  scores.science    school
0   Tom           90              85  ABC High
1  Sara           95              92  ABC High


The students list is buried inside the JSON structure.

record_path=['students'] digs into that list and flattens it.

meta=['school'] pulls in the school name for every student.

You might be wondering: “What if I didn’t use meta?”

Well, you’d lose the school info entirely because it’s outside the students list. That’s why meta is crucial when you want to retain context.

In [3]:
# Normalize only the student records (no meta, so no "school" column) and
# join nested keys with "_" instead of the default ".".
df = pd.json_normalize(
    data["students"],
    sep="_",
)
print(df)

   name  scores_math  scores_science
0   Tom           90              85
1  Sara           95              92


When dealing with deeply nested JSON, the key is to use:

record_path: This helps you dig into lists nested deep inside.

meta: This grabs any extra information outside of the list.

max_level: If you only want to flatten up to a certain depth, this parameter keeps things under control.

In [5]:
import pandas as pd

# Company -> departments -> employees: the records we want (employees) sit
# two list levels below the top-level object.
data = {
    "company": "TechCorp",
    "departments": [
        {
            "name": "Engineering",
            "employees": [
                {
                    "name": "Alice",
                    "skills": {"Python": 90, "SQL": 80},
                },
                {
                    "name": "Bob",
                    "skills": {"Python": 85, "SQL": 88},
                },
            ],
        },
        {
            "name": "HR",
            "employees": [
                {
                    "name": "Charlie",
                    "skills": {"Recruitment": 95, "Onboarding": 89},
                },
            ],
        },
    ],
}

# record_path walks departments -> employees, giving one row per employee.
# The meta entry ["departments", "name"] is a path to each department's name
# (column "departments_name" with sep="_"); "company" is the top-level value.
df = pd.json_normalize(
    data,
    record_path=["departments", "employees"],
    meta=[["departments", "name"], "company"],
    sep="_",
)

df

Unnamed: 0,name,skills_Python,skills_SQL,skills_Recruitment,skills_Onboarding,departments_name,company
0,Alice,90.0,80.0,,,Engineering,TechCorp
1,Bob,85.0,88.0,,,Engineering,TechCorp
2,Charlie,,,95.0,89.0,HR,TechCorp


NaN values are often not errors—they’re just missing data.  Here’s why it happens:

Some keys might be present in one record but missing in another.
This inconsistency leads to NaN (which stands for “Not a Number”).
How to Handle It?

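1. Ignore Missing Meta Keys:
Use errors='ignore' so that json_normalize does not raise a KeyError when a key listed in meta is missing from some records; the missing value becomes NaN instead. Keys that are missing from the records themselves (like 'info' in the example below) are filled with NaN automatically, with or without this option.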

In [6]:
# The second record has no "info" key at all.
data = [
    {
        "id": 1,
        "name": "John",
        "info": {"age": 30},
    },
    {
        "id": 2,
        "name": "Jane",
    },
]

# A key absent from a record shows up as NaN in the flattened frame.
# NOTE: per the pandas docs, errors="ignore" only suppresses the KeyError
# raised for keys listed in `meta`; no meta is passed in this call.
df = pd.json_normalize(
    data,
    errors="ignore",
)
print(df)

   id  name  info.age
0   1  John      30.0
1   2  Jane       NaN


2. Fill Missing Values:
After normalization, you can fill in the gaps with fillna().

In [9]:
# fillna returns a new frame, so rebind df rather than using inplace=True.
df = df.fillna(value="Not Available")
print(df)

   id  name       info.age
0   1  John           30.0
1   2  Jane  Not Available


In [11]:
import requests
import pandas as pd

# Stand-in for an API payload: already a plain Python dict, so no HTTP call
# or JSON decoding happens in this cell.
response = {
    "users": [
        {
            "id": 1,
            "name": "Alice",
            "contacts": {"email": "alice@example.com"},
        },
        {
            "id": 2,
            "name": "Bob",
            "contacts": {"email": "bob@example.com"},
        },
    ],
}

# Flatten the list under "users"; the nested contact field becomes the
# dotted column "contacts.email".
df = pd.json_normalize(response["users"])
print(df)

   id   name     contacts.email
0   1  Alice  alice@example.com
1   2    Bob    bob@example.com
