In [50]:
import numpy as np
import pandas as pd
import json
from pandas import json_normalize
from IPython.display import display
# Raise pandas' display limits so long/wide frames are shown rather than truncated.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Load the exported records; the two listed columns are requested to be parsed as dates.
df = pd.read_json(
    "results-20211210-150638.json",
    orient="records",
    convert_dates=["event_timestamp", "event_date"],
)

In [51]:
def _columns_of_type(frame, kind):
    """Return the names of columns in ``frame`` that hold values of exactly ``kind``.

    A column qualifies when at least one of its values has type ``kind``
    (exact type match, subclasses do not count) and every remaining value is
    missing (``None``/NaN). Requiring one real match keeps empty frames and
    all-missing columns from being reported as nested.
    """
    matches = []
    for position, name in enumerate(frame.columns):
        values = frame.iloc[:, position]
        is_kind = np.array([type(value) is kind for value in values], dtype=bool)
        missing = values.isna().to_numpy()
        if is_kind.any() and (is_kind | missing).all():
            matches.append(name)
    return matches


def flatten_nested_json_df(df):
    """Flatten a DataFrame whose cells contain nested lists and/or dicts.

    Dict columns are expanded horizontally into ``<col>.<key>`` columns and
    list columns are exploded vertically (one row per list item). Newly
    created columns are re-checked, so nesting of any depth is unrolled.
    Missing cells (``None``/NaN, e.g. produced by exploding an empty list) do
    not stop a column from being flattened; they yield NaN in the new columns.

    Progress information is printed while the function runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to flatten. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. The input index is kept as a regular column
        (added by ``reset_index``); rows produced from the same input row
        share the same index label.
    """
    df = df.reset_index()

    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")

    # search for columns to explode/flatten
    list_columns = _columns_of_type(df, list)
    dict_columns = _columns_of_type(df, dict)

    print(f"lists: {list_columns}, dicts: {dict_columns}")
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

        for col in dict_columns:
            print(f"flattening: {col}")
            # explode dictionaries horizontally, adding new columns;
            # missing cells become empty records so they produce NaN rows
            records = [value if type(value) is dict else {} for value in df[col]]
            horiz_exploded = pd.json_normalize(records).add_prefix(f'{col}.')
            # concatenate positionally: index labels may repeat after an
            # earlier explode, so they are set aside and restored afterwards
            row_labels = df.index
            df = pd.concat(
                [df.drop(columns=[col]).reset_index(drop=True), horiz_exploded],
                axis=1,
            )
            df.index = row_labels
            new_columns.extend(horiz_exploded.columns)

        for col in list_columns:
            print(f"exploding: {col}")
            # explode lists vertically; DataFrame.explode copes with repeated
            # index labels, whereas an index join would multiply rows
            exploded = df.explode(col)
            # keep the exploded column last, next to other new columns
            df = exploded[[c for c in exploded.columns if c != col] + [col]]
            new_columns.append(col)

        # check if there are still dict or list fields to flatten
        list_columns = _columns_of_type(df[new_columns], list)
        dict_columns = _columns_of_type(df[new_columns], dict)

        print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df

In [52]:
# Empty placeholder frame; it is not referenced again in the cells shown here.
dff = pd.DataFrame()

# Flatten the nested columns of the loaded frame into plain scalar columns.
d = flatten_nested_json_df(df)

original shape: (3819, 8)
original columns: Index(['index', 'user_id', 'event_date', 'event_name', 'event_timestamp', 'event_params', 'operating_system', 'country'], dtype='object')
lists: ['event_params'], dicts: []
exploding: event_params
lists: [], dicts: ['event_params']
flattening: event_params
lists: [], dicts: []
final shape: (36301, 12)
final columns: Index(['index', 'user_id', 'event_date', 'event_name', 'event_timestamp', 'operating_system', 'country', 'event_params.key', 'event_params.value.string_value', 'event_params.value.int_value', 'event_params.value.float_value', 'event_params.value.double_value'], dtype='object')


In [53]:
# Preview the first rows of the flattened frame.
display(d.head())

Unnamed: 0,index,user_id,event_date,event_name,event_timestamp,operating_system,country,event_params.key,event_params.value.string_value,event_params.value.int_value,event_params.value.float_value,event_params.value.double_value
0,0,Rosie,20211126,notification_receive,2021-11-26 04:11:40.117,Android,Vietnam,firebase_event_origin,fcm,,,
0,0,Rosie,20211126,notification_receive,2021-11-26 04:11:40.117,Android,Vietnam,message_type,display,,,
1,1,Rosie,20211126,notification_receive,2021-11-26 07:37:29.662,Android,Vietnam,firebase_event_origin,fcm,,,
1,1,Rosie,20211126,notification_receive,2021-11-26 07:37:29.662,Android,Vietnam,message_type,display,,,
2,2,lucluck,20211126,notification_receive,2021-11-26 02:30:19.792,Android,Vietnam,message_type,display,,,


In [54]:
# List the distinct event names present in the flattened frame.
d["event_name"].unique()

array(['notification_receive', 'screen_view', 'session_start',
       'Android_system', 'app_exception', 'error', 'user_engagement',
       'first_open', 'notification_foreground', 'app_remove',
       'entrance_login', 'signup_phone_number', 'app_clear_data',
       'notification_dismiss', 'create_parti_home', 'create_parti_start',
       'create_parti_setting', 'create_parti_done', 'parti_mute_talk',
       'parti_mute_music', 'notification_open', 'entrance_signup',
       'parti_join', 'parti_leave', 'parti_unmute_talk'], dtype=object)