In [1]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime

In [2]:
def _create_dataframes_from_json (json_x, idx):
    '''Return a dataframe from the json file (the idx is only important for the next function)'''

    # It is important to set the working directory to the correct path

    # Import modules
    import numpy as np
    import pandas as pd
    import json

    # Load JSON and create dictionary json_data
    with open(json_x) as json_file:
        json_data = json.load(json_file)

    # Get list of dictionaries with the corresponding x,y-pairs and time
    events_lt = json_data.get('events')

    # Create dataframe from list of dictionaries
    events_df = pd.DataFrame(events_lt)

    # Get dictionary with the information of the session
    session_dc = json_data.get('session')

    # Create dataframe from dictionary (with one row) (session data)
    session_df_temp = pd.DataFrame.from_dict(session_dc, orient ='index')
    session_df_temp = session_df_temp.transpose()

    # Create dataframe with number of rows corresponding to events_df
    if events_df.x.count() > 1:        
        session_df = session_df_temp.append([session_df_temp]*(events_df.x.count()-1),ignore_index=True)

    # Get the first rows of the json file in a dictionary
    beg_dc = dict((k, json_data[k]) for k in ('startTime', 'websitePageUrl', 'visitTime',"engagementTime", "pageTitle", "url", 
                                       "viewportWidth", "viewportHeight", "tags"))

    # Create dataframe from dictionary (with one row) (data of first rows)
    beg_df_temp = pd.DataFrame.from_dict(beg_dc, orient = 'index').T

    # Create dataframe with number of rows corresponding to events_df
    if events_df.x.count() > 1:
        beg_df = beg_df_temp.append([beg_df_temp]*(events_df.x.count()-1),ignore_index=True)

    # Concatenate all three dataframes into one
    df = pd.concat([ beg_df, events_df, session_df],axis=1)
    
    # Add index column (for further function)
    df.insert(loc=0,column='idx', value = idx)
    
    # Check if 'ta' column exists
    if 'ta' not in df.columns:
        df.insert(loc=12,column='ta', value = np.NaN)
        
    # Check if 'v' column exists
    if 'v' in df.columns:
        del df['v']

    # Return dataframe
    return df

In [3]:
def _create_dataframe_from_files(work_dir):
    '''This function takes the working directory as an input, uses the function '_create_dataframes_from_json' to 
    transform the json files into dataframes and returns a 'big' dataframe where the single dataframes of each file 
    are concatenated'''
    
    # Return list of files in working directory
    list_dir = os.listdir(work_dir)
    
    # Initialize dataframe with all files
    df_total = pd.DataFrame(columns=['idx', 'startTime', 'websitePageUrl', 'visitTime', 'engagementTime',
       'pageTitle', 'url', 'viewportWidth', 'viewportHeight', 'tags', 'e', 't',
       'ta', 'ty', 'x', 'y', 'id', 'created', 'lastActivity', 'pages',
       'duration', 'engagementTime', 'totalFriction', 'country', 'region',
       'city', 'isp', 'ip', 'lang', 'userAgent', 'browser', 'browserVersion',
       'os', 'osVersion', 'device', 'referrer', 'referrerType', 'screenRes',
       'entryPage', 'tags', 'variables', 'watched', 'starred', 'lng', 'lat',
       'visitorId', 'gdpr', 'visitorName', 'playbackUrl'])
    
    # Count how often the loop was skipped
    skipped = 0
    
    # Loop over every file in the list and return as a dataframe
    for index, file in enumerate(list_dir):
        
        try:
            df_single = _create_dataframes_from_json(file,index)
        
            # Append dataframe of a single file to the dataframe with all files
            df_total = df_total.append(df_single, ignore_index = True)
    
        except:
            skipped += 1
            continue
            
    col = ['idx','startTime', 'websitePageUrl', 'visitTime', 'engagementTime1',
    'pageTitle', 'url', 'viewportWidth', 'viewportHeight', 'tags1', 'e', 't',
    'ta', 'ty', 'x', 'y', 'id', 'created', 'lastActivity', 'pages',
    'duration', 'engagementTime2', 'totalFriction', 'country', 'region',
    'city', 'isp', 'ip', 'lang', 'userAgent', 'browser', 'browserVersion',
    'os', 'osVersion', 'device', 'referrer', 'referrerType', 'screenRes',
    'entryPage', 'tags2', 'variables', 'watched', 'starred', 'lng', 'lat',
    'visitorId', 'gdpr', 'visitorName', 'playbackUrl']
    
    df_total.columns = col
    
    # Return dataframe
    return(df_total, skipped)

In [5]:
direction = r'C:\Users\Sellit\Desktop\BA_Coding\Data\final1'

In [6]:
os.chdir(direction)

In [7]:
start=datetime.now()
(df_test,skipped) = _create_dataframe_from_files(direction)
end = datetime.now()
print(end - start)

0:36:02.903441


In [8]:
skipped

0

In [9]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698685 entries, 0 to 698684
Data columns (total 49 columns):
idx                698685 non-null object
startTime          698685 non-null object
websitePageUrl     698685 non-null object
visitTime          698685 non-null object
engagementTime1    698685 non-null object
pageTitle          698685 non-null object
url                698685 non-null object
viewportWidth      698685 non-null object
viewportHeight     698685 non-null object
tags1              698685 non-null object
e                  698685 non-null object
t                  698685 non-null object
ta                 289562 non-null object
ty                 698685 non-null object
x                  698685 non-null object
y                  698685 non-null object
id                 698685 non-null object
created            698685 non-null object
lastActivity       698685 non-null object
pages              698685 non-null object
duration           698685 non-null object
engagem

In [14]:
os.chdir(r'C:\Users\Sellit\Desktop\BA_Coding\Data\test_final')

In [15]:
start2=datetime.now()

df_test.to_hdf('out.h5', 'df_test', mode='w')

end2 = datetime.now()
print(end2 - start2)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['idx', 'startTime', 'websitePageUrl', 'visitTime', 'engagementTime1', 'pageTitle', 'url', 'viewportWidth', 'viewportHeight', 'tags1', 'e', 't', 'ta', 'ty', 'x', 'y', 'id', 'created', 'lastActivity', 'pages', 'duration', 'engagementTime2', 'totalFriction', 'country', 'region', 'city', 'isp', 'ip', 'lang', 'userAgent', 'browser', 'browserVersion', 'os', 'osVersion', 'device', 'referrer', 'referrerType', 'screenRes', 'entryPage', 'tags2', 'variables', 'watched', 'starred', 'lng', 'lat', 'visitorId', 'gdpr', 'visitorName', 'playbackUrl']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


0:00:05.974882
