In [1]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime

In [2]:
def _create_dataframes_from_sessions (json_x, idx):
    '''Return a dataframe from the json file (the idx is only important for the next function)'''

    # It is important to set the working directory to the correct path

    # Import modules
    import numpy as np
    import pandas as pd
    import json

    # Load JSON and create dictionary json_data
    with open(json_x) as json_file:
        json_data = json.load(json_file)

    # Get list of dictionaries with the corresponding x,y-pairs and time
    views_lt = json_data.get('pageViews')

    # Create dataframe from list of dictionaries
    views_df = pd.DataFrame(views_lt)

    # Get the first rows of the json file in a dictionary
    beg_dc = dict((k, json_data[k]) for k in ('id', 'created', 'lastActivity'))

    # Create dataframe from dictionary (with one row) (data of first rows)
    beg_df_temp = pd.DataFrame.from_dict(beg_dc, orient = 'index').T

    if views_df.id.count() > 1:
        # Create dataframe with number of rows corresponding to events_df
        beg_df = beg_df_temp.append([beg_df_temp]*(views_df.id.count()-1),ignore_index=True)
    else:
        beg_df = beg_df_temp
    
    # Get the last rows of the json file in a dictionary
    end_dc = dict((k, json_data[k]) for k in ('duration', 'engagementTime', 'totalFriction', 'country', 'region', 'city',
                                                  'isp', 'ip', 'lang', 'userAgent', 'browser', 'browserVersion', 'os', 
                                                  'osVersion', 'device', 'referrer', 'referrerType', 'screenRes', 'tags', 
                                                  'variables', 'watched', 'starred', 'lng', 'lat', 'gdpr'))
        
    # Create dataframe from dictionary (with one row) (data of first rows)
    end_df_temp = pd.DataFrame.from_dict(end_dc, orient = 'index').T

    if views_df.id.count() > 1:
        # Create dataframe with number of rows corresponding to events_df
        end_df = end_df_temp.append([end_df_temp]*(views_df.id.count()-1),ignore_index=True)
    else:
        end_df = end_df_temp
    
    # Concatenate all three dataframes into one
    df = pd.concat([ beg_df, views_df, end_df],axis=1)
    
    # Add index column (for further function)
    df.insert(loc=0,column='idx', value = idx)

    # Return dataframe
    return df

In [3]:
def _create_dataframe_from_files(work_dir):
    '''This function takes the working directory as an input, uses the function '_create_dataframes_from_json' to 
    transform the json files into dataframes and returns a 'big' dataframe where the single dataframes of each file 
    are concatenated'''
    
    # Return list of files in working directory
    list_dir = os.listdir(work_dir)
    
    # Initialize dataframe with all files
    df_total = pd.DataFrame(columns=['idx','id', 'created', 'lastActivity',"annotations", "endTime","engagementTime",'friction',
                                     "id","scroll", "startTime", 'tags', "title", "uri", "visitTime", "websitePage",
                                     'duration', 'engagementTime', 'totalFriction', 'country', 'region', 'city', 'isp', 'ip',
                                     'lang', 'userAgent', 'browser', 'browserVersion', 'os', 'osVersion', 'device', 'referrer',
                                     'referrerType','screenRes', 'tags', 'variables', 'watched', 'starred', 'lng', 'lat', 
                                     'gdpr'])
    
    # Loop over every file in the list and return as a dataframe
    for index, file in enumerate(list_dir):
        
        df_single = _create_dataframes_from_sessions(file,index)
        
        # Append dataframe of a single file to the dataframe with all files
        df_total = df_total.append(df_single, ignore_index = True)
    
    col = ['idx','session_id', 'created', 'lastActivity', "annotations", "endTime","engagementTime", 'friction', "page_id",
           "scroll", "startTime", 'tags1', "title", "uri", "visitTime", "websitePage", 'duration', 'total_engagementTime', 
           'totalFriction', 'country', 'region', 'city', 'isp', 'ip', 'lang', 'userAgent', 'browser', 'browserVersion', 
           'os', 'osVersion', 'device', 'referrer', 'referrerType',  'screenRes', 'tags2', 'variables', 'watched', 
           'starred', 'lng', 'lat','gdpr']
    
    df_total.columns = col
    
    # Return dataframe
    return df_total

In [4]:
direction = r'C:\Users\Sellit\Desktop\BA_Coding\Data\2Sessions\non_purchase_sessions2'

In [5]:
os.chdir(direction)

In [None]:
start=datetime.now()
df_test = _create_dataframe_from_files(direction)
end = datetime.now()
print(end - start)

In [40]:
df_test

Unnamed: 0,idx,session_id,created,lastActivity,annotations,endTime,engagementTime,friction,page_id,scroll,...,referrer,referrerType,screenRes,tags2,variables,watched,starred,lng,lat,gdpr
0,0,0039a42c876d77711d59d5c2dc254e16,2018-02-24T11:10:31.2699629+01:00,2018-02-24T11:12:11.0633247+01:00,[],2018-02-24T11:11:48.5539629+01:00,64326,0,022431738b9635846310209bb75ac46d9195791c,64.0,...,,,1920x1080,[],[gclid=EAIaIQobChMIq4mk3Ke-2QIVz53tCh1jWwmlEAQ...,False,False,7.291,52.6906,True
1,0,0039a42c876d77711d59d5c2dc254e16,2018-02-24T11:10:31.2699629+01:00,2018-02-24T11:12:11.0633247+01:00,[],2018-02-24T11:12:11.0633247+01:00,17291,0,02245075a4aec10bc7144fc54c6cdca646117120,46.0,...,,,1920x1080,[],[gclid=EAIaIQobChMIq4mk3Ke-2QIVz53tCh1jWwmlEAQ...,False,False,7.291,52.6906,True
2,1,006a4a95465115458f2199fb0f81470e,2018-02-25T11:27:38.2030631+01:00,2018-02-25T11:32:24.5571748+01:00,[],2018-02-25T11:32:24.5571748+01:00,13389,0,02255045f8d99553a60d405c60e576b1dd9f35ad,30.0,...,,,1920x1080,[],[gclid=EAIaIQobChMIrvLrue3A2QIV75ztCh1IeAVZEAY...,False,False,8.4353,49.4811,True
3,2,009f85cda07dd74de695c242ced5e22f,2018-02-24T11:53:03.0186593+01:00,2018-02-24T11:53:32.722342+01:00,[],2018-02-24T11:53:32.722342+01:00,19226,0,0224378928dffc5388e2a4f4ef7fe376bd9d286b,42.0,...,,,1920x1080,[],[gclid=EAIaIQobChMI-oWomrG-2QIVkBobCh1PzQFUEAQ...,False,False,10.1853,47.9837,True
4,3,00af6821116c1da806e693619bae0ce3,2018-02-25T07:37:06.2691477+01:00,2018-02-25T07:46:06.7447296+01:00,[],2018-02-25T07:37:48.9556422+01:00,28170,0,02251244b02c7894852a710530720d3d0f0e4935,29.0,...,,,1920x1080,[],"[utm_source=criteo, utm_medium=retargeting, ut...",False,False,9.0117,48.6821,True
5,3,00af6821116c1da806e693619bae0ce3,2018-02-25T07:37:06.2691477+01:00,2018-02-25T07:46:06.7447296+01:00,[],2018-02-25T07:46:06.7447296+01:00,21071,0,02254460e24e42fc7288f46e198445566454cbeb,29.0,...,,,1920x1080,[],"[utm_source=criteo, utm_medium=retargeting, ut...",False,False,9.0117,48.6821,True
6,3,00af6821116c1da806e693619bae0ce3,2018-02-25T07:37:06.2691477+01:00,2018-02-25T07:46:06.7447296+01:00,[],2018-02-25T07:45:44.9849382+01:00,33699,0,02251060639caa2c3114df6fd1107ac34c2088b1,100.0,...,,,1920x1080,[],"[utm_source=criteo, utm_medium=retargeting, ut...",False,False,9.0117,48.6821,True
7,3,00af6821116c1da806e693619bae0ce3,2018-02-25T07:37:06.2691477+01:00,2018-02-25T07:46:06.7447296+01:00,[],2018-02-25T07:45:51.0571877+01:00,19397,0,0225450219e124807748df8470dec35f22633cd4,71.0,...,,,1920x1080,[],"[utm_source=criteo, utm_medium=retargeting, ut...",False,False,9.0117,48.6821,True
8,4,013068cd09c4acf676258ecbc182d85d,2018-02-24T16:52:37.0636185+01:00,2018-02-24T16:52:47.6016185+01:00,[],2018-02-24T16:52:47.6016185+01:00,4764,0,02243876e5464ce047efb48ef7ab206228764fba,17.0,...,,,1920x1080,[],[],False,False,11.0671,49.4639,True
9,5,013786f30d5024546348e1d256008518,2018-02-25T12:58:26.337125+01:00,2018-02-25T13:00:53.0692444+01:00,[],2018-02-25T13:00:53.0692444+01:00,114997,0,022518536f62ed2fc19b1c3f77fc32562c3e53cf,100.0,...,https://ads.eu.criteo.com/delivery/r/afr.php?d...,Link,1920x1080,[],"[utm_source=criteo, utm_medium=retargeting, ut...",False,False,14.459,51.2542,True


In [42]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15376 entries, 0 to 15375
Data columns (total 41 columns):
idx                     15376 non-null object
session_id              15376 non-null object
created                 15376 non-null object
lastActivity            15376 non-null object
annotations             15376 non-null object
endTime                 15376 non-null object
engagementTime          15376 non-null object
friction                15376 non-null object
page_id                 15376 non-null object
scroll                  15376 non-null float64
startTime               15376 non-null object
tags1                   15376 non-null object
title                   15376 non-null object
uri                     15376 non-null object
visitTime               15376 non-null object
websitePage             15376 non-null object
duration                15376 non-null object
total_engagementTime    15376 non-null object
totalFriction           15376 non-null object
country         

In [43]:
os.chdir(r'C:\Users\Sellit\Desktop\BA_Coding\Data\final_out')

In [46]:
start2=datetime.now()

df_test.to_hdf('new_sessions.h5', 'df_test', mode='w')

end2 = datetime.now()
print(end2 - start2)

0:00:00.178902


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->['idx', 'session_id', 'created', 'lastActivity', 'annotations', 'endTime', 'engagementTime', 'friction', 'page_id', 'startTime', 'tags1', 'title', 'uri', 'visitTime', 'websitePage', 'duration', 'total_engagementTime', 'totalFriction', 'country', 'region', 'city', 'isp', 'ip', 'lang', 'userAgent', 'browser', 'browserVersion', 'os', 'osVersion', 'device', 'referrer', 'referrerType', 'screenRes', 'tags2', 'variables', 'watched', 'starred', 'lng', 'lat', 'gdpr']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
