In [None]:
import pandas as pd

import elastic.container
from elastic.record_event import RecordEvent

@RecordEvent
def cell_2():
    """Read ``game_logs.csv`` (relative to the working directory) into a DataFrame.

    Returns:
        The DataFrame produced by ``pd.read_csv('game_logs.csv')``.
    """
    return pd.read_csv('game_logs.csv')

In [None]:
# Run the recorded loader and bind its DataFrame as the notebook-global `gl`,
# which the later cell_* functions read directly.
gl = cell_2()

# Last expression of the cell: preview the first rows of the loaded frame.
gl.head()

In [None]:
@RecordEvent
def cell_3():
    """Print the ``DataFrame.info`` summary of the notebook-global ``gl``.

    ``memory_usage='deep'`` is passed so the report uses deep memory
    accounting. Returns nothing; the summary is emitted by ``info`` itself.
    """
    gl.info(memory_usage='deep')

In [None]:
# Execute the recorded cell; it only prints and has no return value to bind.
cell_3()

In [None]:
@RecordEvent
def cell_4():
    """Print the mean memory footprint of ``gl`` for each dtype family.

    For each of 'float', 'int' and 'object', the matching columns of the
    notebook-global ``gl`` are selected, ``memory_usage(deep=True)`` is
    averaged, and the value is reported in MB (bytes / 1024**2).

    Returns:
        A 3-tuple ``(selected frame, mean bytes, mean MB)`` holding the
        values from the last family processed, i.e. 'object'.
    """
    bytes_per_mb = 1024 ** 2
    report = "Average memory usage for {} columns: {:03.2f} MB"
    for dtype in ('float', 'int', 'object'):
        subset = gl.select_dtypes(include=[dtype])
        avg_bytes = subset.memory_usage(deep=True).mean()
        avg_mb = avg_bytes / bytes_per_mb
        print(report.format(dtype, avg_mb))
    # The loop variables intentionally outlive the loop: only the final
    # ('object') iteration is handed back to the caller.
    return subset, avg_bytes, avg_mb

In [None]:
# Bind cell_4's results as notebook globals; they hold the values from the
# last dtype the function iterated over ('object').
selected_dtype, mean_usage_b, mean_usage_mb = cell_4()

In [None]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an ``"X.XX MB"`` string.

    Args:
        pandas_obj: A DataFrame, or any other object exposing
            ``memory_usage(deep=True)`` (treated as a Series).

    Returns:
        The usage in megabytes (bytes / 1024**2), formatted with two decimals
        and an " MB" suffix.
    """
    bytes_used = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one value per column; collapse to a total.
        bytes_used = bytes_used.sum()
    megabytes = bytes_used / 1024 ** 2
    return "{:03.2f} MB".format(megabytes)

In [None]:
@RecordEvent
def cell_5():
    """Downcast the 'int' columns of ``gl`` to unsigned types and report the saving.

    Prints the memory usage (via ``mem_usage``) of the selected columns
    before and after ``pd.to_numeric(..., downcast='unsigned')``, then builds
    a before/after table of the column dtypes.

    Returns:
        A 3-tuple ``(original int columns, downcast columns, dtype table)``;
        the dtype table has the columns ``'before'`` and ``'after'``.
    """
    int_cols = gl.select_dtypes(include=['int'])
    downcast_ints = int_cols.apply(pd.to_numeric, downcast='unsigned')
    for frame in (int_cols, downcast_ints):
        print(mem_usage(frame))
    dtype_table = pd.concat([int_cols.dtypes, downcast_ints.dtypes], axis=1)
    dtype_table.columns = ['before', 'after']
    # NOTE(review): the value-counts summary is computed but its result is
    # discarded (neither printed nor returned) - confirm whether it should be
    # displayed.
    dtype_table.apply(pd.Series.value_counts)
    return int_cols, downcast_ints, dtype_table

In [None]:
# Bind the original int columns, their downcast version and the dtype
# comparison table as notebook globals (converted_int is read by cell_7).
gl_int, converted_int, compare_ints = cell_5()

In [None]:
@RecordEvent
def cell_6():
    """Downcast the 'float' columns of ``gl`` and report the saving.

    Prints the memory usage (via ``mem_usage``) of the selected columns
    before and after ``pd.to_numeric(..., downcast='float')``, then builds a
    before/after table of the column dtypes.

    Returns:
        A 3-tuple ``(gl_int, downcast float columns, dtype table)``. The first
        element is the notebook-global ``gl_int``, not the float selection
        made inside this function.
    """
    float_cols = gl.select_dtypes(include=['float'])
    downcast_floats = float_cols.apply(pd.to_numeric, downcast='float')
    for frame in (float_cols, downcast_floats):
        print(mem_usage(frame))
    dtype_table = pd.concat([float_cols.dtypes, downcast_floats.dtypes], axis=1)
    dtype_table.columns = ['before', 'after']
    # NOTE(review): the value-counts summary is computed but its result is
    # discarded (neither printed nor returned) - confirm whether it should be
    # displayed.
    dtype_table.apply(pd.Series.value_counts)
    # NOTE(review): returns the global gl_int rather than the local float
    # selection, so the float frame never leaves this function. Looks like a
    # copy-paste slip from cell_5; kept as-is because the caller unpacks the
    # first element into gl_int - confirm and fix both sides together.
    return gl_int, downcast_floats, dtype_table

In [None]:
# NOTE(review): the first target is gl_int, mirroring cell_6's return of the
# global gl_int; no gl_float global is ever created by this cell. Confirm
# whether gl_float was intended here and in cell_6's return statement.
gl_int, converted_float, compare_floats = cell_6()

In [None]:
@RecordEvent
def cell_7():
    """Build a copy of ``gl`` with the downcast int and float columns swapped in.

    Reads the notebook globals ``gl``, ``converted_int`` and
    ``converted_float``. Prints the memory usage of ``gl`` and then of the
    optimized copy (via ``mem_usage``).

    Returns:
        The optimized copy; ``gl`` itself is not modified because all
        assignments target the copy.
    """
    optimized_gl = gl.copy()
    # Overwrite the matching columns with their downcast counterparts,
    # ints first and floats second.
    for downcast in (converted_int, converted_float):
        optimized_gl[downcast.columns] = downcast

    for frame in (gl, optimized_gl):
        print(mem_usage(frame))

    return optimized_gl

In [None]:
# Bind the frame with downcast numeric columns as a notebook global.
optimized_gl = cell_7()

In [None]:
@RecordEvent
def cell_8():
    """Return an independent copy of the 'object'-dtype columns of ``gl``.

    Returns:
        A new DataFrame (``.copy()`` of the selection) so later edits do not
        touch the notebook-global ``gl``.
    """
    return gl.select_dtypes(include=['object']).copy()

In [None]:
# Bind the object-dtype columns as the notebook-global `gl_obj`
# (read by cell_9 and cell_10).
gl_obj = cell_8()
# Last expression of the cell: show summary statistics for those columns.
gl_obj.describe()

In [None]:
@RecordEvent
def cell_9():
    """Show the ``day_of_week`` column before and after conversion to 'category'.

    Reads the notebook-global ``gl_obj`` and prints the head of the original
    column, then the head of its categorical version.

    Returns:
        A 2-tuple ``(original column, categorical column)``.
    """
    weekday = gl_obj.day_of_week
    print(weekday.head())
    weekday_cat = weekday.astype('category')
    print(weekday_cat.head())

    return weekday, weekday_cat

In [None]:
# Bind the original day_of_week column and its categorical version as
# notebook globals.
dow, dow_cat = cell_9()

In [None]:
@RecordEvent
def cell_10():
    """Convert low-cardinality columns of ``gl_obj`` to the 'category' dtype.

    For every column of the notebook-global ``gl_obj``, the column is stored
    as 'category' when fewer than 50% of its values are unique; otherwise it
    is copied over unchanged.

    Returns:
        A 3-tuple ``(converted_obj, num_unique_values, num_total_values)``.
        The two counters describe the last column processed. They are 0 when
        ``gl_obj`` has no columns; previously that case raised
        ``UnboundLocalError`` at the return statement.
    """
    converted_obj = pd.DataFrame()
    # Pre-bind the counters so the return statement is well-defined even when
    # the loop body never runs.
    num_unique_values = 0
    num_total_values = 0
    for col in gl_obj.columns:
        column = gl_obj[col]
        num_unique_values = len(column.unique())
        num_total_values = len(column)
        # Guard the ratio: a zero-row frame used to raise ZeroDivisionError.
        # With no rows there is nothing to gain, so the column is kept as-is.
        is_low_cardinality = (
            num_total_values > 0
            and num_unique_values / num_total_values < 0.5
        )
        if is_low_cardinality:
            converted_obj.loc[:, col] = column.astype('category')
        else:
            converted_obj.loc[:, col] = column

    return converted_obj, num_unique_values, num_total_values

In [None]:
# Bind the converted frame plus the unique/total counters as notebook
# globals; the counters come from the last column cell_10 processed.
converted_obj, num_unique_values, num_total_values = cell_10()

In [None]:
# Dump every entry of elastic.container.operation_events, each followed by
# an extra newline string for spacing.
for x in elastic.container.operation_events:
    print(x, "\n")

In [None]:
# Dump every entry of elastic.container.data_events, each followed by an
# extra newline string for spacing.
for x in elastic.container.data_events:
    print(x, "\n")

In [None]:
# Start each operation event with an empty set of parents and an empty set
# of children; the events themselves are the dict keys, so they must be
# hashable.
parent_lookup = {oe: set() for oe in elastic.container.operation_events}
children_lookup = {oe: set() for oe in elastic.container.operation_events}

In [None]:
# Link operation events through the data events attached to them: for each
# related data event, its prev_operation_event is recorded as a parent of
# `oe`, and `oe` as a child of that parent.
# NOTE(review): children_lookup only has keys for entries of
# operation_events; if prev_operation_event can be None or an event outside
# that list, the children_lookup[parent_oe] access raises KeyError - confirm.
for oe in elastic.container.operation_events:
    for related_data_event in elastic.container.operation_event_lookup[oe.exec_uuid].related_data_events:
        parent_oe = related_data_event.prev_operation_event
        
        parent_lookup[oe].add(parent_oe)
        children_lookup[parent_oe].add(oe)