In [1]:
import pandas as pd
# The state representation is shared by both the supervised and the RL approaches.
from gym_ianna.envs.ianna_env import get_state_rep

In [2]:
# Load the action log: a tab-separated file that uses a backslash as its
# escape character.
acts = pd.read_csv(
    '../data/experts_actions.tsv',
    sep='\t',
    escapechar='\\',
)

In [3]:
# Order the log by action_id. NOTE(review): presumably ids grow in execution
# order, so that build_all_states (defined further down) has cached a display
# before a later action uses it as its parent -- confirm.
# sort_values returns a new sorted frame, so the name is rebound instead of
# mutating the loaded object in place.
acts = acts.sort_values('action_id')

In [4]:
# Preview the first rows of the action log after sorting.
acts.head()

Unnamed: 0,creation_time,session_id,project_id,action_id,action_params,parent_display_id,child_display_id
0,2016-08-14 12:44:05,1,1,1,"{""field"":""eth_src"",""aggregations"":[],""groupPri...",1,2
1,2016-08-14 12:44:08,1,1,2,"{""field"":""ip_src"",""aggregations"":[],""groupPrio...",2,3
2,2016-08-15 09:40:42,2,1,3,"{""field"":""eth_src"",""aggregations"":[{""field"":""l...",4,5
3,2016-08-15 13:13:54,2,1,4,"{""field"":""ip_src"",""aggregations"":[{""field"":""le...",4,6
4,2016-08-15 13:14:10,2,1,5,"{""field"":""ip_src"",""aggregations"":[],""groupPrio...",4,7


In [5]:
# Read each project referenced by the action log once and keep it in memory,
# keyed by project id, so later cells can look a dataset up without re-reading.
project_cache = {}
for project_id in acts.project_id.unique():
    tsv_path = '../data/{0}.tsv'.format(project_id)
    print('reading', tsv_path)
    # The first column of the project file is used as the row index.
    project_cache[project_id] = pd.read_csv(tsv_path, sep='\t', index_col=0)

reading ../data/1.tsv
reading ../data/2.tsv
reading ../data/4.tsv
reading ../data/3.tsv


In [6]:
import json
# Maps a display id to the state shown on that display:
#   "df"         -> the DataFrame backing the display
#   "grouped_by" -> {column name: flag}, 0 initially and 1 once grouped on
#   "rep"        -> get_state_rep(df, grouped_by) for that state
disp_cache = {}


def build_all_states(x):
    """Record in ``disp_cache`` the state produced by one action.

    The parent display's state is taken from ``disp_cache`` when it is already
    known. Otherwise the parent is treated as the untouched project dataset
    (no column grouped) and is cached under the parent display id first.
    The child state is a copy of the parent's; if the action carries a
    "groupPriority" key, the action's "field" is additionally flagged as
    grouped. The data itself is copied unchanged.

    Parameters
    ----------
    x : row of ``acts`` (anything indexable by column name)
        Must provide 'parent_display_id', 'child_display_id', 'project_id'
        and 'action_params' (a JSON string).

    Returns
    -------
    None
        The function works purely through its side effect on ``disp_cache``.

    Raises
    ------
    KeyError
        If the parent is unknown and 'project_id' is not in ``project_cache``,
        or a group action has no "field" entry.
    ValueError
        If 'action_params' is not valid JSON (``json.JSONDecodeError``).
    """
    parent_id = x['parent_display_id']
    parent_state = disp_cache.get(parent_id)
    if parent_state is None:
        # First time this display is seen: start from the raw project data.
        root_df = project_cache[x['project_id']]
        root_grouped_by = {col: 0 for col in root_df.columns}
        parent_state = {
            "df": root_df,
            "grouped_by": root_grouped_by,
            "rep": get_state_rep(root_df, root_grouped_by),
        }
        disp_cache[parent_id] = parent_state

    action = json.loads(x['action_params'])

    # Copy so that the child's state never aliases (and mutates) its parent's.
    new_df = parent_state["df"].copy()
    new_grouped_by = parent_state["grouped_by"].copy()
    if "groupPriority" in action:
        new_grouped_by[action["field"]] = 1

    disp_cache[x['child_display_id']] = {
        "df": new_df,
        "grouped_by": new_grouped_by,
        "rep": get_state_rep(new_df, new_grouped_by),
    }
# Populate disp_cache by visiting every action row in the current order of
# `acts`. The trailing semicolon keeps the notebook from echoing apply()'s
# (meaningless) result.
acts.apply(build_all_states, axis=1);


In [7]:
# One list per group action: [label, followed by the parent display's "rep"].
recs = []


def build_acts_rep_table(x):
    """Append one record to ``recs`` when the action is a group action.

    An action counts as a group action when its JSON parameters contain a
    "groupPriority" key. For those, the label is the position of the
    action's "field" among the columns of the action's project dataset, and
    the remaining values are the cached "rep" of the parent display. Any
    other action is ignored.

    Parameters
    ----------
    x : row of ``acts`` (anything indexable by column name)
        Must provide 'action_params' (a JSON string), 'project_id' and
        'parent_display_id'.

    Returns
    -------
    None
        Records are collected in the module-level ``recs`` list.
    """
    params = json.loads(x['action_params'])
    if "groupPriority" in params:
        grouped_field = params["field"]
        column_names = project_cache[x['project_id']].columns.tolist()
        label = column_names.index(grouped_field)
        recs.append([label] + list(disp_cache[x['parent_display_id']]['rep']))

# Fill `recs` with one record per group action. build_acts_rep_table returns
# nothing, so the result of apply() is deliberately discarded.
acts.apply(build_acts_rep_table, axis=1)

# The first value of every record is the label. The column is renamed as part
# of the construction chain, so the table is never mutated in place.
acts_rep_table = pd.DataFrame(recs).rename(columns={0: 'label'})
acts_rep_table.head()

Unnamed: 0,label,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,2,8648,8648,10,2,2,3,8148,0,3692,...,0,0,0,0,0,0,0,0,0,0
1,7,8648,2,10,2,2,3,8148,0,3692,...,0,0,0,0,0,0,0,0,0,0
2,2,8648,8648,10,2,2,3,8148,0,3692,...,0,0,0,0,0,0,0,0,0,0
3,7,8648,8648,10,2,2,3,8148,0,3692,...,0,0,0,0,0,0,0,0,0,0
4,7,8648,8648,10,2,2,3,8148,0,3692,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Persist the label + state-representation table, leaving the positional row
# index out of the file.
acts_rep_table.to_csv(
    '../data/acts_rep_table.csv',
    index=False,
)