In [None]:
import pandas as pd
import json
import numpy as np

In [None]:
# Read the exported event history. JSON interchange files are UTF-8, so pin the
# encoding instead of depending on the platform's locale default, and let
# json.load consume the file handle directly.
with open('../data/event_history.json', encoding='utf-8') as fin:
    raw = json.load(fin)

In [None]:
# Inspect the top-level keys of the parsed JSON document.
raw.keys()

In [None]:
# Build the working table from the 'Records' entry of the parsed JSON.
# presumably a list of per-event dicts (one row each) — confirm against the file.
data = pd.DataFrame.from_dict(raw['Records'])

In [None]:
# (rows, columns) of the event table.
data.shape

In [None]:
# Preview the first few events.
data.head()

In [None]:
# Look at one raw userIdentity value to see the structure the parser below reads.
data['userIdentity'].iloc[0]

In [None]:
def parse_node_from_user_identity(user_identity):
    """Derive a ``(node, node_type)`` pair from one ``userIdentity`` record.

    Parameters
    ----------
    user_identity : dict
        Mapping read by key. ``'type'`` selects the branch; ``'principalId'``
        or ``'invokedBy'`` supplies the node name.

    Returns
    -------
    tuple
        * ``(principalId, type)`` when ``type`` is ``'AssumedRole'`` or
          ``'AWSAccount'``.
        * ``(invokedBy, type)`` when ``type`` is ``'AWSService'``.
        * ``(None, None)`` for any other ``type`` value.
        * ``('Account', invokedBy_or_None)`` when a required key is missing or
          the input cannot be indexed by key (e.g. ``None`` / NaN). The error
          is printed before falling back.
    """
    try:
        typ = user_identity['type']

        # Only principalId is returned for these two types. Earlier revisions
        # also read sessionContext.sessionIssuer.arn / invokedBy here without
        # using them, which raised KeyError for records lacking those keys.
        if typ in ('AssumedRole', 'AWSAccount'):
            return user_identity['principalId'], typ

        if typ == 'AWSService':
            return user_identity['invokedBy'], typ

    except (KeyError, TypeError) as e:
        print(e)
        # Use a non-raising lookup: the fallback must not fail a second time
        # when 'invokedBy' is absent or the input is not a dict.
        # NOTE(review): this fallback places the literal 'Account' in the node
        # slot and invokedBy in the type slot, the reverse of the branches
        # above. Kept unchanged for compatibility — confirm it is intended.
        invoke = user_identity.get('invokedBy') if isinstance(user_identity, dict) else None
        return 'Account', invoke
    return None, None

In [None]:
def parse_node_from_request_parameter(request):
    """Collect the values of every request parameter whose key mentions 'arn'.

    The key match is a case-insensitive substring test. Each matching value is
    converted with ``str``. Any failure while scanning (for example ``request``
    is not a mapping, or a key is not a string) yields ``None`` instead of
    raising.

    Returns
    -------
    list of str or None
        Stringified values in key order (possibly empty), or ``None`` on error.
    """
    try:
        matches = []
        for key in request.keys():
            if 'arn' in key.lower():
                matches.append(str(request[key]))
    except Exception:
        # Best-effort parser: unparseable inputs are reported as None.
        return None
    return matches
    

In [None]:
# Extract ARN-like request parameters for every event.
data['p_request'] = data['requestParameters'].apply(parse_node_from_request_parameter)

# Show the first parsed entry that carries more than one ARN.
parsed_requests = data.loc[data['p_request'].notna(), 'p_request']
parsed_requests[parsed_requests.apply(lambda arns: len(arns) > 1)].iloc[0]

In [None]:
# Display the raw requestParameters column for comparison with p_request.
data['requestParameters']

In [None]:
# Parse every userIdentity into a (node, node_type) tuple.
data['p_identity'] = data['userIdentity'].apply(parse_node_from_user_identity)

# The parser always returns a tuple, and a tuple is never NA, so
# Series.isna() cannot flag unparsed rows. Inspect the tuple contents instead
# to list identities that produced a missing node or type.
data[data['p_identity'].apply(lambda pair: pair is None or any(part is None for part in pair))]

In [None]:
# Column name -> positional index, taken from the current column order of `data`.
col_idx_map = {name: position for position, name in enumerate(data.columns.values)}
col_idx_map

In [None]:
def parse_edge_from_action(row):
    """Turn one event row into an edge tuple.

    Fields are read by column label, so the function works with any
    label-indexable row (such as the Series that ``DataFrame.apply(axis=1)``
    passes in). It does not depend on the column order or on a separately
    maintained name-to-position map, and it avoids integer indexing on a
    label-indexed Series.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'eventTime'``, ``'eventSource'``, ``'eventName'``,
        ``'p_request'`` (list of ARN strings or ``None``) and ``'p_identity'``
        (a 2-tuple ``(src, src_type)``).

    Returns
    -------
    tuple
        ``(eventTime, src, src_type, dst, dst_type, edge_type)`` where
        ``dst_type`` is the event source, ``edge_type`` is the event name and
        ``dst`` is the first parsed ARN, falling back to the event source when
        no ARN was parsed.
    """
    event_time = row['eventTime']
    dst_type = row['eventSource']

    # Prefer the first ARN found in the request; otherwise the service itself
    # is the destination.
    arns = row['p_request']
    dst = arns[0] if arns is not None and len(arns) > 0 else dst_type

    edge_type = row['eventName']
    src, src_type = row['p_identity']

    return (event_time, src, src_type, dst, dst_type, edge_type)
    

In [None]:
# Inspect the row labelled 2 in full.
data.loc[2]

In [None]:
# Build one edge tuple per event row.
data['edges'] = data.apply(parse_edge_from_action, axis=1)

In [None]:
# Expand the edge tuples into a frame with one named column per tuple slot.
edge_columns = ['timestamp', 'src', 'src_type', 'dst', 'dst_type', 'edge_type']
edge_df = pd.DataFrame(data['edges'].tolist(), columns=edge_columns)

edge_df

In [None]:
# def get_alphabets():
#     lo_offset = 97
#     al = []
#     for i in range(10):
#         al.append(i)
        
#     for i in range(26):
# #         print(f"{i+lo_offset}, {chr(i+lo_offset)}")
#         al.append(chr(i+lo_offset))
#     ca_offset = 65
#     for i in range(26):
# #         print(f"{i+ca_offset}, {chr(i+ca_offset)}")
#         al.append(chr(i+ca_offset))
#     return al
# get_alphabets()

In [None]:
def encode_edge_info(df):
    """Build lookup tables that encode node names and type labels.

    Node names are the sorted unique values found in ``src`` and ``dst``; they
    map to consecutive integers starting at 0. Type labels are the sorted
    unique values found in ``edge_type``, ``src_type`` and ``dst_type``; they
    map to lowercase hexadecimal strings without the ``0x`` prefix.

    Returns
    -------
    tuple of dict
        ``(node_2_id, id_2_node, type_2_id, id_2_type)``.
    """
    node_columns = ('src', 'dst')
    type_columns = ('edge_type', 'src_type', 'dst_type')

    unique_nodes = np.unique(np.concatenate([df[col].unique() for col in node_columns]))
    unique_types = np.unique(np.concatenate([df[col].unique() for col in type_columns]))

    # Forward tables come straight from the enumeration; reverse tables are
    # derived by inverting them.
    id_2_node = dict(enumerate(unique_nodes))
    node_2_id = {node: idx for idx, node in id_2_node.items()}

    id_2_type = {format(idx, 'x'): label for idx, label in enumerate(unique_types)}
    type_2_id = {label: code for code, label in id_2_type.items()}

    return node_2_id, id_2_node, type_2_id, id_2_type
    

In [None]:
# Build the node/type lookup tables from the edge list; the encoding step below reads them as globals.
node_2_id, id_2_node, type_2_id, id_2_type = encode_edge_info(edge_df)

In [None]:
# Show the node name -> integer id table.
node_2_id

In [None]:
# Show the hex type id -> type label table.
id_2_type

In [None]:
def convert_encoded_edges(row):
    """Replace the names in one edge row with their encoded ids.

    The row is read positionally as
    ``(timestamp, src, src_type, dst, dst_type, edge_type)``. It is unpacked
    through ``tuple(row)`` rather than ``row[0]`` ... ``row[5]``, because
    integer keys on a label-indexed pandas Series rely on a deprecated
    positional fallback. Plain tuples and lists work the same way.

    Uses the module-level ``node_2_id`` and ``type_2_id`` tables.

    Returns
    -------
    tuple
        ``(timestamp, src_id, src_type_id, dst_id, dst_type_id, edge_type_id)``
        with the timestamp passed through unchanged.

    Raises
    ------
    KeyError
        If a node name or type label is missing from the lookup tables.
    """
    timestamp, src, src_type, dst, dst_type, edge_type = tuple(row)[:6]

    return (timestamp,
            node_2_id[src],
            type_2_id[src_type],
            node_2_id[dst],
            type_2_id[dst_type],
            type_2_id[edge_type])
    
    
    
# Preview the encoded tuples, one per edge row.
edge_df.apply(convert_encoded_edges, axis=1)

In [None]:
# Encode every edge, then expand the resulting tuples into named columns.
encoded_rows = edge_df.apply(convert_encoded_edges, axis=1)
encoded_df = pd.DataFrame.from_records(
    encoded_rows,
    columns=['timestamp', 'src', 'src_type', 'dst', 'dst_type', 'edge_type'],
)
encoded_df

In [None]:
# Persist the encoded edges (timestamp included, header row, no index) as a tab-separated file.
encoded_df.to_csv('../data/raw_edges.tsv', sep='\t', index=False)

In [None]:
# edge_df.loc[7943].dst

In [None]:
# for idx, i in enumerate(edge_df['dst'].values):
#     t = str(type(i))
#     if t != "<class 'str'>":
#         print(idx, t)

In [None]:
# [edge_df[col].unique() for col in edge_df.columns]

### GID (Treat each user as a different graph for each minute)

In [None]:
# graph_id is a straight copy of the encoded src id, so all edges that share a
# source node share a graph id.
# NOTE(review): the section heading mentions one graph per user per minute, but
# no time component is used here — confirm whether the minute bucket was meant
# to be part of the id.
encoded_df['graph_id'] = encoded_df['src'] 
encoded_df

In [None]:
# Write the encoded edges without the timestamp column, header, or index.
encoded_df.drop(columns='timestamp').to_csv(
    '../data/data_edges.tsv',
    sep='\t',
    index=False,
    index_label=False,
    header=False,
)

In [None]:
# Largest encoded destination node id.
encoded_df.dst.max()

In [None]:
# Inspect the edges whose encoded source id is 47.
# NOTE(review): ids are assigned from the sorted order of node names in
# encode_edge_info, so 47 may point at a different node on other input data.
encoded_df[encoded_df.src==47]

In [None]:
# encoded_df[encoded_df.graph_id.isin([71, 72, 47, 46])].groupby('graph_id').head(10)

# sample_df = pd.concat(
#     [encoded_df[~encoded_df.graph_id.isin([71, 72, 47, 46])],
#     encoded_df[encoded_df.graph_id.isin([71, 72, 47, 46])].groupby('graph_id').head(10)]
# )
# sample_df

In [None]:
# sample_df.dtypes

In [None]:
# sample_df.drop('timestamp', axis=1).to_csv('../data/sample_edges.tsv', sep='\t', index=False, index_label=False, header=False)

In [None]:
# Parse the timestamp column into datetimes so the .dt accessor can be used on it.
encoded_df['timestamp'] = pd.to_datetime(encoded_df.timestamp)

In [None]:
# Minute-resolution bucket label for each edge, formatted as 'YYYY-MM-DD HH-MM'.
encoded_df['hm'] = encoded_df.timestamp.dt.strftime('%Y-%m-%d %H-%M')

In [None]:
# Drop the exact timestamp and collapse rows that are identical on every
# remaining column (including the minute bucket 'hm').
sample_df = (
    encoded_df
    .drop(columns='timestamp')
    .drop_duplicates()
)
sample_df

In [None]:
# Write the de-duplicated edges without the minute bucket, header, or index.
sample_df.drop(columns=['hm']).to_csv(
    '../data/sample_edges.tsv',
    sep='\t',
    index=False,
    index_label=False,
    header=False,
)

In [None]:
# Inspect the sampled edges for four specific encoded source ids.
# NOTE(review): these ids depend on the sorted node order produced by
# encode_edge_info for this dataset — confirm they still refer to the intended
# nodes if the input changes.
sample_df[sample_df.src.isin([71,72,46,47])]