In [115]:
import json
import operator

import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # makes `sp.stats` available on SciPy versions without lazy submodule loading

import pyreact.query.packets as qp
import pyreact_core.query.displays as qd

In [116]:
# Initialisation: read the experts' actions/displays tables and the raw datasets.
# Both expert tables are tab-separated and use a backslash as the escape character.
actions = pd.read_csv('experts_dataset/experts_actions.tsv', sep='\t', escapechar='\\')
displays = pd.read_csv('experts_dataset/experts_displays.tsv', sep='\t', escapechar='\\')

# data[k] holds raw_datasets/<k+1>.tsv, so a 1-based project id maps to data[id-1].
data = []
for i in range(4):
    df = pd.read_csv('raw_datasets/{}.tsv'.format(i + 1), sep='\t', index_col=0)
    data += [df]
    

In [117]:
def hack_min(pd_series):
    """Return ``np.min`` of ``pd_series`` after its null entries are dropped."""
    non_null_values = pd_series.dropna()
    return np.min(non_null_values)
def hack_max(pd_series):
    """Return ``np.max`` of ``pd_series`` after its null entries are dropped."""
    non_null_values = pd_series.dropna()
    return np.max(non_null_values)


# Binary comparison callables keyed by the integer "condition" code of a filter
# (looked up by get_filtered_df).
INT_OPERATOR_MAP = dict([
    (8, operator.eq),     # ==
    (32, operator.gt),    # >
    (64, operator.ge),    # >=
    (128, operator.lt),   # <
    (256, operator.le),   # <=
    (512, operator.ne),   # !=
])

# Aggregation callables keyed by the aggregation "type" string (looked up by
# get_groupby_df). 'min'/'max' use the null-dropping hack_min/hack_max helpers
# rather than np.nanmin/np.nanmax.
AGG_MAP = dict(
    sum=np.sum,
    count=len,
    min=hack_min,
    max=hack_max,
    avg=np.mean,
)

# Dataset columns for which calc_data_layer computes per-column measures.
KEYS = [
    'eth_dst',
    'eth_src',
    'highest_layer',
    'info_line',
    'ip_dst',
    'ip_src',
    'length',
    'number',
    'sniff_timestamp',
    'tcp_dstport',
    'tcp_srcport',
    'tcp_stream',
]


In [118]:
def get_filtered_df(project_id,filtering_dict):
    """Return the rows of a project's raw dataset that pass every filter.

    Parameters
    ----------
    project_id : int
        1-based dataset id; the frame is taken from ``data[project_id-1]``.
    filtering_dict : dict
        Must contain a ``"list"`` entry holding filter dicts, each with the
        keys ``"field"`` (column name), ``"condition"`` (integer code) and
        ``"term"`` (comparison value). An empty/falsy list means "no filter".

    Returns
    -------
    pandas.DataFrame
        A copy of the dataset with the filters applied one after another.

    Notes
    -----
    Condition codes found in ``INT_OPERATOR_MAP`` are applied as comparisons;
    the term is converted to ``float`` unless the column has object dtype.
    Codes 16 / 2 / 4 map to ``str.contains`` / ``str.startswith`` /
    ``str.endswith`` with ``na=False``. Any other code leaves the frame
    unchanged. ``str.contains`` is called with pandas' defaults, so the term
    is interpreted as a regular expression.
    """
    # Name of the pandas string-accessor method behind each text-matching code.
    text_matchers = {16: "contains", 2: "startswith", 4: "endswith"}

    criteria = filtering_dict["list"]
    result = data[project_id-1].copy()

    for criterion in (criteria or []):
        column = criterion["field"]
        code = criterion["condition"]
        term = criterion["term"]

        if code in INT_OPERATOR_MAP:
            compare = INT_OPERATOR_MAP[code]
            # Terms arrive as text; non-object columns need a numeric operand.
            if result[column].dtype != 'O':
                term = float(term)
            result = result[compare(result[column], term)]
            continue

        matcher = text_matchers.get(code)
        if matcher is not None:
            text = result[column].str
            result = result[getattr(text, matcher)(term, na=False)]

    return result

In [119]:
def get_groupby_df(df,grouping_dict,aggregation_dict):
    """Group ``df`` and aggregate it as described by the grouping/aggregation specs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to group.
    grouping_dict : dict
        ``grouping_dict["list"]`` holds dicts whose ``"field"`` names a
        group-by column.
    aggregation_dict : dict or None
        When truthy, ``aggregation_dict["list"]`` holds dicts with ``"field"``
        (column to aggregate) and ``"type"`` (a key of ``AGG_MAP``).

    Returns
    -------
    tuple
        ``(groupby_object, aggregated_frame)`` -- in that order -- or
        ``(None, None)`` when no grouping column is given. The aggregated
        frame always has a ``number`` column; it holds the per-group row count
        unless a custom aggregation on ``number`` replaces it.
    """
    group_specs = grouping_dict["list"]
    agg_specs = aggregation_dict["list"] if aggregation_dict else None

    by_columns = [spec["field"] for spec in group_specs]
    if len(by_columns) == 0:
        return None, None

    grouped = df.groupby(by_columns)

    # Every group-by gets the row count by default in REACT-UI.
    reducers = {'number': len}
    # Custom aggregations: sum, count, avg, min, max.
    for spec in (agg_specs or []):
        reducer = AGG_MAP.get(spec['type'])
        reducers[spec['field']] = reducer

    return grouped, grouped.agg(reducers)

def get_df_by_row(row):
    """Return the filtered frame for a row, using its ``project_id`` and JSON ``filtering`` fields."""
    project_id = row["project_id"]
    filtering_spec = json.loads(row["filtering"])
    return get_filtered_df(project_id, filtering_spec)

def get_grouping_by_row(row):
    """Filter, then group and aggregate, the dataset a row refers to.

    Reads ``project_id`` plus the JSON-encoded ``filtering``, ``grouping`` and
    ``aggregations`` fields of ``row`` and returns the
    ``(groupby_object, aggregated_frame)`` pair produced by ``get_groupby_df``.
    """
    project_id = row["project_id"]
    filtered = get_filtered_df(project_id, json.loads(row["filtering"]))
    grouping_spec = json.loads(row["grouping"])
    aggregation_spec = json.loads(row["aggregations"])
    grouped, aggregated = get_groupby_df(filtered, grouping_spec, aggregation_spec)
    return grouped, aggregated
    

    
        

    

In [None]:
### MEASURES: per-column ("data layer") and per-grouping ("granularity layer") statistics


In [1]:
def get_data_column_measures(column, bins=20):
    """Compute the "data layer" measures of a single column.

    Parameters
    ----------
    column : pandas.Series
        The column to summarise.
    bins : int, default 20
        Number of equal-width histogram bins used to discretise a numeric
        column; ``log(bins)`` is also the entropy normalisation constant.

    Returns
    -------
    dict
        ``"unique"``  -- distinct non-null values divided by the non-null count.
        ``"nulls"``   -- null count divided by the column length.
        ``"entropy"`` -- normalized value entropy: for numeric columns the
        entropy of the ``bins``-bin histogram divided by ``log(bins)``; for
        every other column the entropy of the value counts divided by
        ``log(number of non-null values)``.

    Degenerate inputs (empty column, all-null column, a single non-null value
    in a non-numeric column) yield ``0.0`` for the affected measure instead of
    the NaN / division-by-zero warnings the plain formulas would produce.
    """
    size = len(column)
    column_na = column.dropna()
    non_null = len(column_na)
    n = int(column.isnull().sum())
    u = column.nunique()

    unique_ratio = u / non_null if non_null else 0.0
    null_ratio = n / size if size else 0.0

    if pd.api.types.is_numeric_dtype(column):
        # Histogram entropy is undefined without data or with a single bin.
        if non_null == 0 or bins < 2:
            h = 0.0
        else:
            counts = np.histogram(column_na, bins=bins)[0]
            h = sp.stats.entropy(counts) / np.log(bins)
    else:
        # log(1) == 0, so fewer than two values cannot be normalised.
        if non_null < 2:
            h = 0.0
        else:
            h = sp.stats.entropy(column_na.value_counts().values) / np.log(non_null)

    return {"unique": float(unique_ratio), "nulls": float(null_ratio), "entropy": float(h)}

def calc_data_layer(disp_row):
    """Compute the "data layer" measures for a display row.

    The row's dataset is filtered according to its ``project_id`` and
    JSON-encoded ``filtering`` fields; ``get_data_column_measures`` is then
    applied to each column listed in ``KEYS`` and the result is returned via
    ``to_dict()``.
    """
    project_id = disp_row["project_id"]
    filtering_spec = json.loads(disp_row["filtering"])
    filtered = get_filtered_df(project_id, filtering_spec)
    per_column = filtered[KEYS].apply(get_data_column_measures)
    return per_column.to_dict()

def get_grouping_measures(group_obj,agg_df):
    """Compute the "granularity layer" measures of a grouped display.

    "number" is the unique identifier of a packet, therefore the ``number``
    column of ``agg_df`` is used as the size of each group (although this may
    feel hacky).

    Parameters
    ----------
    group_obj : pandas GroupBy object or None
        The group-by object of the display.
    agg_df : pandas.DataFrame or None
        The aggregated frame; must contain a ``number`` column.

    Returns
    -------
    dict or None
        ``None`` when either argument is ``None``; otherwise a dict with
        ``"group_attrs"`` (``group_obj.keys``), ``"agg_attrs"`` (normalized
        entropy of every aggregated column other than ``number``),
        ``"ngroups"`` (number of groups), ``"size_var"`` (variance of the
        group sizes expressed as fractions of the total) and ``"size_mean"``
        (mean group size).

    For each aggregated column the null entries are dropped first. Numeric
    columns use the entropy of a 20-bin histogram divided by ``log(20)``;
    other columns use the entropy of the value counts divided by
    ``log(number of values)``. Columns with no usable spread (no values, or a
    single non-numeric value) get ``0.0``.
    """
    if group_obj is None or agg_df is None:
        return None
    B=20  # histogram bin count, also the entropy normalisation base
    groups_num=len(group_obj)
    size_var=np.var(agg_df.number/np.sum(agg_df.number))
    size_mean = np.mean(agg_df.number)
    group_keys=group_obj.keys
    # list.remove() works in place and returns None, so build the filtered
    # list explicitly; otherwise no aggregated column is ever measured.
    agg_keys=[key for key in agg_df.keys() if key != "number"]
    agg_nve_dict={}
    for ak in agg_keys:
        values = agg_df[ak].dropna()  # NaN would make np.histogram raise
        if len(values) == 0:
            nve = 0.0
        elif pd.api.types.is_numeric_dtype(values):
            nve = sp.stats.entropy(np.histogram(values,bins=B)[0])/np.log(B)
        elif len(values) < 2:
            nve = 0.0  # log(1) == 0: a single value cannot be normalised
        else:
            # e.g. min/max over a text column: histogram is not applicable
            nve = sp.stats.entropy(values.value_counts().values)/np.log(len(values))
        agg_nve_dict[ak]=float(nve)
    return {"group_attrs":group_keys,"agg_attrs":agg_nve_dict,"ngroups":groups_num,"size_var":size_var,"size_mean":size_mean}
    
def calc_gran_layer(disp_row):
    """Compute the "granularity layer" measures for a display row.

    Builds the row's group-by object and aggregated frame with
    ``get_grouping_by_row`` and passes both to ``get_grouping_measures``.
    """
    grouped, aggregated = get_grouping_by_row(disp_row)
    measures = get_grouping_measures(grouped, aggregated)
    return measures

#    df=get_filtered_df(disp_row["project_id"],json.loads(row["filtering"]))
#    return df[KEYS].apply(get_data_column_measures).to_dict()

    
    