# Results EDA

In [None]:
import os
import json
import base64
from math import sqrt, pi

import ijson
import numpy as np
import pandas as pd
#import plotly.express as px
#import plotly.graph_objects as go

import multiprocessing

from tqdm.notebook import tqdm

# from kcmc_instance import KCMC_Instance

## REFERENCE Data Preparation

In [None]:
INPUT_PREFIX = '/data/dynamodb_objs'

# Names (not full paths) of every '.json' entry directly under INPUT_PREFIX,
# in sorted order so that later chunking is reproducible between runs.
files_list = sorted(
    name for name in os.listdir(INPUT_PREFIX) if name.endswith('.json')
)
len(files_list)

In [None]:
class _UnsupportedAttributeError(AttributeError, NotImplementedError):
    """Lookup of an attribute that ``open_and_replace`` does not provide.

    It is an ``AttributeError`` so that ``hasattr`` and three-argument
    ``getattr`` behave normally on ``open_and_replace`` instances, and also a
    ``NotImplementedError`` so that existing ``except NotImplementedError``
    handlers keep catching it.
    """


class open_and_replace(object):
    """Context manager that reads a file and patches its text on the fly.

    Every chunk returned by ``read`` and every line produced by iteration has
    the ``to_replace`` substitutions applied, in the dictionary's order, using
    plain ``str.replace`` (no regular expressions).

    NOTE(review): substitutions are applied to each chunk / line separately,
    so a pattern that is split across two ``read(size)`` calls (or across two
    lines) is not replaced.

    Parameters
    ----------
    filepath : str
        Path of the file to open.
    readmode : str
        Mode forwarded to ``open``; defaults to ``'r'``.
    to_replace : dict or None
        Mapping ``pattern -> replacement``. Keys and values are converted with
        ``str``. ``None`` (the default) means no replacement.
    """

    def __init__(self, filepath, readmode='r', to_replace=None):
        if to_replace is None:
            to_replace = {}
        self.to_replace = {str(key): str(value)
                           for key, value in to_replace.items()}
        self.filepath = filepath
        self.readmode = readmode

    def replace_inline(self, data):
        """Return ``data`` with every configured replacement applied in order."""
        for key, value in self.to_replace.items():
            data = data.replace(key, value)
        return data

    def __enter__(self):
        # The file is opened lazily, only when the ``with`` block is entered.
        self.fileobj = open(self.filepath, self.readmode)
        return self

    def __exit__(self, *exc):
        # Returns None, so an exception raised inside the block propagates.
        self.fileobj.close()

    def read(self, *args, **kwargs):
        """Read from the underlying file (same arguments) and patch the result."""
        data = self.fileobj.read(*args, **kwargs)
        return self.replace_inline(data)

    def __iter__(self):
        """Yield the lines of the underlying file, each one patched."""
        for data in self.fileobj:
            yield self.replace_inline(data)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails. The data model requires an
        # AttributeError here; a bare NotImplementedError would make
        # hasattr()/getattr(obj, name, default) blow up instead of reporting
        # that the attribute is missing.
        raise _UnsupportedAttributeError(
            f'CLASS <open_and_replace> DOES NOT HAVE METHOD <{attr}>')

In [None]:
# Literal text repairs applied to the raw file content before it reaches the
# JSON parser: every key found in the text is replaced by its value (plain
# substring replacement, see open_and_replace).
json_fix = {
    # Bare Infinity token -> quoted string
    ": Infinity, ": ': "Infinity", ',
    # Stray double quote between a comma and 'p42 / 'p94
    ''',"'p42''': ''','p42''',
    ''',"'p94''': ''','p94''',
    # Misspelled null
    ': nuln, ': ':null, ',
    # Period where a comma separator belongs after 0.0
    '": 0.0. ': '": 0.0, ',
    # Stray double quote before '1')":
    ''',"'1')": ''': ''','1')": ''',
    # Stray double quote between a comma and 'i364',
    ''',"'i364',''': ''','i364',''',
    # Doubled double quote before ('i228',
    '''""('i228', ''': '''"('i228', ''',
    # Stray double quote between a comma and 'i168',
    ''',"'i168',''': ''','i168','''
}

def _parse_file(file):
    """Parse one result file under INPUT_PREFIX into a result dictionary.

    The file is streamed twice through ``open_and_replace`` with the
    ``json_fix`` repairs applied: first event by event with ``ijson.parse``,
    then with ``ijson.kvitems`` to read the ``time`` section.

    Parameters
    ----------
    file : str
        File name relative to INPUT_PREFIX.

    Returns
    -------
    dict
        Starts as ``{'variables', 'time', 'file', 'gurobi_logs'}`` and is
        extended with the copied scalar fields and with every key of the
        decoded ``json_solution`` document. ``variables['y']`` is the fixed
        marker ``'trimmed out'``; ``variables['x']`` maps each tree number to
        the ``np.packbits`` bytes (as a list) of that tree's values taken in
        ascending item order. ``gurobi_logs`` is the stripped concatenation of
        the log items, or ``None`` when there were none.

    Raises
    ------
    NotImplementedError
        When the stream contains a prefix that no rule below handles.
    """
    # Prefixes that are dropped without further inspection.
    skipped = {
        None, '', ' ', 'variables',
        'K', 'M', 'kcmc_k', 'kcmc_m',
        'serial', 'seed', 'time_limit', 'queued',
        'results_single_flow', 'results_multi_flow'
    }
    # Prefixes whose value is stored under the same key, unchanged.
    # 'K' and 'M' are intentionally absent: they are in `skipped`, which is
    # tested first, so a rule for them here could never fire.
    # NOTE(review): confirm that K and M are really not wanted in the output.
    copied = {
        'threads', 'gurobi_model_fingerprint',
        'binary_variables', 'solutions_count', 'node_count',
        'status_code', 'status', 'mip_gap',
        'gurobi_model_type', 'instance_key'
    }
    # Prefixes whose value is stored under the same key after float().
    as_float = {
        'gurobi_runtime', 'simplex_iterations_count',
        'coverage_density', 'communication_density'
    }

    source = INPUT_PREFIX+"/"+file

    # RESULTS BUFFER
    # Always accessed through `result[...]`: the json_solution update below may
    # replace any of these entries.
    result = {'variables': {'y': 'trimmed out', 'x': {}},
              'time': {'wall': None, 'setup': {'model': None}},
              'file': file, 'gurobi_logs': ''}

    # MAIN PARSING
    with open_and_replace(source, to_replace=json_fix) as f:
        for prefix, event, value in ijson.parse(f):

            # IGNORED
            if prefix in skipped:
                continue

            # GUROBI LOGS: the container event is skipped, items are appended
            if prefix == 'gurobi_logs':
                continue
            if prefix == 'gurobi_logs.item':
                if isinstance(value, list):
                    for piece in value:
                        result['gurobi_logs'] += piece
                elif value is not None:
                    result['gurobi_logs'] += value
                continue

            # GUROBI DEFAULT RESULT FORMAT: a JSON document embedded as a
            # string, merged into the top level of the result
            if prefix == 'json_solution':
                result.update(json.loads(value))
                continue

            # VARIABLE Y is not kept
            if prefix.startswith('variables.y'):
                continue

            # VARIABLE X
            if prefix.startswith('variables.x'):
                if prefix == 'variables.x':
                    continue
                key = prefix[len('variables.x.'):]
                if ', ' in key:
                    # Two-part key: item label, then tree number, each quoted
                    item, tree = key.split("', '")
                    item = int(item.split('i')[1])
                    tree = int(tree.split("'")[0])
                else:
                    # Single-part key: everything belongs to tree 0
                    tree = 0
                    item = int(key.split('i')[1])
                per_tree = result['variables']['x'].setdefault(tree, {})
                # NOTE(review): int() truncates toward zero, so a value such
                # as 0.999 is stored as 0 - confirm values are exactly integral.
                per_tree[item] = 0 if value is None else int(abs(value))
                continue

            # DETAILED RUNTIMES are handled by the second pass below
            if prefix.startswith('time'):
                continue

            # SIMPLE COPY DATA
            if prefix in copied:
                result[prefix] = value
                continue
            if prefix in as_float:
                result[prefix] = float(value)
                continue

            # UNMAPPED KEY ERROR
            raise NotImplementedError(' | '.join(map(str, [file, prefix, event, value])))

    # CLOSEUP OF THE X VARIABLE: one packed bit per item, in ascending item order
    result['variables']['x'] = {
        tree: np.packbits([bool(items[pos]) for pos in sorted(items)]).tolist()
        for tree, items in result['variables']['x'].items()
    }

    # CLOSEUP OF THE GUROBI LOGS
    logs = result['gurobi_logs']
    result['gurobi_logs'] = logs.strip() if len(logs) != 0 else None

    # DETAILED TIMING: every stored figure is `pair[1] - pair[0]`
    def elapsed(pair):
        return pair[1]-pair[0]

    with open_and_replace(source, to_replace=json_fix) as f:
        for key, value in ijson.kvitems(f, 'time'):
            if key == 'wall':
                result['time']['wall'] = elapsed(value)
                continue
            # Every other key is treated as the "setup" section
            for skey, svalue in value.items():
                if skey == 'model':
                    result['time']['setup']['model'] = elapsed(svalue)
                else:
                    result['time']['setup'][skey] = {
                        sskey: elapsed(ssvalue)
                        for sskey, ssvalue in svalue.items()
                    }

    # RETURNS
    return result

def parse_file(file):
    """Run ``_parse_file`` on *file*, naming the file when parsing fails.

    Any exception is printed next to the file name and then re-raised, so a
    single bad file aborts the run.
    """
    try:
        parsed = _parse_file(file)
    except Exception as error:
        print(file, str(error))
        # Return None here instead of raising to skip unparsable files; the
        # caller already ignores None results.
        raise error
    return parsed

In [None]:
if files_list:
    CHUNKSIZE = 100

    def chunks(lst, n):
        """Split ``lst`` into consecutive slices of at most ``n`` elements."""
        return [lst[i:i + n] for i in range(0, len(lst), n)]

    # parse_file re-raises on failure, so the pool must be torn down on the
    # error path too; the context manager terminates the workers on exit.
    with multiprocessing.Pool() as pool:
        for chunknum, files_chunk in enumerate(chunks(files_list, CHUNKSIZE)):
            filename = f'/data/parsed_results/{chunknum}.pq'
            if os.path.exists(filename):
                print(chunknum, filename)
                # Comment the line below to overwrite existing parsed results
                continue

            # Results arrive in completion order; None marks a skipped file.
            rows = [
                parsed
                for parsed in tqdm(pool.imap_unordered(parse_file, files_chunk),
                                   total=len(files_chunk))
                if parsed is not None
            ]
            if not rows:
                # Nothing parsed: there is no 'instance_key' column to sort by,
                # so no file is written and the chunk is retried on next run.
                print(chunknum, filename, 0)
                continue

            # Format the DataFrame: every column is stored as text, with
            # missing values written as the empty string.
            df = pd.DataFrame(rows).sort_values('instance_key').reset_index(drop=True)
            df = df.apply(lambda col: col.fillna('').astype(str))
            df.to_parquet(filename)
            print(chunknum, filename, len(df))

## PREPROCESSING Data Preparation

In [None]:
# Tab-separated preprocessor output without a header row
prep = pd.read_csv('results/preprocessor/optimizer_dinic.csv', header=None, sep='\t')
prep.columns = ['instance_key', 'K', 'M',
                'prep_heuristic', 'prep_runtime_us', 'prep_valid',
                'prep_size', 'prep_compression_rate', 'prep_result']

# Columns are replaced (astype / plain assignment) rather than written through
# `.loc[:, col] = ...`: since pandas 2.0 the `.loc` form sets values in place
# and keeps the old column dtype, so the conversions would not take effect.
int_cols = ['K', 'M', 'prep_runtime_us', 'prep_size']
prep = prep.astype({**{col: int for col in int_cols},
                    'prep_compression_rate': float})

# Prefix the key with 'KCMC_' and turn its spaces and semicolons into underscores
prep['instance_key'] = 'KCMC_' + (prep['instance_key']
                                  .str.replace(' ', '_', regex=False)
                                  .str.replace(';', '_', regex=False))
# Microseconds -> seconds
prep['prep_runtime'] = prep['prep_runtime_us'] / 1_000_000
# Boolean flag: True only where the raw value is exactly 'OK'
prep['prep_valid'] = prep['prep_valid'] == 'OK'

prep.to_parquet('/data/preprocessing.parquet')

len(prep), prep.columns

In [None]:
# Second and third underscore-separated fields of the instance key
key_parts = prep['instance_key'].str.split('_')
prep['pois'] = key_parts.str[1]
prep['sensors'] = key_parts.str[2]

group_cols = ['pois', 'sensors', 'K', 'M']

# Mean validity, size and runtime per (pois, sensors, K, M), restricted to the
# heuristics whose name starts with 'ff_dinic'
ff_dinic = prep[prep['prep_heuristic'].str.startswith('ff_dinic')]
(ff_dinic
    [group_cols + ['prep_runtime', 'prep_valid', 'prep_size']]
    .groupby(group_cols)
    .mean()
    .reset_index(drop=False)
    .sort_values(group_cols)
)[group_cols + ['prep_valid', 'prep_size', 'prep_runtime']]