**Copyright © 2018 University of Stirling**

# Processing Raw Output from Experiments on 3 Objectives

In [0]:
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import os
from tqdm import tqdm

Here we configure the path template and filename information for the data to be processed.

In [0]:
# Template for the folder holding the raw files of a single run; the
# {experiment}, {length} and {runid} placeholders are substituted at load time.
DATA_PATH = "/content/{experiment}.{length}.{runid}/"
FUNC_FILE_NAME = "FUN.tsv"
VAR_FILE_NAME = "VAR.tsv"
REEVAL_FILE_NAME = "FUN-reevaled.tsv"

# When False, the reevaluation data is ignored and only the original data, as
# output by the original runs of the experiment, is used. Plotting of the
# reevaluation data may still be turned off independently in the plotting
# script even if it is included here.
USING_REEVAL = True

# How many times each experiment was run.
# Run ids 0 .. NUM_RUNS-1 replace {runid} in the data path.
NUM_RUNS = 30

# Range of the number of variables (inclusive).
# Each value replaces {length} in the data path.
MIN_NUM_VARIABLES = 2
MAX_NUM_VARIABLES = 10

# Separator between the columns of a raw data line.
COL_SEPERATOR = ' '

# Folder the results are written to (created if it does not exist).
OUTPUT_PATH = "/content/antibiotic"

# One output Pandas dataframe is produced per entry and saved to the file
# named by "save-to". The key replaces {experiment} in the data path and
# "objectives" supplies the column headings of the objective values.
EXPERIMENTS = {
    "condor3d/condor3d/3d-100/results.both.fixed.length": {
        "save-to": "constrained.pkl",
        "objectives": (
            "failurerate",
            "totalantibiotic",
            "maximumconcentration",
        ),
    },
    "condor3d/condor3d/3d-1/results.both.fixed.length": {
        "save-to": "unconstrained.pkl",
        "objectives": (
            "failurerate",
            "totalantibiotic",
            "maximumconcentration",
        ),
    },
}

The following helper functions are used for processing the raw output from experiments.

In [0]:
# Converts one line of the raw FUN file into a list of objective values.
# Surrounding newlines and spaces are removed, the remainder is split on
# COL_SEPERATOR and every field becomes an np.float64,
# e.g. '0.0 150.0 \n' gives [0.0, 150.0]
def parse_function_file_line(fun_line):
    fields = fun_line.strip("\n\r ").split(COL_SEPERATOR)
    return list(map(np.float64, fields))

# Converts one line of the raw VAR file into a list of variable values.
# Every field (split on COL_SEPERATOR) becomes an np.int32 and the list is
# right-padded with zeros until it holds at least max_len entries; a line
# that already has max_len or more entries is returned unpadded,
# e.g. '58 36 18 29 9 \n' with max_len=10 gives [58 36 18 29 9 0 0 0 0 0]
def parse_variable_file_line(var_line, max_len=10):
    fields = var_line.strip("\n\r ").split(COL_SEPERATOR)
    values = list(map(np.int32, fields))
    while len(values) < max_len:
        values.append(0)
    return values

# Converts one line of the raw FUN-reevaled file into a single np.float64
# after removing the surrounding newlines and spaces,
# e.g. '0.011360000000000037 \n' gives 0.011360000000000037
def parse_reevaluation_file_line(reeval_line):
    cleaned = reeval_line.strip("\n\r ")
    return np.float64(cleaned)

# Returns the actual length of a treatment: the size of the span running from
# the first to the last entry that is greater than zero (zeros inside the span
# are counted), or 0 when no entry is greater than zero,
# e.g. [0, 30, 0, 0, 30, 0, 0, 0, 0, 0] is length 4
def find_actual_length(vars):
    positive_positions = [
        position for position, value in enumerate(vars) if value > 0
    ]
    if not positive_positions:
        return 0
    return positive_positions[-1] - positive_positions[0] + 1

This section loads the raw data and stores all data points in Pandas dataframes.

In [0]:
# Reads all candidate solutions of a single run and returns them as a list of
# rows laid out like the dataframe columns:
#   [max_len, actual_len, runid, x0 .. x{MAX_NUM_VARIABLES-1}, objectives...]
# followed by the reevaluated value when USING_REEVAL is set.
#   run_path -- folder of the run, ending with a path separator
#   length   -- number of variables of the run (stored as max_len)
#   runid    -- id of the run
def _read_run_rows(run_path, length, runid):
    file_names = [FUNC_FILE_NAME, VAR_FILE_NAME]
    if USING_REEVAL:
        file_names.append(REEVAL_FILE_NAME)

    rows = []
    handles = []
    try:
        for file_name in file_names:
            handles.append(open(run_path + file_name))
        # The files are read in lockstep, one candidate per line; zip stops
        # as soon as the shortest file is exhausted.
        for lines in zip(*handles):
            func_values = parse_function_file_line(lines[0])
            var_values = parse_variable_file_line(lines[1], MAX_NUM_VARIABLES)
            actual_length = find_actual_length(var_values)
            row = [length, actual_length, runid] + var_values + func_values
            if USING_REEVAL:
                row.append(parse_reevaluation_file_line(lines[2]))
            rows.append(row)
    finally:
        # Close whatever was opened, even if a later open or a parse failed.
        for handle in handles:
            handle.close()
    return rows


for experiment in EXPERIMENTS:

    # Get the experiment objectives to set the column headings
    objectives = EXPERIMENTS[experiment]["objectives"]

    experiment_path = DATA_PATH.replace("{experiment}", experiment)

    # Column layout of the dataframe storing all output candidates:
    # integer bookkeeping/variable columns first, then the objective values.
    int_columns = ['max_len', 'actual_len', 'runid'] \
        + ['x' + str(i) for i in range(MAX_NUM_VARIABLES)]
    float_columns = list(objectives)
    if USING_REEVAL:
        float_columns.append("failurerate_reeval")

    # Rows are collected in a plain list and turned into a dataframe once at
    # the end. Growing a dataframe one row at a time with df.loc copies the
    # data on every insert (quadratic in the number of rows) and is not
    # guaranteed to keep the column dtypes declared up front.
    rows = []

    # Loops over each length of this experiment
    # [MIN_NUM_VARIABLES to MAX_NUM_VARIABLES] and [0, NUM_RUNS)
    for length in tqdm(range(MIN_NUM_VARIABLES, MAX_NUM_VARIABLES+1), desc=experiment):
        for runid in range(NUM_RUNS):
            run_path = experiment_path.replace("{runid}", str(runid)) \
                                      .replace("{length}", str(length))
            rows.extend(_read_run_rows(run_path, length, runid))

    # Builds the dataframe in one go and applies the intended dtypes
    # (int32 for lengths, run id and variables; float64 for objective values).
    df = pd.DataFrame(rows, columns=int_columns + float_columns)
    column_dtypes = {column: np.int32 for column in int_columns}
    column_dtypes.update({column: np.float64 for column in float_columns})
    df = df.astype(column_dtypes)

    # stores the data frame
    EXPERIMENTS[experiment]['dataframe'] = df

    # prints the number of solutions in the experiment
    print('Loaded {0} candidate solutions from experiment "{1}"'.format(
        len(df), experiment))

The data is saved in pickle format to the designated output directory:

In [0]:
# Creates the output folder together with any missing parent folders
# (os.mkdir alone fails when a parent of OUTPUT_PATH does not exist yet).
if not os.path.isdir(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

# Writes the dataframe of every experiment to the pickle file named by the
# experiment's "save-to" entry, inside OUTPUT_PATH.
for experiment in EXPERIMENTS:
    save_to = EXPERIMENTS[experiment]["save-to"]
    df = EXPERIMENTS[experiment]['dataframe']
    file_name = "{0}/{1}".format(OUTPUT_PATH, save_to)
    df.to_pickle(file_name)
    print('Saved pickle file "{0}"'.format(file_name))