# Special Case Model: Segment-based Model with Sorting Dependencies (SMS) with Reconfiguration Costs

### Imports

In [1]:
# import pyomo
from pyomo.environ import *
from pyomo.opt import SolverFactory

# import pandas
import pandas as pd

# import helper
from helper.print import print_result
from helper.export import export_config

### Load benchmark, system calibration and workload data

In [2]:
# --- benchmark measurements ---
# runtimes of the benchmark queries, addressable by the full scan configuration
df_perf = pd.read_csv('../data/benchmark/runtimes.csv').set_index(
    ['ORDER_BY', 'ENCODING', 'SCAN_COLUMN', 'SELECTIVITY', 'INDEX', 'SCAN_TYPE'])
# memory consumption of segments
df_memory = pd.read_csv('../data/benchmark/memory_consumption.csv').set_index(
    ['ORDER_BY', 'ENCODING', 'COLUMN', 'CHUNK_ID'])
# memory consumption of indexes
df_memory_index = pd.read_csv('../data/benchmark/memory_consumption_index.csv').set_index(
    ['ORDER_BY', 'ENCODING', 'COLUMN', 'CHUNK_ID'])

# --- calibration data ---
# per-encoding penalty for scans on a position list (hyrise)
df_poslist_scan_penalty = pd.read_csv('../data/calibration/poslist_scan_penalty.csv').set_index(
    ['ENCODING'])
# penalty per storage device, encoding and index setting
df_storage_penalty = pd.read_csv('../data/calibration/storage_penalty.csv').set_index(
    ['STORAGE', 'ENCODING', 'INDEX'])

# --- workload ---
# workload definition; keeps its default positional index (rows are addressed via .iloc)
df_workload = pd.read_csv('../data/workloads/workload_1/workload.csv')
# chunk access statistics per query and chunk
df_chunk_access = pd.read_csv('../data/workloads/workload_1/chunk_access.csv').set_index(
    ['QUERY_ID', 'CHUNK'])

### Base Config

In [7]:
CONFIG_PATH = '../data/base_config/'


def load_config(file_name, path=None):
    """Load a segment configuration from ``<path><file_name>.csv``.

    Args:
        file_name: name of the CSV file without the ``.csv`` suffix.
        path: directory prefix (concatenated as-is, so it must end with a
            path separator); defaults to ``CONFIG_PATH``.

    Returns:
        The file content as a DataFrame indexed by ``(CHUNK, COLUMN)``.
    """
    directory = CONFIG_PATH if path is None else path
    csv_file = directory + file_name + '.csv'
    return pd.read_csv(csv_file).set_index(['CHUNK', 'COLUMN'])

def to_df_config(model):
    """Convert the selected entries of ``model.X`` into a configuration DataFrame.

    Every index of ``model.X`` is a tuple
    ``(chunk, column, encoding, order, index, storage)``. An entry counts as
    selected when its value rounds to 1. Entries without a value (``None``)
    are skipped instead of raising ``TypeError`` in ``round``.

    Args:
        model: object whose ``X`` attribute maps index tuples to variables
            exposing a ``.value`` attribute.

    Returns:
        DataFrame indexed by ``(CHUNK, COLUMN)`` with the columns
        ``ENCODING``, ``SORT``, ``INDEX`` and ``STORAGE``.
    """
    selected = []
    sort_columns = {}

    config = model.X
    for item in config:
        value = config[item].value
        # a variable without a value cannot be part of the selected configuration
        if value is None or round(value) != 1:
            continue
        selected.append(item)
        # order id 1 marks the column (item[1]) recorded as sort column of chunk item[0]
        if item[3] == 1:
            sort_columns[item[0]] = item[1]

    # NOTE(review): set_sorted_column_for_segment is defined outside this view;
    # presumably it rewrites the SORT field of each line -- confirm.
    selected = set_sorted_column_for_segment(selected, sort_columns)
    df = pd.DataFrame.from_records(
        selected, columns=["CHUNK", "COLUMN", "ENCODING", "SORT", "INDEX", "STORAGE"])
    df.set_index(['CHUNK', 'COLUMN'], inplace=True)
    return df

def change_base_config(df, model):
    """Replace the applied base configuration and rebuild the modification costs.

    Args:
        df: new base configuration; it is stored in the module-level
            ``BASE_CONFIG``, which ``modification_costs_init`` reads.
        model: model whose ``R`` parameter is reconstructed afterwards.
    """
    # Without the global declaration the assignment would only bind a local
    # name and the reconstruction below would still see the old configuration.
    global BASE_CONFIG
    BASE_CONFIG = df
    model.R.reconstruct()

## Model

In [10]:
def scan_order_penalty(scan_order, encoding):
    """Return the cost factor for a scan depending on its position in the query.

    In Hyrise, scans that operate on a position list are significantly slower
    than scans executed in isolation. The first scan (``scan_order == 0``) is
    therefore charged with factor 1; every later scan is charged with the
    encoding-specific ``PENALTY`` measured during model calibration.
    """
    if scan_order == 0:
        return 1
    return df_poslist_scan_penalty.loc[encoding]['PENALTY']

def storage_penalty(encoding, index, storage):
    """Return the calibrated ``PENALTY`` for the given storage device, encoding and index setting."""
    lookup_key = (storage, encoding, index)
    return df_storage_penalty.loc[lookup_key]['PENALTY']

def index_value(scan, i):
    """Return the index setting that is effective for ``scan``.

    Indexes can only speed up the first scan, so every scan with a
    ``SCAN_ORDER`` other than 0 is treated as not indexed (0); the first scan
    keeps the requested setting ``i``.
    """
    if scan['SCAN_ORDER'] == 0:
        return i
    return 0

def performance(s, e, o, i):
    """Return the measured ``TIME`` of an isolated scan for a column configuration.

    Args:
        s: positional row of the scan in ``df_workload``.
        e: encoding of the scanned column.
        o: ordering configuration.
        i: requested index setting (reduced via ``index_value``).
    """
    scan = df_workload.iloc[s]
    lookup_key = (
        o,
        e,
        scan['SCAN_COLUMN'],
        scan['SELECTIVITY'],
        index_value(scan, i),
        scan['SCAN_TYPE'],
    )
    return df_perf.loc[lookup_key]['TIME']
    
def update_storage_budgets(storage_budgets, model):
    """Apply new storage budgets to the model.

    The number of storage units cannot be modified this way; only the budget
    parameter ``SB`` and the constraint ``MemoryBudgetConstraint`` are rebuilt.

    Args:
        storage_budgets: data handed to ``model.SB.reconstruct``.
        model: model that owns ``SB`` and ``MemoryBudgetConstraint``.
    """
    budgets = model.SB
    budgets.reconstruct(storage_budgets)
    # the constraint has to be rebuilt after the budgets
    budget_constraint = model.MemoryBudgetConstraint
    budget_constraint.reconstruct()

In [12]:
# Pyomo model that holds all sets, parameters, variables and constraints declared below
model = ConcreteModel()

# memory budget per storage device, keyed by storage id
# (compared against summed SIZE_IN_BYTES values in memory_budget_rule)
STORAGE_BUDGET = {0:4_000_000_000, 1:3_000_000_000}

# base configuration, which is currently applied (read by modification_costs_init)
BASE_CONFIG = load_config('base_config')

# modification costs per segment
# ALPHA is the weight of the summed modification costs in the objective (see runtime)
ALPHA = 500_000_000
COSTS_ENCODING_CHANGE = 2
COSTS_CREATE_INDEX = 1.5
COSTS_SORTING_CHANGE = 2.5
COSTS_STORAGE_CHANGE = 1

# set of order configurations (index level 'ORDER_BY' of df_perf)
O = df_perf.index.levels[0].unique()

# set of encodings (index level 'ENCODING' of df_perf)
E = df_perf.index.levels[1].unique()

# set of columns (index level 'SCAN_COLUMN' of df_perf)
N = df_perf.index.levels[2].unique()

# set of chunks (index level 'CHUNK_ID' of df_memory)
M = df_memory.index.levels[3].unique()

# set of scan operations defined in the workload (positional row numbers of df_workload)
model.S = Set(initialize=range(0, df_workload.shape[0]))

# set of storage devices (0 .. number of budgets - 1)
model.B = Set(initialize=range(0, len(STORAGE_BUDGET)))

# storage budget per storage device; mutable so it can be changed via update_storage_budgets
model.SB = Param(model.B, within=NonNegativeIntegers, initialize=STORAGE_BUDGET, mutable=True)

# set of index settings of a segment: 0 = no index, 1 = index
model.I = Set(initialize=[0,1], within=Binary)

# decision variable X[m, n, e, o, i, b]: the selected combination of
# encoding e, ordering o, index setting i and storage b for segment (m, n)
model.X = Var(M, N, E, O, model.I, model.B, within=Binary)

# decision variable Y[m, o]: the ordering configuration o selected for chunk m
model.Y = Var(M, O, within=Binary)

# decision variable Z[m, n, e, i, b]: the encoding e, index setting i and
# storage b selected for segment (m, n)
model.Z = Var(M, N, E, model.I, model.B, within=Binary)

def segment_access_init(model, m, n, s):
    """Return the share of scan ``s`` that is attributed to segment ``(m, n)``.

    The cost of a single segment scan is approximated from the overall column
    scan cost, proportionally to the accessed chunks: a segment that is hit
    gets ``1 / sum(ACCESSED)`` over all chunks of the scan's query. Segments
    of other columns, or of chunks the query does not access, get 0.
    """
    scan = df_workload.iloc[s]
    query_id = scan['QUERY_ID']

    if scan['SCAN_COLUMN'] != n:
        return 0
    if df_chunk_access.loc[(query_id, m)]['ACCESSED'] == 0:
        return 0

    accessed_total = df_chunk_access.loc[(query_id, slice(None))]['ACCESSED'].sum()
    return 1 / accessed_total


model.A = Param(M, N, model.S, within=NonNegativeReals, initialize=segment_access_init)

def performance_init(model, m, n, e, o, i, b):
    """Return the workload cost of segment ``(m, n)`` under one configuration.

    For every scan of the workload the measured runtime is weighted by the
    scan frequency, the segment's access share ``A[m, n, s]``, the scan
    factor, the position-list penalty and the storage penalty; the weighted
    runtimes are summed over all scans in ``model.S``.
    """
    def weighted_scan_cost(s):
        scan = df_workload.iloc[s]
        return scan['FREQUENCY'] * \
            performance(s, e, o, i) * \
            model.A[m, n, s] * \
            scan['SCAN_FACTOR'] * \
            scan_order_penalty(scan['SCAN_ORDER'], e) * \
            storage_penalty(e, i, b)

    return sum(weighted_scan_cost(s) for s in model.S)


model.C = Param(M, N, E, O, model.I, model.B, within=NonNegativeReals, initialize=performance_init, mutable=True)

def memory_init(model, m, n, e, o, i):
    """Return the memory consumption of segment ``(m, n)`` in bytes.

    The size is looked up for ordering ``o`` and encoding ``e``; when an index
    is configured (``i`` is not 0) the size of the index is added on top.
    """
    lookup_key = (o, e, n, m)
    size = df_memory.loc[lookup_key]['SIZE_IN_BYTES']
    if i != 0:
        size = size + df_memory_index.loc[lookup_key]['SIZE_IN_BYTES']
    return size
# MC[m, n, e, o, i]: memory consumption of a segment, filled by memory_init
model.MC = Param(M, N, E, O, model.I, within=NonNegativeReals, initialize=memory_init)

def valid_index_encoding_config(model, e, i):
    """Return 1 if an index on encoding ``e`` can be implemented in the database, else 0.

    For hyrise only indices on dictionary encoded columns are allowed, so the
    only combination flagged with 1 is index setting 1 on encoding id 0
    (presumably the dictionary encoding -- confirm against the benchmark data).
    Every combination with ``i == 0`` is flagged with 0 as well.
    """
    return 1 if (i == 1 and e == 0) else 0
# VC[e, i]: binary validity flag per encoding and index setting, filled by valid_index_encoding_config
model.VC = Param(E, model.I, within=Binary, initialize=valid_index_encoding_config)

def modification_costs_init(model, m, n, e, o, i, b):
    """Return the cost of moving segment ``(m, n)`` from the base config to a new one.

    The currently applied settings are read from ``BASE_CONFIG``; each of the
    following differences adds its cost constant:

    * a different encoding -> ``COSTS_ENCODING_CHANGE``
    * base ``SORT`` equals ``n`` and differs from ordering ``o``
      -> ``COSTS_SORTING_CHANGE``
    * no index in the base config but ``i > 0`` -> ``COSTS_CREATE_INDEX``
    * a different storage device -> ``COSTS_STORAGE_CHANGE``
    """
    current = BASE_CONFIG.loc[(m, n)]

    # (condition, cost) pairs in the order in which the costs are accumulated
    charges = (
        (current['ENCODING'] != e, COSTS_ENCODING_CHANGE),
        (current['SORT'] == n and o != current['SORT'], COSTS_SORTING_CHANGE),
        (current['INDEX'] == 0 and i > 0, COSTS_CREATE_INDEX),
        (current['STORAGE'] != b, COSTS_STORAGE_CHANGE),
    )

    total = 0
    for applies, cost in charges:
        if applies:
            total += cost
    return total
# R[m, n, e, o, i, b]: modification costs relative to BASE_CONFIG, filled by
# modification_costs_init; declared mutable and reconstructed in change_base_config
model.R = Param(M, N, E, O, model.I, model.B, within=NonNegativeReals, initialize=modification_costs_init, mutable=True)

### Objective

def runtime(m):
    """Objective rule: workload cost plus ALPHA-weighted modification cost.

    Both terms sum ``X * coefficient`` over every combination of chunk,
    column, encoding, ordering, index setting and storage device; the first
    term uses the scan costs ``C``, the second the modification costs ``R``.
    """
    def selected_total(coefficient):
        # sum of the coefficients of all selected configurations
        return sum(m.X[chunk_id, column_id, encoding_id, ordering_id, index_config, storage_id] *
                   coefficient[chunk_id, column_id, encoding_id, ordering_id, index_config, storage_id]
                   for chunk_id in M
                   for column_id in N
                   for encoding_id in E
                   for ordering_id in O
                   for index_config in m.I
                   for storage_id in m.B)

    scan_costs = selected_total(m.C)
    modification_costs = selected_total(m.R)
    return scan_costs + ALPHA * modification_costs
# objective built by runtime(); no `sense` is passed, so Pyomo's default
# objective sense applies (minimization according to the Pyomo documentation)
model.Obj = Objective(rule=runtime)

### Constraints 

def memory_budget_rule(m, b):
    """Constraint rule: the segments placed on storage ``b`` fit into its budget.

    Sums the memory consumption ``MC`` of every configuration selected in
    ``X`` for storage device ``b`` and bounds it by ``SB[b]``.
    """
    used_memory = sum(
        m.X[chunk_id, column_id, encoding_id, ordering_id, index_config, b] *
        m.MC[chunk_id, column_id, encoding_id, ordering_id, index_config]
        for chunk_id in M
        for column_id in N
        for encoding_id in E
        for ordering_id in O
        for index_config in m.I
    )
    return used_memory <= m.SB[b]
# one memory budget constraint per storage device in model.B
model.MemoryBudgetConstraint = Constraint(model.B, rule=memory_budget_rule)

def single_sorting_column_per_chunk_rule(m, i):
    """Constraint rule: exactly one ordering configuration is selected for chunk ``i``."""
    selected_orderings = [m.Y[i, ordering_id] for ordering_id in O]
    return sum(selected_orderings) == 1
# one ordering constraint per chunk in M
model.SingleSortColumnConstraints = Constraint(M, rule=single_sorting_column_per_chunk_rule)

def one_encoding_sorting_indexing_config_active_per_segment_rule(m, i, j):
    """Constraint rule: exactly one ``Z`` entry is active for segment ``(i, j)``.

    The entries range over every combination of encoding, index setting and
    storage device, so each segment gets exactly one such combination.
    """
    active = [
        m.Z[i, j, encoding_id, index_config, storage_id]
        for encoding_id in E
        for index_config in m.I
        for storage_id in m.B
    ]
    return sum(active) == 1
# one constraint per segment (chunk in M, column in N)
model.SingleEncodingConstraints = Constraint(M, N, rule=one_encoding_sorting_indexing_config_active_per_segment_rule)

# all segments of a chunk have to be stored on the same storage medium
# def single_storage_medium_per_chunk_rule(m, i, j):
#     return sum(m.Z[i, 1, encoding_id, index_config, storage_id] * storage_id
#                for encoding_id in E
#                for index_config in m.I
#                for storage_id in m.B) == \
#            sum(m.Z[i, j, encoding_id, index_config, storage_id] * storage_id
#                for encoding_id in E
#                for index_config in m.I
#                for storage_id in m.B)       
# model.SingleStorageConstraints = Constraint(M, N, rule=single_storage_medium_per_chunk_rule)

def linear_x(m, j, n, e, o, i, b):
    """Linearization of ``X = Z * Y`` (lower bound): ``X >= Y + Z - 1``.

    Forces ``X`` to 1 as soon as both the chunk ordering ``Y[j, o]`` and the
    segment configuration ``Z[j, n, e, i, b]`` are selected.
    """
    lower_bound = m.Y[j, o] + m.Z[j, n, e, i, b] - 1
    return m.X[j, n, e, o, i, b] >= lower_bound
# one linear_x constraint per index combination of X
model.linearX = Constraint(M, N, E, O, model.I, model.B, rule=linear_x)

def linear_y(m, j, n, e, o, i, b):
    """Linearization of ``X = Z * Y`` (upper bound): ``X`` is bounded by ``Y[j, o]``."""
    chunk_ordering = m.Y[j, o]
    return m.X[j, n, e, o, i, b] <= chunk_ordering
# one linear_y constraint per index combination of X
model.linearY = Constraint(M, N, E, O, model.I, model.B, rule=linear_y)

def linear_z(m, j, n, e, o, i, b):
    """Linearization of ``X = Z * Y`` (upper bound): ``X`` is bounded by ``Z[j, n, e, i, b]``."""
    segment_config = m.Z[j, n, e, i, b]
    return m.X[j, n, e, o, i, b] <= segment_config
# one linear_z constraint per index combination of X
model.linearZ = Constraint(M, N, E, O, model.I, model.B, rule=linear_z)

def valid_index_configuration_rule(m, i, j, k, l, n):
    """Constraint rule: an indexed configuration needs a valid encoding.

    Indexed columns need to have dictionary encoding: the configuration of
    segment ``(i, j)`` with encoding ``k``, ordering ``l``, index setting 1
    and storage ``n`` may only be selected if ``VC[k, 1]`` allows it.
    """
    index_allowed = m.VC[k, 1]
    return m.X[i, j, k, l, 1, n] <= index_allowed
# one constraint per (chunk, column, encoding, ordering, storage); the index
# setting is fixed to 1 inside the rule and is therefore not part of the index sets
model.IndexesOnDictColumnsConstraints = Constraint(M, N, E, O, model.B, rule=valid_index_configuration_rule)

# Solving

In [13]:
# create the Gurobi solver interface through Pyomo's SolverFactory
solver = SolverFactory('gurobi')
# sets the solver option 'threads' to 16
solver.options['threads'] = 16

# solve the model and report the outcome
result = solver.solve(model)
# print_result comes from helper.print; NOTE(review): the meaning of the
# trailing False flag is not visible in this file -- confirm in the helper
print_result(result, model, False)

Solving for budget;
  Storage: 0     Storage Size: 50000000 
  Storage: 1     Storage Size: 75000000 

Result: optimal (walltime: 63.7961s)
Objective: 4093353240.5196285
Memory consumption:
  0: 49.98806403104718 MB
  1: 38.788684 MB

Storage: 0
CHUNK               driver_id           latitude            longitude           timestamp           status              
0                   S - RunLength       - - FoR-SIMD        - - FoR-SIMD        - - Dictionary      - - LZ4             
1                   S - RunLength       - - FoR-SIMD        - - LZ4             - - Dictionary      - - LZ4             
2                   - - Dictionary      S - FoR-SIMD        - - FoR-SIMD        - - Dictionary      - - LZ4             
3                   - - Dictionary      S - FoR-SIMD        - - FoR-SIMD        - - Dictionary      - - LZ4             
4                   - - Dictionary      S - FoR-SIMD        - - FoR-SIMD        - - Dictionary      - - LZ4             
5                   - - Dict