## 2RPFS Problem (TWCT objective) - Validate individual solutions

Before running this notebook, please run notebook 0.1.

In [None]:
import pandas as pd
import numpy as np
import os, fnmatch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
import glob
import seaborn as sns
import gzip
import matplotlib.style as style
from matplotlib.path import Path
from matplotlib.patches import BoxStyle

%matplotlib inline

In [None]:
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

### Locate the consolidated result file

In [None]:
# Folder holding the consolidated results, resolved relative to the current working directory.
resultfolder = os.path.join(os.getcwd(), 'results', 'consolidated')
# Gzip-compressed pickle with all RPFS/TWCT results (presumably written by notebook 0.1 - confirm).
rpfs_file = os.path.join(resultfolder, 'RPFS_TWCT_all_results.pkl.gz')

### Create the output folder 

In [None]:
# All outputs live under <cwd>/results/consolidated; graphs and tables get their own subfolders.
outputfolder = os.path.join(os.getcwd(), 'results', 'consolidated')
outputfolder_graph = os.path.join(outputfolder, 'graphs')
outputfolder_table = os.path.join(outputfolder, 'tables')
# Create each subfolder (and any missing parent) only when it is not there yet.
for _folder in (outputfolder_graph, outputfolder_table):
    if not os.path.exists(_folder):
        os.makedirs(_folder)
#print('Saving files on folder: ' + outputfolder)

### Load the consolidated result file (gzip-compressed pickle)

In [None]:
# Robust PFSP Budget solutions only.
# NOTE: read_pickle executes arbitrary code from the file; only load files you produced yourself.
df_rpfs = (
    pd.read_pickle(rpfs_file)
    .drop(columns=['executionId'])  # the execution id is not used by this analysis
    .reset_index()                  # move the stored index back into regular columns
)

**Robust dataframe: calculating new fields.**

In [None]:
# A run counts as optimal only if the solver flagged it optimal, the solution was validated and the
# gap is numerically zero. This must run BEFORE 'gap' is rescaled to a percentage further below.
df_rpfs['optimal'] = df_rpfs['is_optimal'] & df_rpfs['validated'] & (df_rpfs['gap'] <= 1e-8)
# Time limits in seconds: 7200 is applied to n < 15 and twice that to n >= 15 (see next statement).
df_rpfs['time_limit'] = 7200.0
df_rpfs['time_limit_2'] = 7200.0 * 2
# Cap the master-problem time at the limit matching the instance size; the two 0/1 masks select
# exactly one of the two capped values per row.
df_rpfs['mp_total_time'] = (df_rpfs['n'] < 15).astype(int) * np.minimum(df_rpfs['mp_total_time'], df_rpfs['time_limit']) + (df_rpfs['n'] >= 15).astype(int) * np.minimum(df_rpfs['mp_total_time'], df_rpfs['time_limit_2'])
# Total time = (capped) master-problem time + subproblem time.
df_rpfs['time'] = df_rpfs['mp_total_time'] + df_rpfs['sp_total_time']
# From here on 'gap' is expressed as a percentage.
# NOTE: re-running this cell without reloading df_rpfs multiplies the gap by 100 again.
df_rpfs['gap'] = df_rpfs['gap'] * 100.0
# The validated worst-case cost is exposed under the name used by the tables.
df_rpfs['RobCost_worstcase'] = df_rpfs['wct_validation']
df_rpfs = df_rpfs.rename(columns={"budget_Gamma": "RobCost_Gamma"})

In [None]:
# Show the last four rows, including the columns derived in the previous cell.
df_rpfs.tail(4)

In [None]:
# Column names, dtypes and non-null counts of the result frame.
df_rpfs.info()

### Checking the Robust PFSP Budget solutions dataframe

In [None]:
# Show the first two rows as a quick sanity check.
df_rpfs.head(2)

# Tables

Replace model names with the name used in table presentation:

In [None]:
# Drop the plain 'hybrid' runs. The explicit copy makes the filtered frame independent, so the
# column assignment below cannot be lost on (or warn about) a slice of the original frame.
df_rpfs = df_rpfs[(df_rpfs['model'] != 'hybrid')].copy()
# Map internal model identifiers to the names used in the tables. The result is assigned back
# instead of calling `df_rpfs['model'].replace(..., inplace=True)`: that form mutates a temporary
# Series and, with pandas Copy-on-Write, leaves df_rpfs unchanged (the warning about it would be
# hidden by the warnings filter set in the import cell).
df_rpfs['model'] = df_rpfs['model'].replace({
    'hybrid-liao-you': 'Liao-You-Hybrid', 'hybrid-wilson': 'Wilson-Hybrid', 'hybrid-manne': 'Manne-Hybrid',
    'liao-you': 'Liao-You', 'manne': 'Manne',
    'tba': 'TBA', 'ts2': 'TS2', 'ts3': 'TS3', 'wagner-wst2': 'WST2', 'wilson': 'Wilson'})

Obtain list of C&CG models, instance types

In [None]:
# Distinct model names (after renaming) and instance types; both lists drive the table loops below.
model_list = df_rpfs['model'].unique().tolist()
instance_type_list = df_rpfs['instance_type'].unique().tolist()
print(model_list)
print(instance_type_list)

Add a new column containing the instance size as string

In [None]:
# Preview only: list the distinct '<n>x<m>' size labels without storing them yet.
# df_temp is an alias of df_rpfs here, not a copy.
df_temp = df_rpfs
(df_temp['n'].astype(str) + 'x' + df_temp['m'].astype(str)).unique()

In [None]:
# List the available columns before building the index.
df_rpfs.columns

In [None]:
# Add the '<n>x<m>' size label as a string column; assign() appends it as the last column.
df_temp = df_rpfs.assign(instance_size=df_rpfs['n'].astype(str) + 'x' + df_rpfs['m'].astype(str))
# Index by model plus the fields that identify one instance/parameter combination.
# NOTE: this cell cannot be re-run as-is, because afterwards 'n' and 'm' live in the index.
df_rpfs = df_temp.set_index(['model', 'n', 'm', 'alpha', 'seq', 'RobCost_Gamma', 'instance_type'])
df_rpfs

Treating errors in the `gap` column

In [None]:
# Distribution of the percentage gap before negative values are clamped.
df_rpfs['gap'].describe()

In [None]:
# Keep just the fields needed to investigate suspicious gaps.
check_columns = ['model', 'n', 'm', 'alpha', 'seq', 'RobCost_Gamma', 'instance_name', 'gap', 'wct', 'RobCost_worstcase', 'lb']
df_check = df_rpfs.reset_index()[check_columns]
# Export the runs whose (percentage) gap is clearly negative so they can be inspected by hand.
negative_gap_file = os.path.join(os.getcwd(), 'results', 'negative_gap_list.csv')
df_check[df_check['gap'] < -1e-5].to_csv(negative_gap_file)

In [None]:
# Clamp negative gaps to zero. Series.clip is the vectorized equivalent of applying
# np.maximum(x, 0.0) element by element; NaN values are left untouched either way.
df_rpfs['gap'] = df_rpfs['gap'].clip(lower=0.0)

In [None]:
# Distribution of the percentage gap after clamping; the minimum should now be >= 0.
df_rpfs['gap'].describe()

In [None]:
def calculate_perc_best_performance(df, model):
    """Percentage of commonly solved instances on which ``model`` is strictly the fastest.

    Only runs flagged ``optimal`` are considered. For every instance (identified by
    n, m, alpha, seq, RobCost_Gamma and instance_type) solved to optimality by ``model``
    AND by at least one other model, the time of ``model`` is compared with the best
    (minimum) time among the other models.

    Parameters
    ----------
    df : pandas.DataFrame
        Result frame; after ``reset_index()`` it must expose the columns ``model``,
        ``optimal``, ``time`` and the six instance key columns.
    model : str
        Name of the model being evaluated.

    Returns
    -------
    float
        Percentage (rounded to 2 decimals) of compared instances where ``model`` is
        strictly faster, or ``np.nan`` when there is nothing to compare (``model`` has
        no optimal run, no other model has one, or they share no instance).
    """
    group_columns = ['n', 'm', 'alpha', 'seq', 'RobCost_Gamma', 'instance_type']
    df_solved = df.reset_index()
    df_solved = df_solved[df_solved['optimal'] == True]
    is_model = df_solved['model'] == model
    df_model = df_solved.loc[is_model, group_columns + ['time']]
    df_others = df_solved.loc[~is_model, group_columns + ['time']]
    if df_model.empty or df_others.empty:
        return np.nan

    # Best (smallest) time achieved by any other model on each instance.
    df_best = (df_others.groupby(by=group_columns)['time'].min()
               .rename('time_best').reset_index())
    df_compare = df_model.merge(df_best, on=group_columns, how='inner')
    # Explicit guard: without it the percentage below would be a 0/0 division that only
    # yields NaN through a numpy RuntimeWarning.
    if df_compare.empty:
        return np.nan

    time_wins = (df_compare['time'] < df_compare['time_best']).sum()
    return np.round(100.0 * time_wins / len(df_compare.index), 2)

In [None]:
def calculate_perc_solved(df, model, instance_type = None, instance_size = None, alpha = None):
    """Percentage of the runs of ``model`` that are flagged ``optimal``.

    The runs can optionally be narrowed down by ``instance_type``, ``instance_size``
    and/or ``alpha``; a filter left as ``None`` is not applied. Returns the percentage
    rounded to 2 decimals, or ``np.nan`` when no run matches the selection.
    """
    rows = df.reset_index()
    selected = rows['model'] == model
    optional_filters = (
        ('instance_type', instance_type),
        ('instance_size', instance_size),
        ('alpha', alpha),
    )
    for column, wanted in optional_filters:
        if wanted is not None:
            selected = selected & (rows[column] == wanted)
    subset = rows[selected]
    total = len(subset.index)
    if total == 0:
        return np.nan
    solved = len(subset[subset['optimal'] == True].index)
    return np.round(100.0 * solved / total, 2)

In [None]:
# Avg. % Gap is the average percentage gap of solutions from instances not solved to optimality
def calculate_avg_perc_gap(df, model):
    """Mean ``gap`` over the runs of ``model`` that are NOT flagged ``optimal``.

    Means of at least 0.01 are rounded to 2 decimals; smaller means (and NaN, when the
    model has no non-optimal run) are returned unrounded.
    """
    rows = df.reset_index()
    unsolved = rows[(rows['model'] == model) & (rows['optimal'] == False)]
    mean_gap = unsolved['gap'].mean()
    return np.round(mean_gap, 2) if mean_gap >= 1e-2 else mean_gap

In [None]:
def calculate_median_time(df, model, time_col_name):
    """Median of column ``time_col_name`` over all runs of ``model``, rounded to 2 decimals."""
    rows = df.reset_index()
    values = rows.loc[rows['model'] == model, time_col_name]
    return np.round(values.median(), 2)

In [None]:
def calculate_avg_time(df, model, time_col_name):
    """Mean of column ``time_col_name`` over all runs of ``model``, rounded to 2 decimals."""
    rows = df.reset_index()
    values = rows.loc[rows['model'] == model, time_col_name]
    return np.round(values.mean(), 2)

In [None]:
def calculate_std(df, model, col_name):
    """Sample standard deviation of ``col_name`` over the runs of ``model``.

    For ``col_name == 'gap'`` only runs not flagged ``optimal`` are used. Values of at
    least 0.01 are rounded to 2 decimals; smaller values and NaN are returned unrounded.
    """
    rows = df.reset_index()
    selected = rows['model'] == model
    if col_name == 'gap':
        selected = selected & (rows['optimal'] == False)
    spread = rows.loc[selected, col_name].std()
    return np.round(spread, 2) if spread >= 1e-2 else spread

In [None]:
import numpy as np
import scipy.stats

def mean_confidence_interval(df, model, col_name, confidence=0.95):
    """Student-t confidence interval for the mean of ``col_name`` over the runs of ``model``.

    For ``col_name == 'gap'`` only runs not flagged ``optimal`` are used. The interval is
    mean +/- sem * t.ppf((1 + confidence) / 2, n - 1), with both bounds rounded to
    2 decimals.

    Returns the string ``'[lb, ub]'``, or ``'-'`` when a bound is NaN (for instance when
    fewer than two observations are available).
    """
    rows = df.reset_index()
    selected = rows['model'] == model
    if col_name == 'gap':
        selected = selected & (rows['optimal'] == False)
    sample = 1.0 * np.array(rows.loc[selected, col_name])
    center = np.mean(sample)
    std_err = scipy.stats.sem(sample)
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., len(sample) - 1)
    lower = np.round(center - half_width, 2)
    upper = np.round(center + half_width, 2)
    if np.isnan(lower) or np.isnan(upper):
        return '-'
    return '[{}, {}]'.format(lower, upper)

In [None]:
def calculate_median_iterations(df, model):
    """Median of the ``iterations`` column over all runs of ``model``, rounded to 2 decimals."""
    rows = df.reset_index()
    iterations = rows.loc[rows['model'] == model, 'iterations']
    return np.round(iterations.median(), 2)

In [None]:
def calculate_avg_iterations(df, model):
    """Mean of the ``iterations`` column over all runs of ``model``, rounded to 2 decimals."""
    rows = df.reset_index()
    iterations = rows.loc[rows['model'] == model, 'iterations']
    return np.round(iterations.mean(), 2)

In [None]:
def calculate_num_instances(df, model):
    """Number of rows of ``df`` that belong to ``model`` (optimal or not)."""
    rows = df.reset_index()
    return len(rows[rows['model'] == model].index)

### Let's determine the baseline results by obtaining the smallest valid objective values among all executed solution methods

In [None]:
# Build the reference ("baseline") solution per instance: the validated run with the smallest
# validated worst-case cost among the models that are NOT excluded below.
df_baseline = df_rpfs.copy().reset_index()
### df_baseline = df_baseline[(df_baseline['optimal'] == True) & (df_baseline['validated'] == True)]
# NOTE(review): 'Wilson' is excluded from the baseline here, while the section text further down
# speaks of a Wilson baseline - confirm which one is intended.
exclude_model_list = ['Wilson-Hybrid', 'Liao-You-Hybrid', 'Manne-Hybrid', 'Wilson']
df_baseline = df_baseline[~(df_baseline['model'].isin(exclude_model_list))]
# Keep validated runs with a strictly positive validated cost.
df_baseline = df_baseline[(df_baseline['validated'] == True) & (df_baseline['wct_validation'] > 0)]
# Columns that identify one instance/parameter combination. group_key is also read as a global
# by calculate_perc_valid_solutions.
group_key = ['n', 'm', 'alpha', 'seq', 'RobCost_Gamma', 'instance_type']  #   , 'ub_name', 'instance_name']
# Smallest validated cost per instance.
df_baseline_opt_value = df_baseline.groupby(by=group_key)[['wct_validation']].min().reset_index()
display(df_baseline_opt_value)
# Recover the full rows that attain that minimum; when several runs tie, keep the first one.
merge_key = group_key + ['wct_validation']
df_baseline_opt = df_baseline.merge(df_baseline_opt_value, on=merge_key, how='inner')
df_baseline_opt = df_baseline_opt.drop_duplicates(subset=merge_key, keep='first').sort_values(by=merge_key)
display(df_baseline_opt)
df_baseline_opt.to_csv(os.path.join(outputfolder_table, 'BestOptimalSolutions.csv'))

In [None]:
def calculate_perc_valid_solutions(df, model, instance_type, instance_size):
    """Percentage of the optimal, validated solutions of ``model`` that agree with the baseline.

    The runs of ``model`` for the given ``instance_type`` / ``instance_size`` are joined
    with the module-level ``df_baseline_opt`` on the module-level ``group_key``. A joined
    row is valid when the validated costs differ by at most ``EPS`` (1e-3), or when the
    model's cost is lower than the baseline's and both rows carry the same permutation.

    Side effect: the joined frame, including the diagnostic columns, is written to
    ``ValidSolutions_<model>_<instance_type>_<instance_size>.csv`` in ``outputfolder_table``.

    Returns
    -------
    float
        Percentage of valid rows, or ``np.nan`` when there is no row to compare.
        (Returning 0 in that case would read as "every solution is invalid"; NaN also
        matches the later redefinition of this function in this notebook.)
    """
    EPS = 1e-3
    df_base = df.reset_index()
    df_base = df_base[(df_base['instance_type'] == instance_type)]
    df_base = df_base[(df_base['instance_size'] == instance_size)]
    df_model = df_base[df_base['model'] == model]
    df_model_opt = df_model[(df_model['optimal'] == True) & (df_model['validated'] == True)]
    # After the merge, *_x columns come from the baseline and *_y columns from the model.
    df_join = df_baseline_opt.merge(df_model_opt, on=group_key, how='inner')
    num_solutions = len(df_join.index)
    df_join['obj_diff'] = df_join['wct_validation_y'] - df_join['wct_validation_x']
    df_join['obj_less_than'] = df_join['obj_diff'] < -EPS
    df_join['obj_greater_than'] = df_join['obj_diff'] > EPS
    same_permutation = df_join['permutation_x'].str.strip() == df_join['permutation_y'].str.strip()
    df_join['is_valid'] = (abs(df_join['obj_diff']) <= EPS) | (df_join['obj_less_than'] & same_permutation)
    df_join.to_csv(os.path.join(outputfolder_table, 'ValidSolutions_{}_{}_{}.csv'.format(model, instance_type, instance_size)))
    if num_solutions == 0:
        return np.nan
    df_valid = df_join[df_join['is_valid']]
    return 100.0 * len(df_valid.index) / num_solutions

## 1. First, let's create tables only with non-hybrid solution methods

## Table A1. # of correct solutions (Wilson-baseline) and performance per instance group and model

Model-wise Robust PFSP C&CG performance comparison, per instance group.

* % Best Performance is the percentage of instances solved to optimality where the model achieved shorter execution time, when compared to the other models; 

* % Solved contains the percentage of instances solved within the time limit; 

* Avg. % Gap is the average percentage gap of solutions from instances not solved to optimality; 

* Avg. time and Std. dev. of time are the mean and standard deviation in solution time (s), respectively;

* Avg. iterations and Std. dev. of iterations are the mean and standard deviation of the number of iterations performed.

#### Let's assume Wilson model as baseline (i.e. its optimal solutions are correct)

In [None]:
per_instance_stats = dict()
# Section 1 only compares the non-hybrid models, so the hybrid ones are filtered out once up front.
exclude_model_list = ['Wilson-Hybrid', 'Liao-You-Hybrid', 'Manne-Hybrid']
model_list_reduced = [name for name in model_list if name not in exclude_model_list]
df_base = df_rpfs.reset_index()
df_base = df_base[~(df_base['model'].isin(exclude_model_list))]
instance_size_list = ['10x2', '10x3', '10x4', '10x5', '15x5']  # df_itype['instance_size'].unique().tolist()
for instance_type in instance_type_list:  # group by instance type and size
    df_itype = df_base[(df_base['instance_type'] == instance_type)]
    for instance_size in instance_size_list:
        df_instance = df_itype[df_itype['instance_size'] == instance_size]
        for model in model_list_reduced:
            # One dict of statistics per (instance type, instance size, model); the key order
            # below fixes the row order of the tables built from this dict.
            per_instance_stats[(instance_type, instance_size, model)] = {
                '% Best Performance': calculate_perc_best_performance(df_instance, model),
                '% Solved': calculate_perc_solved(df_base, model, instance_type, instance_size),
                'Avg. % gap': calculate_avg_perc_gap(df_instance, model),
                'Std. dev. of % gap': calculate_std(df_instance, model, 'gap'),
                '95% CI of % gap': mean_confidence_interval(df_instance, model, 'gap'),
                'Avg. time': calculate_avg_time(df_instance, model, 'time'),
                'Std. dev. of time': calculate_std(df_instance, model, 'time'),
                'Avg. MP time': calculate_avg_time(df_instance, model, 'mp_total_time'),
                'Avg. SP time': calculate_avg_time(df_instance, model, 'sp_total_time'),
                'Avg. iterations': calculate_avg_iterations(df_instance, model),
                'Std. dev. of iterations': calculate_std(df_instance, model, 'iterations'),
                # NOTE(review): calculate_num_instances counts ALL rows of the model, not only the
                # solved ones - confirm that the label matches the intent.
                '# instances solved': calculate_num_instances(df_instance, model),
                '% valid solutions': calculate_perc_valid_solutions(df_base, model, instance_type, instance_size),
            }

In [None]:
# https://stackoverflow.com/questions/57606801/pandas-style-options-to-latex

In [None]:
pd.set_option('display.max_columns', None)
# Keep the 'ying' instances of sizes 10x2 and 10x3; keys are (instance type, size, model).
allowed_keys = [key for key in per_instance_stats.keys() if key[0] == 'ying' and key[1] in ['10x2', '10x3']]
per_instance_stats1 = {key: per_instance_stats[key] for key in allowed_keys}
df_table3a = pd.DataFrame.from_dict(per_instance_stats1)
# Drop the outermost column level (the instance type), leaving (size, model).
df_table3a.columns = df_table3a.columns.droplevel()
df_table3a

In [None]:
pd.set_option('display.max_columns', None)
# Keep the 'ying' instances of sizes 10x4 and 10x5; keys are (instance type, size, model).
allowed_keys = [key for key in per_instance_stats.keys() if key[0] == 'ying' and key[1] in ['10x4', '10x5']]
per_instance_stats2 = {key: per_instance_stats[key] for key in allowed_keys}
df_table3b = pd.DataFrame.from_dict(per_instance_stats2)
# Drop the outermost column level (the instance type), leaving (size, model).
df_table3b.columns = df_table3b.columns.droplevel()
df_table3b

#### Export table to Tableau, after melt

In [None]:
# One column per (instance type, size, model); the instance type level is dropped right away.
df_table3 = pd.DataFrame.from_dict(per_instance_stats)
df_table3.columns = df_table3.columns.droplevel()
# Transpose once so that every (size, model) pair becomes a row and every statistic a column.
df_table3_by_model = df_table3.transpose()
value_vars = df_table3_by_model.columns
# reset_index() exposes the two unnamed index levels as 'level_0' (size) and 'level_1' (model).
df_melt_table3 = pd.melt(df_table3_by_model.reset_index(), id_vars=['level_0', 'level_1'], value_vars=value_vars)
df_melt_table3['Instance size'] = df_melt_table3['level_0']
df_melt_table3['Model'] = df_melt_table3['level_1']
df_melt_table3.to_excel(os.path.join(outputfolder_table, '2_twct_model_stats_per_instance_no_hybrid.xlsx'))

## 2. Now, we'll create a table with hybrid and non-hybrid solution methods

In [None]:
# Rebuild the baseline for section 2: same procedure as in section 1, but only 'Wilson' is
# excluded, so the hybrid models can now provide the reference solution as well.
# NOTE: this overwrites the globals df_baseline_opt / group_key used by calculate_perc_valid_solutions.
df_baseline = df_rpfs.copy().reset_index()
### df_baseline = df_baseline[(df_baseline['optimal'] == True) & (df_baseline['validated'] == True)]
exclude_model_list = ['Wilson']   ### ['Wilson-Hybrid', 'Liao-You-Hybrid', 'Manne-Hybrid', 'Wilson']
df_baseline = df_baseline[~(df_baseline['model'].isin(exclude_model_list))]
# Keep validated runs with a strictly positive validated cost.
df_baseline = df_baseline[(df_baseline['validated'] == True) & (df_baseline['wct_validation'] > 0)]
group_key = ['n', 'm', 'alpha', 'seq', 'RobCost_Gamma', 'instance_type']  #   , 'ub_name', 'instance_name']
# Smallest validated cost per instance.
df_baseline_opt_value = df_baseline.groupby(by=group_key)[['wct_validation']].min().reset_index()
display(df_baseline_opt_value)
# Recover the full rows that attain that minimum; when several runs tie, keep the first one.
merge_key = group_key + ['wct_validation']
df_baseline_opt = df_baseline.merge(df_baseline_opt_value, on=merge_key, how='inner')
df_baseline_opt = df_baseline_opt.drop_duplicates(subset=merge_key, keep='first').sort_values(by=merge_key)
display(df_baseline_opt)
#df_baseline_opt.to_csv(os.path.join(outputfolder_table, 'BestOptimalSolutions.csv'))

### 2.2. Grouped by instance size, with hybrid method

In [None]:
def calculate_perc_valid_solutions(df, model, instance_type, instance_size):
    """Percentage of the optimal, validated solutions of ``model`` that agree with the baseline.

    NOTE: this redefines (shadows) the function of the same name defined earlier in the notebook.

    The runs of ``model`` for ``instance_type`` / ``instance_size`` (size labels are
    stripped before comparison) are joined with the module-level ``df_baseline_opt`` on the
    module-level ``group_key``. A joined row is valid when the validated costs differ by
    at most ``EPS`` (1e-2), or when the model's cost is lower than the baseline's and both
    rows carry the same permutation.

    Side effect: the joined frame is written to
    ``2_ValidSolutions_<model>_<instance_type>_<instance_size>.csv`` in ``outputfolder_table``.

    Returns the percentage of valid rows, or ``np.nan`` when there is nothing to compare.
    """
    EPS = 1e-2
    rows = df.reset_index()
    rows = rows[rows['instance_type'] == instance_type]
    rows = rows[rows['instance_size'].str.strip() == instance_size]
    solved_by_model = rows[(rows['model'] == model) & (rows['optimal'] == True) & (rows['validated'] == True)]

    # After the merge, *_x columns come from the baseline and *_y columns from the model.
    df_join = df_baseline_opt.merge(solved_by_model, on=group_key, how='inner')
    num_solutions = len(df_join.index)

    df_join['obj_diff'] = df_join['wct_validation_y'] - df_join['wct_validation_x']
    df_join['obj_less_than'] = df_join['obj_diff'] < -EPS
    df_join['obj_greater_than'] = df_join['obj_diff'] > EPS
    within_tolerance = abs(df_join['obj_diff']) <= EPS
    same_permutation = df_join['permutation_x'].str.strip() == df_join['permutation_y'].str.strip()
    df_join['is_valid'] = within_tolerance | (df_join['obj_less_than'] & same_permutation)

    csv_name = '2_ValidSolutions_{}_{}_{}.csv'.format(model, instance_type, instance_size)
    df_join.to_csv(os.path.join(outputfolder_table, csv_name))

    if num_solutions <= 0:
        return np.nan
    num_valid = len(df_join[df_join['is_valid']].index)
    return 100.0 * num_valid / num_solutions

In [None]:
per_instance_stats = dict()
# Section 2 uses every model (hybrid and non-hybrid); the flattened frame is the same for all groups.
df_base = df_rpfs.reset_index()
for instance_type in instance_type_list:  # group by instance type and size
    df_itype = df_base[(df_base['instance_type'] == instance_type)]
    instance_size_list = df_itype['instance_size'].unique().tolist()   # ['10x2', '10x3', '10x4', '10x5', '15x5']
    for instance_size in instance_size_list:
        df_instance = df_itype[df_itype['instance_size'] == instance_size]
        for model in model_list:
            # One dict of statistics per (instance type, instance size, model); the key order
            # below fixes the row order of the tables built from this dict.
            per_instance_stats[(instance_type, instance_size, model)] = {
                '% Best Performance': calculate_perc_best_performance(df_instance, model),
                '% Solved': calculate_perc_solved(df_rpfs, model, instance_type, instance_size),
                'Avg. % gap': calculate_avg_perc_gap(df_instance, model),
                'Std. dev. of % gap': calculate_std(df_instance, model, 'gap'),
                '95% CI of % gap': mean_confidence_interval(df_instance, model, 'gap'),
                'Avg. time': calculate_avg_time(df_instance, model, 'time'),
                'Std. dev. of time': calculate_std(df_instance, model, 'time'),
                'Avg. MP time': calculate_avg_time(df_instance, model, 'mp_total_time'),
                'Avg. SP time': calculate_avg_time(df_instance, model, 'sp_total_time'),
                'Avg. iterations': calculate_avg_iterations(df_instance, model),
                'Std. dev. of iterations': calculate_std(df_instance, model, 'iterations'),
                # NOTE(review): calculate_num_instances counts ALL rows of the model, not only the
                # solved ones - confirm that the label matches the intent.
                '# instances solved': calculate_num_instances(df_instance, model),
                '% valid solutions': calculate_perc_valid_solutions(df_base, model, instance_type, instance_size),
            }

In [None]:
# https://stackoverflow.com/questions/57606801/pandas-style-options-to-latex

In [None]:
pd.set_option('display.max_columns', None)
# Keep the 'ying' instances of sizes 10x2 and 10x3; keys are (instance type, size, model).
allowed_keys = [key for key in per_instance_stats.keys() if key[0] == 'ying' and key[1] in ['10x2', '10x3']]
per_instance_stats1 = {key: per_instance_stats[key] for key in allowed_keys}
df_table3a = pd.DataFrame.from_dict(per_instance_stats1)
# Drop the outermost column level (the instance type), leaving (size, model).
df_table3a.columns = df_table3a.columns.droplevel()
df_table3a

In [None]:
pd.set_option('display.max_columns', None)
# Keep the 'ying' instances of sizes 10x4 and 10x5; keys are (instance type, size, model).
allowed_keys = [key for key in per_instance_stats.keys() if key[0] == 'ying' and key[1] in ['10x4', '10x5']]
per_instance_stats2 = {key: per_instance_stats[key] for key in allowed_keys}
df_table3b = pd.DataFrame.from_dict(per_instance_stats2)
# Drop the outermost column level (the instance type), leaving (size, model).
df_table3b.columns = df_table3b.columns.droplevel()
df_table3b

In [None]:
pd.set_option('display.max_columns', None)
# Keep the 'ying' instances of size 15x5; keys are (instance type, size, model).
allowed_keys = [(x, y, z) for (x, y, z) in per_instance_stats.keys() if (x == 'ying' and y in ['15x5'])]
per_instance_stats3 = { your_key: per_instance_stats[your_key] for your_key in allowed_keys }
df_table3c = pd.DataFrame.from_dict(per_instance_stats3)
# Drop the instance-type level so the table has the same (size, model) header as the tables for
# the smaller sizes. The previously commented-out statement took the columns of df_table3b (the
# wrong frame). The guard keeps the cell from failing when no 15x5 result exists, in which case
# the empty frame has a flat column index with no level to drop.
if isinstance(df_table3c.columns, pd.MultiIndex):
    df_table3c.columns = df_table3c.columns.droplevel()
df_table3c

#### Export table to Tableau, after melt

In [None]:
# One column per (instance type, size, model); the instance type level is dropped right away.
df_table3 = pd.DataFrame.from_dict(per_instance_stats)
df_table3.columns = df_table3.columns.droplevel()
# Transpose once so that every (size, model) pair becomes a row and every statistic a column.
df_table3_by_model = df_table3.transpose()
value_vars = df_table3_by_model.columns
# reset_index() exposes the two unnamed index levels as 'level_0' (size) and 'level_1' (model).
df_melt_table3 = pd.melt(df_table3_by_model.reset_index(), id_vars=['level_0', 'level_1'], value_vars=value_vars)
df_melt_table3['Instance size'] = df_melt_table3['level_0']
df_melt_table3['Model'] = df_melt_table3['level_1']
df_melt_table3.to_excel(os.path.join(outputfolder_table, '2_twct_model_stats_per_instance_with_hybrid.xlsx'))