## RPFS Problem (TWCT objective) - Data treatment of result files 

In [1]:
import pandas as pd
import numpy as np
import os, fnmatch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
import glob
import os
from pathlib import Path

%matplotlib inline

In [2]:
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

### List files in the output folder 

In [3]:
rootfolder = os.getcwd()
file_list = glob.glob(os.path.join(rootfolder, 'output') + '/*.csv', recursive=True)
file_list

['/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x2.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x3.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x4.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x5.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_15x5.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x2.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x3.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x4.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x4_brute.csv',
 '/public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x5.csv',
 '/public/doutorado

### Read all the CSV files 

In [4]:
# Alternative script to treat files with incorrect number of coluns or faulty lines
def alternative_csv_reader(filename, delimiter=',', header=0, names=None):
    with open(filename, 'r') as file:
        lines = file.readlines() 
        count = 1
        line_list = []
        num_columns = 20
        for line in lines:  # Strips the newline character 
            #print("line{}: {}".format(count, line.strip())) 
            nc = len(line.split(','))
            if 'executionId,' in line:
                #num_columns = nc
                print('Detected {0} columns in CSV file.'.format(nc))
            else:
                if 'none,' in line:
                    if nc == num_columns:
                        line_list.append(line)
                    elif nc > num_columns:  # treat strange truncated lines
                        line = line[line.rfind('none,'):]
                        nc = len(line.split(','))
                        if nc == num_columns:
                            print('WARN: truncating line {0}, for having more columns than expected.'.format(count))
                            line_list.append(line)
                        else:
                            print('WARN: Ignoring line {0}, since it has {1} columns, instead of {2}: '.format(count, nc, num_columns), line)
                    else:  # Ignore line
                        print('WARN: Ignoring line {0}: '.format(count), line)    
                elif len(line_list[-1].split(',')) < num_columns:  # current line is a continuation of the previous one
                    line_list[-1] = line_list[-1].replace('\n', '') + line
                    print('*** Treated line {0}: '.format(count), line_list[-1])
                else:  # Ignore line
                    print('WARN: Ignoring line {0}: '.format(count), line)
            count += 1
        # assert all lines have the same number of columns
        count = 1
        for line in line_list:
            nc = len(line.split(','))
            if nc != num_columns:
                print('ERROR: Line {0} has {1} columns, instead of {2}: '.format(count, nc, num_columns), line)
            count += 1
        text_data = StringIO(''.join(line_list))
        #print('line_list: ', str(line_list))
        #print('text_data: ', text_data)
        df = pd.read_csv(text_data, delimiter=delimiter, header=header, names=names)
        return df

### Process all CSV files and append all data to a single dataframe (one per solution method: Wilson, Wagner) 

In [5]:
%%time

dfdict = dict()
for filepath in file_list:
    print('Processing file ', filepath)
    try:
        df_ = pd.read_csv(filepath, delimiter=',', header=0, names=['executionId','ub_name','instance_name','alpha','n','m','budget_Gamma','wct','permutation','time_spent','time_to_best_sol','mp_total_time','sp_total_time','iterations','num_visited_solutions','num_improvements','is_optimal','validated','gap','lb','cost','wct_validation'])
    except:  # try alternative method to read csv lines
        df_ = alternative_csv_reader(filename, delimiter=',', header=0, names=['executionId','ub_name','instance_name','alpha','n','m','budget_Gamma','wct','permutation','time_spent','time_to_best_sol','mp_total_time','sp_total_time','iterations','num_visited_solutions','num_improvements','is_optimal','validated','gap','lb','cost','wct_validation'])
    filename = filepath[filepath.rfind(os.path.sep)+1:]
    modelname = filename[len('separation_wct_'):filename.find('_randomweights')]
    print('Read results for model ' + modelname)
    if modelname in dfdict:
        dfdict[modelname] = pd.concat([dfdict[modelname], df_])
    else:
        dfdict[modelname] = df_.copy()

Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x2.csv
Read results for model liao-you
Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x3.csv
Read results for model liao-you
Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x4.csv
Read results for model liao-you
Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_10x5.csv
Read results for model liao-you
Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_liao-you_randomweights_15x5.csv
Read results for model liao-you
Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x2.csv
Read results for model manne
Processing file  /public/doutorado_files/RPFS_Budget_TWCT/output/separation_wct_manne_randomweights_10x3.csv
Read results for model manne
Proc

In [6]:
dfdict

{'liao-you':    executionId          ub_name                      instance_name  alpha   n  \
 0         none   mip_separation   RB0101001_10_2_10_wct_inputs.txt     10  10   
 1         none   mip_separation   RB0101001_10_2_10_wct_inputs.txt     10  10   
 2         none   mip_separation   RB0101001_10_2_10_wct_inputs.txt     10  10   
 3         none   mip_separation   RB0101001_10_2_10_wct_inputs.txt     10  10   
 4         none   mip_separation   RB0101001_10_2_10_wct_inputs.txt     10  10   
 ..         ...              ...                                ...    ...  ..   
 8         none   mip_separation   RB0151001_15_5_10_wct_inputs.txt     10  15   
 9         none   mip_separation   RB0151001_15_5_10_wct_inputs.txt     10  15   
 10        none   mip_separation   RB0151002_15_5_10_wct_inputs.txt     10  15   
 11        none   mip_separation   RB0151002_15_5_10_wct_inputs.txt     10  15   
 12        none   mip_separation   RB0151002_15_5_10_wct_inputs.txt     10  15   
 
  

In [7]:
dfdict['manne'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2148 entries, 0 to 63
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   executionId            2148 non-null   object
 1   ub_name                2148 non-null   object
 2   instance_name          2148 non-null   object
 3   alpha                  2148 non-null   object
 4   n                      2148 non-null   object
 5   m                      2148 non-null   object
 6   budget_Gamma           2148 non-null   object
 7   wct                    2148 non-null   object
 8   permutation            2148 non-null   object
 9   time_spent             2148 non-null   object
 10  time_to_best_sol       2148 non-null   object
 11  mp_total_time          2148 non-null   object
 12  sp_total_time          2148 non-null   object
 13  iterations             2148 non-null   object
 14  num_visited_solutions  2148 non-null   object
 15  num_improvements       

### Remove duplicated header rows from both dataframes 

In [8]:
def find_invalid_values(df):
    all_invalid_values = set()
    for col in df:
        if col not in ['executionId','ub_name','instance_name','budget_Gamma','permutation','is_optimal','validated']:
            # 'alpha','n','m','cmax','time_spent','time_to_best_sol','iterations','num_visited_solutions','num_improvements','gap','lb','cost','cmax_dp'
            a = pd.to_numeric(df[col], errors='coerce')
            idx = a.isna()
            invalid_values = df.loc[idx][col].unique()
            all_invalid_values.update(invalid_values)
        #elif col in ['is_optimal','validated']
    print('Invalid values:', all_invalid_values)
    return all_invalid_values

In [9]:
for key, df in dfdict.items():
    dfdict[key] = df[(df['executionId'] != 'executionId')]
    print(key, dfdict[key].dtypes)

liao-you executionId               object
ub_name                   object
instance_name             object
alpha                      int64
n                          int64
m                          int64
budget_Gamma             float64
wct                      float64
permutation               object
time_spent               float64
time_to_best_sol         float64
mp_total_time            float64
sp_total_time            float64
iterations                 int64
num_visited_solutions      int64
num_improvements           int64
is_optimal                object
validated                 object
gap                      float64
lb                       float64
cost                     float64
wct_validation           float64
dtype: object
manne executionId              object
ub_name                  object
instance_name            object
alpha                    object
n                        object
m                        object
budget_Gamma             object
wct                  

### Convert column types from object 

In [10]:
def convert_column_types(df):
    for col in df:
        if col in ['alpha','n','m','wct','budget_Gamma','time_spent','time_to_best_sol','iterations','num_visited_solutions','num_improvements','gap','lb','cost','wct_validation', 'mp_total_time', 'sp_total_time', 'seq']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif col in ['is_optimal','validated']:
            df[col] = df[col].astype('bool')
    return df

In [11]:
%%time
for key, df in dfdict.items():
    dfdict[key] = convert_column_types(df)
    print(key, dfdict[key].dtypes)

liao-you executionId               object
ub_name                   object
instance_name             object
alpha                      int64
n                          int64
m                          int64
budget_Gamma             float64
wct                      float64
permutation               object
time_spent               float64
time_to_best_sol         float64
mp_total_time            float64
sp_total_time            float64
iterations                 int64
num_visited_solutions      int64
num_improvements           int64
is_optimal                  bool
validated                   bool
gap                      float64
lb                       float64
cost                     float64
wct_validation           float64
dtype: object
manne executionId               object
ub_name                   object
instance_name             object
alpha                      int64
n                          int64
m                          int64
budget_Gamma             float64
wct           

### Trim existing string columns 

In [12]:
def trim_all_columns(df):
    """
    Trim whitespace from ends of each value across all series in dataframe
    """
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)

In [13]:
for key, df in dfdict.items():
    dfdict[key] = trim_all_columns(df)
    print(key)

liao-you
manne
tba
ts2
ts3
wagner-wst2
wilson


### Include a column with the name of the underlying C&CG MILP Model

In [14]:
for key, df in dfdict.items():
    dfdict[key]['model'] = key

### Include a column with the name of the instance type (ying or tail)

In [15]:
for key, df in dfdict.items():
    dfdict[key]['instance_type'] = df['instance_name'].apply(lambda x: 'tail' if (x[0:5] == 'tail') else 'ying')

### Fix budget_Gamma paremeter value on Wilson model, `n=10`, `m=2`

In [16]:
for key, df in dfdict.items():
    if key == 'wilson':
        dfdict[key]['budget_Gamma'] = df.apply(lambda x: ((x['budget_Gamma']*x['m']*x['n'])/100.0) if (x['m'] == 2 and x['n'] == 10) else x['budget_Gamma'], axis=1)

### Include a column with the budget parameter value in %

In [17]:
for key, df in dfdict.items():
    dfdict[key]['Gamma%'] = 100.0 * df['budget_Gamma'] / (df['n'] * df['m'])

### Concatenate dataframes 

In [18]:
df = pd.concat(list(dfdict.values()))
df.head(4)

Unnamed: 0,executionId,ub_name,instance_name,alpha,n,m,budget_Gamma,wct,permutation,time_spent,...,num_improvements,is_optimal,validated,gap,lb,cost,wct_validation,model,instance_type,Gamma%
0,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,2.0,64014.2,7 9 1 6 10 4 3 8 5 2,59.330391,...,1.0,True,True,3.409849e-16,64014.2,64014.2,64014.2,liao-you,ying,10.0
1,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,4.0,65550.0,7 9 1 6 10 4 3 8 5 2,8.589128,...,1.0,True,True,0.0,65550.0,65550.0,65550.0,liao-you,ying,20.0
2,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,6.0,66175.1,7 9 1 6 10 4 3 8 5 2,8.714498,...,1.0,True,True,2.199002e-16,66175.1,66175.1,66175.1,liao-you,ying,30.0
3,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,8.0,66617.5,7 9 1 6 10 4 3 8 5 2,8.276973,...,1.0,True,True,0.0,66617.5,66617.5,66617.5,liao-you,ying,40.0


### Fix instance names 

The original instance names, in the instance file zip, were assembled incorrectly.

The problem lies in the alpha percentage. We are now going to fix this issue.

In [19]:
df['seq'] = df['instance_name'].apply(lambda x: x[x.find('_')-2:x.find('_')])
df['n_str'] = df['n'].astype(str).str.zfill(3)
df['alpha_str'] = df['alpha'].astype(str)

In [20]:
df['instance_name'] = 'RB' + df['n_str'] + df['alpha_str'] + df['seq'] + '_' + df['instance_name'].apply(lambda x: x[x.find('_')+1:])
df.drop(columns=['n_str', 'alpha_str'], inplace=True)
df.head(4)

Unnamed: 0,executionId,ub_name,instance_name,alpha,n,m,budget_Gamma,wct,permutation,time_spent,...,is_optimal,validated,gap,lb,cost,wct_validation,model,instance_type,Gamma%,seq
0,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,2.0,64014.2,7 9 1 6 10 4 3 8 5 2,59.330391,...,True,True,3.409849e-16,64014.2,64014.2,64014.2,liao-you,ying,10.0,1
1,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,4.0,65550.0,7 9 1 6 10 4 3 8 5 2,8.589128,...,True,True,0.0,65550.0,65550.0,65550.0,liao-you,ying,20.0,1
2,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,6.0,66175.1,7 9 1 6 10 4 3 8 5 2,8.714498,...,True,True,2.199002e-16,66175.1,66175.1,66175.1,liao-you,ying,30.0,1
3,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,10,10,2,8.0,66617.5,7 9 1 6 10 4 3 8 5 2,8.276973,...,True,True,0.0,66617.5,66617.5,66617.5,liao-you,ying,40.0,1


### Round columns containing time (in seconds) 

In [21]:
df['time_spent'] = df['time_spent'].round(2)
df['time_to_best_sol'] = df['time_to_best_sol'].round(2)
df['mp_total_time'] = df['mp_total_time'].round(2)
df['sp_total_time'] = df['sp_total_time'].round(2)

### Sort data according to model, instance_name, alpha, n, m and Gamma and set index

In [22]:
print('Sorting dataset...')
df = df.sort_values(['model', 'n', 'm', 'alpha', 'seq', 'budget_Gamma', 'instance_type'])
display(df.dtypes)
df = df.set_index(['model', 'n', 'm', 'alpha', 'seq', 'budget_Gamma', 'instance_type'])
display(df.head(6))

Sorting dataset...


executionId               object
ub_name                   object
instance_name             object
alpha                      int64
n                          int64
m                          int64
budget_Gamma             float64
wct                      float64
permutation               object
time_spent               float64
time_to_best_sol         float64
mp_total_time            float64
sp_total_time            float64
iterations               float64
num_visited_solutions    float64
num_improvements         float64
is_optimal                  bool
validated                   bool
gap                      float64
lb                       float64
cost                     float64
wct_validation           float64
model                     object
instance_type             object
Gamma%                   float64
seq                       object
dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,executionId,ub_name,instance_name,wct,permutation,time_spent,time_to_best_sol,mp_total_time,sp_total_time,iterations,num_visited_solutions,num_improvements,is_optimal,validated,gap,lb,cost,wct_validation,Gamma%
model,n,m,alpha,seq,budget_Gamma,instance_type,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
liao-you,10,2,10,1,2.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,64014.2,7 9 1 6 10 4 3 8 5 2,59.33,59.33,11.75,0.14,2.0,2.0,1.0,True,True,3.409849e-16,64014.2,64014.2,64014.2,10.0
liao-you,10,2,10,1,4.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,65550.0,7 9 1 6 10 4 3 8 5 2,8.59,8.59,11.56,0.18,2.0,2.0,1.0,True,True,0.0,65550.0,65550.0,65550.0,20.0
liao-you,10,2,10,1,6.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,66175.1,7 9 1 6 10 4 3 8 5 2,8.71,8.71,11.69,0.19,2.0,2.0,1.0,True,True,2.199002e-16,66175.1,66175.1,66175.1,30.0
liao-you,10,2,10,1,8.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,66617.5,7 9 1 6 10 4 3 8 5 2,8.28,8.28,11.33,0.2,2.0,2.0,1.0,True,True,0.0,66617.5,66617.5,66617.5,40.0
liao-you,10,2,10,1,10.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,67028.2,7 9 1 6 10 4 3 8 5 2,8.68,8.68,11.68,0.19,2.0,2.0,1.0,True,True,2.171014e-16,67028.2,67028.2,67028.2,50.0
liao-you,10,2,10,1,12.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,67334.7,7 9 1 6 10 4 3 8 5 2,8.53,8.53,11.52,0.19,2.0,2.0,1.0,True,True,2.161132e-16,67334.7,67334.7,67334.7,60.0


### Find missing results, for a given value of alpha, n and m

For a given group of alpha, n, m and budget_Gamma, there should be 10 results.

First we will build a dataframe with the instances list and all required budget values.

In [23]:
data = []
rootfolder = os.getcwd()
file_list = Path(os.path.join(rootfolder, 'instances', 'robust')).rglob('*.txt')
file_set = set()
for path in file_list:
    instance_path = path.name
    #print('instance_path: ' + instance_path)    
    if '.txt' not in instance_path:
        continue
    if 'tail' in instance_path:
        if instance_path[:instance_path.find('_')] not in ['tail001', 'tail002', 'tail003', 'tail004', 'tail005', 'tail006', 'tail007', 'tail008', 'tail009', 'tail010']:
            #print(instance_path[:instance_path.find('_')])
            continue
    instance_name = instance_path[instance_path.rfind(os.path.sep)+1:]
    file_set.add(instance_name)
print(file_set, file_set)
for instance_name in file_set:
    #print('instance_name: ' + instance_name)
    seq = instance_name[instance_name.find('_')-2:instance_name.find('_')]
    info = instance_name[instance_name.find('_')+1:]
    n = info[:info.find('_')]
    info = info[info.find('_')+1:]
    m = info[:info.find('_')]
    info = info[info.find('_')+1:]
    alpha = info[:info.find('_')]
    instance_type = 'ying'
    if 'tail' in instance_name:
        instance_type = 'tail'
    else:
        if int(n) > 20:
            continue
    for gamma in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
        budget_gamma = int(gamma) * (int(m) * int(n)) / 100.0
        for model in list(dfdict.keys()):
            data.append([model, seq, alpha, int(n), int(m), budget_gamma, instance_type])
df_instances = pd.DataFrame(data, columns=['model', 'seq', 'alpha', 'n', 'm', 'budget_Gamma', 'instance_type'])
for col in df_instances:
    if col in ['alpha','n','m','budget_Gamma']:
        df_instances[col] = pd.to_numeric(df_instances[col], errors='coerce')
display(df_instances.dtypes)
df_instances = df_instances.set_index(['model', 'n', 'm', 'alpha', 'seq', 'budget_Gamma', 'instance_type'])
display(df_instances)

{'RB0101008_010_004_50_wct_inputs.txt', 'RB1504010_150_2_40_wct_inputs.txt', 'RB2003009_200_2_30_wct_inputs.txt', 'tail009_20_5_30_wct_inputs.txt', 'RB1503003_150_2_30_wct_inputs.txt', 'RB0101005_10_2_10_wct_inputs.txt', 'RB0101010_010_003_30_wct_inputs.txt', 'RB0103003_10_10_30_wct_inputs.txt', 'RB0201010_20_2_10_wct_inputs.txt', 'RB1505007_150_2_50_wct_inputs.txt', 'RB0502006_50_2_20_wct_inputs.txt', 'RB2005005_200_2_50_wct_inputs.txt', 'RB1505010_150_2_50_wct_inputs.txt', 'RB0105005_10_5_50_wct_inputs.txt', 'RB0101004_010_003_50_wct_inputs.txt', 'RB0101002_010_003_50_wct_inputs.txt', 'RB0101009_010_003_20_wct_inputs.txt', 'RB0202001_20_5_20_wct_inputs.txt', 'RB1003006_100_2_30_wct_inputs.txt', 'RB2005006_200_2_50_wct_inputs.txt', 'RB1003010_100_2_30_wct_inputs.txt', 'RB0101002_10_10_10_wct_inputs.txt', 'RB0101010_010_003_40_wct_inputs.txt', 'RB0155002_15_5_50_wct_inputs.txt', 'RB0201006_20_2_10_wct_inputs.txt', 'RB0504004_50_2_40_wct_inputs.txt', 'RB0203009_20_2_30_wct_inputs.txt', 

model             object
seq               object
alpha              int64
n                  int64
m                  int64
budget_Gamma     float64
instance_type     object
dtype: object

model,n,m,alpha,seq,budget_Gamma,instance_type
liao-you,10,4,50,08,4.0,ying
manne,10,4,50,08,4.0,ying
tba,10,4,50,08,4.0,ying
ts2,10,4,50,08,4.0,ying
ts3,10,4,50,08,4.0,ying
...,...,...,...,...,...,...
tba,10,5,30,01,50.0,ying
ts2,10,5,30,01,50.0,ying
ts3,10,5,30,01,50.0,ying
wagner-wst2,10,5,30,01,50.0,ying


In [24]:
df_instances.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31500 entries, ('liao-you', 10, 4, 50, '08', 4.0, 'ying') to ('wilson', 10, 5, 30, '01', 50.0, 'ying')
Empty DataFrame

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13834 entries, ('liao-you', 10, 2, 10, '01', 2.0, 'ying') to ('wilson', 10, 5, 50, '10', 50.0, 'ying')
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   executionId            13834 non-null  object 
 1   ub_name                13834 non-null  object 
 2   instance_name          13834 non-null  object 
 3   wct                    13831 non-null  float64
 4   permutation            13834 non-null  object 
 5   time_spent             13834 non-null  float64
 6   time_to_best_sol       13834 non-null  float64
 7   mp_total_time          13834 non-null  float64
 8   sp_total_time          13833 non-null  float64
 9   iterations             13833 non-null  float64
 10  num_visited_solutions  13833 non-null  float64
 11  num_improvements       13834 non-null  float64
 12  is_optimal             13834 non-null  bool   
 13  validated              13834 

Now, lets join the instances dataframe with the results one (left join).

In [26]:
df_joined = df_instances.join(df, how='left')
df_joined

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,executionId,ub_name,instance_name,wct,permutation,time_spent,time_to_best_sol,mp_total_time,sp_total_time,iterations,num_visited_solutions,num_improvements,is_optimal,validated,gap,lb,cost,wct_validation,Gamma%
model,n,m,alpha,seq,budget_Gamma,instance_type,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
liao-you,10,2,10,01,2.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,64014.2,7 9 1 6 10 4 3 8 5 2,59.33,59.33,11.75,0.14,2.0,2.0,1.0,True,True,3.409849e-16,64014.2,64014.2,64014.2,10.0
liao-you,10,2,10,01,4.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,65550.0,7 9 1 6 10 4 3 8 5 2,8.59,8.59,11.56,0.18,2.0,2.0,1.0,True,True,0.000000e+00,65550.0,65550.0,65550.0,20.0
liao-you,10,2,10,01,6.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,66175.1,7 9 1 6 10 4 3 8 5 2,8.71,8.71,11.69,0.19,2.0,2.0,1.0,True,True,2.199002e-16,66175.1,66175.1,66175.1,30.0
liao-you,10,2,10,01,8.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,66617.5,7 9 1 6 10 4 3 8 5 2,8.28,8.28,11.33,0.20,2.0,2.0,1.0,True,True,0.000000e+00,66617.5,66617.5,66617.5,40.0
liao-you,10,2,10,01,10.0,ying,none,mip_separation,RB0101001_10_2_10_wct_inputs.txt,67028.2,7 9 1 6 10 4 3 8 5 2,8.68,8.68,11.68,0.19,2.0,2.0,1.0,True,True,2.171014e-16,67028.2,67028.2,67028.2,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wilson,20,5,50,10,80.0,ying,,,,,,,,,,,,,,,,,,,
wilson,20,5,50,10,90.0,tail,,,,,,,,,,,,,,,,,,,
wilson,20,5,50,10,90.0,ying,,,,,,,,,,,,,,,,,,,
wilson,20,5,50,10,100.0,tail,,,,,,,,,,,,,,,,,,,


In [27]:
df_joined.reset_index()[['model', 'n', 'm', 'alpha', 'seq', 'budget_Gamma', 'instance_type']]

Unnamed: 0,model,n,m,alpha,seq,budget_Gamma,instance_type
0,liao-you,10,2,10,01,2.0,ying
1,liao-you,10,2,10,01,4.0,ying
2,liao-you,10,2,10,01,6.0,ying
3,liao-you,10,2,10,01,8.0,ying
4,liao-you,10,2,10,01,10.0,ying
...,...,...,...,...,...,...,...
31637,wilson,20,5,50,10,80.0,ying
31638,wilson,20,5,50,10,90.0,tail
31639,wilson,20,5,50,10,90.0,ying
31640,wilson,20,5,50,10,100.0,tail


Now we will export to CSV a list with all rows with NaN values (missing experimental results).

In [28]:
missing_df = df_joined[df_joined.isnull().any(axis=1)].reset_index()[['model', 'n', 'm', 'alpha', 'seq', 'budget_Gamma', 'instance_type']]
outputfolder = os.path.join(os.getcwd(), 'results', 'consolidated')
if not os.path.exists(outputfolder):
    os.makedirs(outputfolder)
print('Saving file on folder: ' + outputfolder)
fname = os.path.join(outputfolder, 'RPFS_TWCT_missing_results.csv')
missing_df.to_csv(fname, sep=';')
print('Saved: ' + fname)

Saving file on folder: /public/doutorado_files/RPFS_Budget_TWCT/results/consolidated
Saved: /public/doutorado_files/RPFS_Budget_TWCT/results/consolidated/RPFS_TWCT_missing_results.csv


In [29]:
df_grouped = df.groupby(['alpha', 'n', 'm', 'budget_Gamma']).agg({'executionId' : ['count']}).reset_index()
df_grouped.columns = [ ' '.join(str(i) for i in col) for col in df_grouped.columns]
#df_grouped.reset_index(inplace=True)
df_grouped

Unnamed: 0,alpha,n,m,budget_Gamma,executionId count
0,10,10,2,2.0,70
1,10,10,2,4.0,70
2,10,10,2,6.0,70
3,10,10,2,8.0,70
4,10,10,2,10.0,70
...,...,...,...,...,...
255,50,20,5,60.0,1
256,50,20,5,70.0,1
257,50,20,5,80.0,1
258,50,20,5,90.0,1


In [30]:
table = pd.pivot_table(df, values='executionId', index=['alpha', 'n'], columns=['budget_Gamma'], aggfunc='count', fill_value=0)
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(table)

Unnamed: 0_level_0,budget_Gamma,2.0,3.0,4.0,5.0,6.0,7.5,8.0,9.0,10.0,12.0,14.0,15.0,16.0,18.0,20.0,21.0,22.5,24.0,25.0,27.0,28.0,30.0,32.0,35.0,36.0,37.5,40.0,45.0,50.0,52.5,60.0,67.5,70.0,75.0,80.0,90.0,100.0
alpha,n,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
10,10,70,70,153,71,140,0,145,71,141,213,70,142,142,140,212,70,0,143,70,70,73,140,72,70,72,0,142,70,70,0,0,0,0,0,0,0,0
10,15,0,0,0,0,0,8,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,4,0,0,0,3,0,3,0,3,3,3,0,3,0,0,0
10,20,0,0,0,0,0,0,0,0,8,0,0,0,0,0,8,0,0,0,0,0,0,7,0,0,0,0,7,0,5,0,5,0,5,0,4,4,3
20,10,70,71,141,71,140,0,142,70,140,212,70,140,141,140,211,70,0,141,70,70,71,141,71,70,71,0,142,70,70,0,0,0,0,0,0,0,0
20,20,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,1,1
30,10,70,72,141,69,140,0,141,70,139,211,70,139,141,140,210,70,0,141,69,70,71,139,71,68,71,0,139,68,68,0,0,0,0,0,0,0,0
30,20,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,1,1
40,10,70,70,141,51,140,0,141,70,121,211,70,121,141,143,193,70,0,141,51,70,71,121,72,50,71,0,121,51,51,0,0,0,0,0,0,0,0
40,20,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,1,1
50,10,70,70,141,52,140,0,141,70,121,211,70,121,141,141,191,70,0,141,50,70,71,120,71,51,71,0,122,51,51,0,0,0,0,0,0,0,0


### Export the dataset to CSV file 

In [31]:
%%time

outputfolder = os.path.join(os.getcwd(), 'results', 'consolidated')
print('Saving file on folder: ' + outputfolder)
fname = os.path.join(outputfolder, 'RPFS_TWCT_all_results.csv')
df.to_csv(fname, sep=';')
print('Saved: ' + fname)
fname = os.path.join(outputfolder, 'RPFS_TWCT_all_results.pkl.gz')
df.to_pickle(fname)
print('Saved: ' + fname)

Saving file on folder: /public/doutorado_files/RPFS_Budget_TWCT/results/consolidated
Saved: /public/doutorado_files/RPFS_Budget_TWCT/results/consolidated/RPFS_TWCT_all_results.csv
Saved: /public/doutorado_files/RPFS_Budget_TWCT/results/consolidated/RPFS_TWCT_all_results.pkl.gz
CPU times: user 1.05 s, sys: 72.3 ms, total: 1.12 s
Wall time: 1.33 s
