## RPFS GRASP (Cmax objective) - Data treatment of result files 

In [None]:
import pandas as pd
import numpy as np
import os, fnmatch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
import glob
from pathlib import Path
import os

%matplotlib inline

In [None]:
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

### List files in the output folder 

In [None]:
# Collect every CSV result file found (recursively) under ./output2,
# stored as POSIX-style path strings.
rootfolder = os.getcwd()
file_list = [
    csv_path.as_posix()
    for csv_path in Path(os.path.join(rootfolder, 'output2')).rglob('*.csv')
]
file_list

### Process all CSV files and append all data to a single dataframe

In [None]:
%%time

df_all = pd.DataFrame()
for filename in file_list:
    print('Processing file ', filename)
    df_ = pd.read_csv(filename, delimiter=',')
    df_all = df_all.append(df_.copy())

In [None]:
# Summary of the concatenated raw data: row count, columns, dtypes and memory usage.
df_all.info()

### Remove trailing spaces on column names 

In [None]:
# Column headers may carry surrounding whitespace; normalise them
# so that later cells can address columns by their clean names.
stripped_names = {name: name.strip() for name in df_all.columns}
df_all = df_all.rename(columns=stripped_names)
df_all.dtypes

### Trim existing string columns 

In [None]:
def trim_all_columns(df):
    """
    Trim whitespace from both ends of every string value in the dataframe.

    Non-string values are returned unchanged. The input dataframe is not
    modified; a new dataframe is returned.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap was deprecated in pandas 2.1 in favour of
    # DataFrame.map; prefer the new name and fall back on older versions.
    elementwise = getattr(df, 'map', None)
    if elementwise is None:
        elementwise = df.applymap
    return elementwise(_strip_if_str)

In [None]:
# Strip surrounding whitespace from every string cell of the raw data.
df_all = trim_all_columns(df_all)

### Remove duplicated headers 

In [None]:
# A repeated header line shows up as a data row whose 'n' cell holds the
# literal text 'n'; keep only the rows where that is not the case.
is_header_row = df_all['n'] == 'n'
df_all = df_all[~is_header_row]
display(df_all['n'].unique())

### Convert column types from object 

In [None]:
def _parse_bool(value):
    """
    Interpret a cell as a boolean.

    Strings are parsed by content ('true'/'false', 'yes'/'no', numeric text),
    ignoring case and surrounding whitespace. Anything else falls back to
    Python truthiness.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ('true', 'yes'):
            return True
        if token in ('false', 'no', ''):
            return False
        try:
            return float(token) != 0
        except ValueError:
            pass
    return bool(value)


def convert_column_types(df):
    """
    Convert the known result columns from object dtype to proper types.

    Numeric columns are converted with pd.to_numeric (unparseable values
    become NaN). Flag columns are converted to bool by parsing their text:
    a plain astype('bool') would turn every non-empty string, including
    'False' and '0', into True.

    The dataframe is modified in place and also returned.
    """
    numeric_cols = ['alpha', 'n', 'm', 'time_spent', 'exit_code', 'solution_value',
                    'time_spent.1', 'time_to_best_sol', 'iterations',
                    'num_visited_solutions', 'num_improvements', 'vnd_size']
    bool_cols = ['first_improvement', 'random_vnd', 'adaptive']
    for col in df.columns:
        if col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif col in bool_cols:
            df[col] = df[col].map(_parse_bool).astype('bool')
    return df

In [None]:
%%time
# Apply the numeric/boolean conversions and show the resulting dtypes.
df_all = convert_column_types(df_all)
df_all.dtypes

### Include a new column with the instance set name 

In [None]:
# The instance set name is taken from characters 7..10 of the instance
# path, so this must run while 'instance_name' still holds the full path.
df_all['instance_set'] = df_all['instance_name'].str.slice(7, 11)
df_all['instance_set']

### Modify the `instance_name` column to remove the file path

In [None]:
# Keep only the text after the last "/" (the whole string when there is none).
df_all['instance_name'] = df_all['instance_name'].apply(
    lambda full_path: full_path.rsplit("/", 1)[-1]
)
df_all['instance_name']

### Split the column `budget_T` into Gamma1 and Gamma2 

In [None]:
# Split the space-separated 'budget_T' text into its parts (one column per part).
new = df_all["budget_T"].str.split(" ", n=2, expand=True)
# The first part becomes Gamma1 and the second Gamma2, both as numbers;
# values that cannot be parsed become NaN.
for part_position, gamma_col in enumerate(["Gamma1", "Gamma2"]):
    df_all[gamma_col] = new[part_position]
    df_all[gamma_col] = pd.to_numeric(df_all[gamma_col], errors='coerce')

In [None]:
# Quick look at the first rows after the Gamma1/Gamma2 columns were added.
df_all.head(4)

### Round columns containing time (in seconds) 

In [None]:
# Round every time column (seconds) to two decimal places.
for time_col in ('time_spent', 'time_spent.1', 'time_to_best_sol'):
    df_all[time_col] = df_all[time_col].round(2)

### Check for execution errors 

Exit code != 0

In [None]:
# Work on a copy so the cleaned raw data in df_all stays untouched,
# then show every run whose exit code is not 0.
df = df_all.copy()
failed_runs = df['exit_code'].ne(0)
display(df[failed_runs])

### Remove rows with execution errors (exit_code != 0) 

In [None]:
# Show which non-zero exit codes occur, keep only the successful runs,
# and confirm that 0 is the only exit code left.
display(df.loc[df['exit_code'] != 0, 'exit_code'].unique())
df = df.loc[df['exit_code'] == 0]
display(df['exit_code'].unique())

### Sort data according to instance_set, n, m, alpha, instance_name, Gamma1 and Gamma2 and set index

In [None]:
print('Sorting dataset...')
# The same key columns are used for the sort order and for the index.
key_columns = ['instance_set', 'n', 'm', 'alpha', 'instance_name', 'Gamma1', 'Gamma2']
df = df.sort_values(key_columns)
display(df.dtypes)
df = df.set_index(key_columns)

In [None]:
# Preview the sorted, indexed results.
display(df.head(6))

### Find missing results, for a given value of alpha, n and m

For a given group of alpha, n, m and budget (`budget_T`, i.e. the Gamma1/Gamma2 pair), there should be 10 results.

First we will build a dataframe with the instances list and all required budget values.

### TODO IMPLEMENT A SIMILAR MISSING RESULTS STRATEGY FOR THE TAIL INSTANCES

In [None]:
def _dir_label(path, marker):
    """Return the last component of `path`, cut at the last `marker`, stripped.

    Only the final path component is inspected, so text such as 'data',
    'jobs' or '%' appearing earlier in the absolute path cannot shift the
    extracted value. When `marker` is absent the whole component is returned.
    """
    name = Path(path).name
    cut = name.rfind(marker)
    return (name if cut < 0 else name[:cut]).strip()


# Build one row per (instance, Gamma1, Gamma2) combination that is expected
# to have a result, by walking instances/robust/ying/data/<n> jobs/<alpha>%/.
data = []
rootfolder = os.getcwd()
jobs_folders = glob.glob(os.path.join(rootfolder, 'instances', 'robust', 'ying', 'data', '*/'), recursive=False)
# glob returns entries in arbitrary order; sort for a reproducible row order.
for job_path in sorted(jobs_folders):
    alpha_folders = glob.glob(os.path.join(job_path, '*/'), recursive=False)
    # Folder name is '<n> jobs'.
    n = _dir_label(job_path, ' jobs')
    for alpha_path in sorted(alpha_folders):
        # Folder name is '<alpha>%'.
        alpha = _dir_label(alpha_path, '%')
        instance_paths = glob.glob(os.path.join(alpha_path, '*'), recursive=False)
        for instance_path in sorted(instance_paths):
            instance_name = Path(instance_path).name
            for gamma1 in [20, 40, 60, 80, 100]:
                for gamma2 in [20, 40, 60, 80, 100]:
                    for instance_set in ['ying']:
                        # m is fixed to 2 for every row of this instance set.
                        data.append([instance_set, instance_name.strip(), alpha, n, 2, gamma1, gamma2])
df_instances = pd.DataFrame(data, columns=['instance_set', 'instance_name', 'alpha', 'n', 'm', 'Gamma1', 'Gamma2'])
# n and alpha were parsed from folder names as text; make every key numeric
# so that the index dtypes agree with those of the results dataframe.
for col in ['alpha', 'n', 'm', 'Gamma1', 'Gamma2']:
    df_instances[col] = pd.to_numeric(df_instances[col], errors='coerce')
display(df_instances.dtypes)
df_instances = df_instances.set_index(['instance_set', 'n', 'm', 'alpha', 'instance_name', 'Gamma1', 'Gamma2'])
display(df_instances)

Now, let's join the instances dataframe with the results one (left join).

In [None]:
# Restrict the results to the 'ying' instance set and restore the same
# multi-level index used by the expected-instances dataframe.
df_ying = (
    df.reset_index()
      .loc[lambda results: results['instance_set'] == 'ying']
      .set_index(['instance_set', 'n', 'm', 'alpha', 'instance_name', 'Gamma1', 'Gamma2'])
)
df_ying

In [None]:
# NOTE: THE MISSING-RESULTS CALCULATION IS WORKING, BUT THE PARAMETERIZATION TEST RAN ONLY 20% OF THE INSTANCES
# FOR EACH GROUP OF n AND alpha, AND ONLY FOR EQUAL BUDGET VALUES (e.g. [20 20]).
# FOR THAT REASON, MANY OF THE RUN COMBINATIONS BELOW ARE MISSING.
# Left join: every expected (instance, Gamma1, Gamma2) row is kept; rows with no matching result get NaN columns.
df_joined_ying = df_instances.join(df_ying, how='left')
df_joined_ying

Now we will export to CSV a list with all rows with NaN values (missing experimental results).

In [None]:
# df_instances has no columns of its own, so after the left join a NaN in
# 'batch_id' means no result row matched that expected combination.
# Select exactly those rows (the previous filter was negated and exported
# the rows that DO have results instead).
missing_df = df_joined_ying[df_joined_ying['batch_id'].isnull()]
print('Number of missing results: ', len(missing_df.index))
print('Saving file on folder: ' + rootfolder)
fname = os.path.join(rootfolder, 'GRASP_Cmax_missing_results.csv')
missing_df.to_csv(fname, sep=';')
print('Saved: ' + fname)

In [None]:
# Count the executions available for each (alpha, n, m, budget_T) group.
runs_per_group = df.groupby(['alpha', 'n', 'm', 'budget_T']).agg({'executionId': ['count']})
df_grouped = runs_per_group.reset_index()
# Flatten the two-level column labels into single space-joined strings.
df_grouped.columns = [' '.join(map(str, label_parts)) for label_parts in df_grouped.columns]
df_grouped

In [None]:
# Number of executions per (alpha, n) row and (Gamma1, Gamma2) column;
# combinations without any execution are shown as 0.
table = df.pivot_table(
    values='executionId',
    index=['alpha', 'n'],
    columns=['Gamma1', 'Gamma2'],
    aggfunc='count',
    fill_value=0,
)
# Temporarily lift the display limits so the whole table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(table)

### Export the dataset to CSV file 

In [None]:
%%time
print('Saving file on folder: ' + rootfolder)
fname = os.path.join(rootfolder, 'GRASP_Cmax_Ying_all_results.csv')
df_ying.to_csv(fname, sep=';')
print('Saved: ' + fname)