## RPFS GRASP (Cmax objective) - Data treatment of result files 

In [None]:
import pandas as pd
import numpy as np
import os, fnmatch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
import glob
from pathlib import Path
import os

%matplotlib inline

In [None]:
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

### List files in the output folder 

In [None]:
# Gather every result CSV of the experiment run, leaving out any file whose
# path mentions 'parametrization'.
rootfolder = os.getcwd()
_csv_pattern = os.path.join(rootfolder, '..', 'pfsp_experiments', 'run_grasp_rpfs_cmax_global', '*.csv')
file_list = [csv_path for csv_path in glob.glob(_csv_pattern) if 'parametrization' not in csv_path]
file_list

### Process all CSV files and append all data to a single dataframe

In [None]:
# Alternative script to treat files with incorrect number of columns or faulty lines
def alternative_csv_reader(filename, delimiter, header, names):
    """Read a damaged CSV file line by line and keep only the usable rows.

    Header lines (lines containing ``'execution_id' + delimiter``) are dropped,
    lines with exactly ``len(names)`` fields are kept, and lines with too many
    fields are cut at the last occurrence of ``'2020_'`` and kept only if the
    remainder has the expected number of fields. Every rejected line is
    reported on stdout.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    delimiter : str
        Field separator, used both for counting fields and for parsing.
    header : int or None
        Kept for interface compatibility. Header lines are removed here, so
        the filtered text is always parsed as header-less data; forwarding
        ``header=0`` to pandas would discard the first valid data row.
    names : list of str
        Column names; their count is the expected number of fields per line.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line, with ``names`` as columns. Empty when no
        line was accepted.
    """
    num_columns = len(names)
    header_marker = 'execution_id' + delimiter
    line_list = []
    with open(filename, 'r') as file:
        for count, line in enumerate(file, start=1):
            nc = len(line.split(delimiter))
            if header_marker in line:
                print('Detected {0} columns in CSV file.'.format(nc))
            elif names[0] not in line:
                if nc == num_columns:
                    line_list.append(line)
                elif nc > num_columns:  # treat strange truncated lines
                    # Keep only the text starting at the last '2020_' marker.
                    line = line[line.rfind('2020_'):]
                    nc = len(line.split(delimiter))
                    if nc == num_columns:
                        print('WARN: truncating line {0}, for having more columns than expected.'.format(count))
                        line_list.append(line)
                    else:
                        print('WARN: Ignoring line {0}, since it has {1} columns, instead of {2}: '.format(count, nc, num_columns), line)
                else:  # Ignore line
                    print('WARN: Ignoring line {0}: '.format(count), line)
            elif line_list and len(line_list[-1].split(delimiter)) < num_columns:
                # Current line is a continuation of the previous one.
                # NOTE(review): only lines with exactly num_columns fields are
                # ever appended above, so this branch looks unreachable.
                line_list[-1] = line_list[-1].replace('\n', '') + line
                print('*** Treated line {0}: '.format(count), line_list[-1])
            else:  # Ignore line
                print('WARN: Ignoring line {0}: '.format(count), line)

    # Report (without failing) any accepted line with an unexpected field count.
    for position, line in enumerate(line_list, start=1):
        nc = len(line.split(delimiter))
        if nc != num_columns:
            print('ERROR: Line {0} has {1} columns, instead of {2}: '.format(position, nc, num_columns), line)

    if not line_list:
        # Nothing usable: return an empty frame instead of letting pandas
        # raise on empty input.
        return pd.DataFrame(columns=names)

    text_data = StringIO(''.join(line_list))
    # header=None: the header lines were already filtered out above.
    return pd.read_csv(text_data, delimiter=delimiter, header=None, names=names)

In [None]:
%%time
# Column names of the result files, in file order (they replace the header row).
column_names = ['batch_id', 'run_id', 'execution_id', 'seed', 'ub_name', 'instance_name', 'alpha', 'n', 'm', 'budget_T', 'time_spent', 'exit_code', 'solution_value', 'permutation', 'time_spent_2', 'time_to_best_sol', 'iterations', 'num_visited_solutions', 'num_improvements', 'first_improvement', 'vnd_size', 'vnd_permutation', 'random_vnd', 'adaptive', 'const_beta1', 'const_beta2', 'time_factor']

# Collect one frame per file and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied all data on each call.
frames = []
for filename in file_list:
    print('Processing file ', filename)
    try:
        df_ = pd.read_csv(filename, delimiter=',', header=0, names=column_names)
    except Exception as exc:  # best-effort fallback; Ctrl-C is no longer swallowed
        print('WARN: pandas could not parse the file ({0}); using the line-by-line reader.'.format(exc))
        df_ = alternative_csv_reader(filename, delimiter=',', header=0, names=column_names)
    # Normalise the budget text and flag rows holding more than one
    # space-separated budget value.
    df_['budget_T'] = df_['budget_T'].astype(str).str.strip()
    df_['multibudget'] = df_['budget_T'].str.contains(' ', regex=False).astype(int)
    df_t = df_[df_['multibudget'] == 1]
    if len(df_t.index) > 0:
        print('WARN: invalid budget values detected')
        print(df_t)
    frames.append(df_)

# pd.concat raises on an empty list, so keep the empty-frame result in that case.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Continue under the name df_all; this binds the same object as df, it is not a copy.
df_all = df
# Print the column list, non-null counts and dtypes of the loaded data.
df_all.info()

### Remove trailing spaces on column names 

In [None]:
# Strip leading/trailing whitespace from every column label.
df_all = df_all.rename(columns=lambda x: x.strip())
# Show the dtype of each column after the rename.
df_all.dtypes

### Trim existing string columns 

In [None]:
def trim_all_columns(df):
    """
    Trim whitespace from both ends of every string value in the dataframe.

    Non-string values are returned unchanged. A new dataframe is returned;
    the input is not modified.
    """
    def trim_strings(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name is deprecated; use the new name when this pandas provides it.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(trim_strings)
    return df.applymap(trim_strings)

In [None]:
# Strip surrounding whitespace from every string cell of the dataset.
df_all = trim_all_columns(df_all)

### Remove duplicated headers 

In [None]:
# Drop rows whose 'n' cell is the literal string 'n', i.e. header lines repeated inside the data.
df_all = df_all[(df_all['n'] != 'n')]
# Show the remaining distinct values of 'n' as a sanity check.
display(df_all['n'].unique())

### Convert column types from object 

In [None]:
# Columns parsed as numbers; unparseable cells become NaN.
_NUMERIC_COLUMNS = frozenset([
    'seed', 'n', 'm', 'time_spent', 'exit_code', 'solution_value',
    'time_spent_2', 'time_to_best_sol', 'iterations', 'num_visited_solutions',
    'num_improvements', 'vnd_size', 'const_beta1', 'const_beta2', 'time_factor',
])
# Columns holding boolean flags.
_BOOLEAN_COLUMNS = frozenset(['first_improvement', 'random_vnd', 'adaptive'])
# Textual spellings recognised as booleans (compared case-insensitively).
_TRUE_TOKENS = frozenset(['true', '1', 'yes'])
_FALSE_TOKENS = frozenset(['false', '0', 'no'])


def _to_bool(value):
    """Convert one cell to bool, understanding textual flags such as 'False' or '0'."""
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _FALSE_TOKENS:
            return False
        if token in _TRUE_TOKENS:
            return True
    # Anything else keeps plain truthiness, as astype('bool') would give.
    return bool(value)


def convert_column_types(df):
    """Convert the known numeric and boolean columns of ``df`` in place.

    Numeric columns go through ``pd.to_numeric(errors='coerce')``. Boolean
    columns are parsed cell by cell, because ``astype('bool')`` alone turns
    every non-empty string (including ``'False'`` and ``'0'``) into ``True``.
    Columns not listed in either group are left untouched.

    Returns the same dataframe object that was passed in.
    """
    for col in df.columns:
        if col in _NUMERIC_COLUMNS:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif col in _BOOLEAN_COLUMNS:
            df[col] = df[col].map(_to_bool).astype('bool')
    return df

In [None]:
%%time
# Convert the numeric and boolean columns (convert_column_types is defined in an earlier cell).
df_all = convert_column_types(df_all)
# Show the resulting dtypes.
df_all.dtypes

In [None]:
# Preview the last rows of `df`.
# NOTE(review): the surrounding cells work on `df_all`; confirm whether `df_all.tail()` was intended here.
df.tail()

### Include a new column with the instance set name 

In [None]:
# Label each row with its instance set: 'tail' when the instance name
# contains that word, 'ying' otherwise.
df_all['instance_type'] = [
    'tail' if 'tail' in name else 'ying'
    for name in df_all['instance_name']
]
df_all['instance_type'].unique()

### Modify the `instance_name` column to remove the file path

In [None]:
# Keep only the part of each instance name that follows the last '/'.
df_all['instance_name'] = df_all['instance_name'].map(lambda full_path: full_path.rsplit('/', 1)[-1])
df_all['instance_name'].unique()

### Extract the Gamma values from column budget_T

In [None]:
# List the distinct raw budget strings before they are parsed.
df_all["budget_T"].unique()

In [None]:
# Split budget_T on spaces (at most two splits) and take the first piece as
# Gamma, converted to a number; pieces that are not numeric become NaN.
budget_parts = df_all["budget_T"].str.split(" ", n=2, expand=True)
df_all["Gamma"] = pd.to_numeric(budget_parts[0], errors='coerce')

In [None]:
# Distinct Gamma values (not the last expression of the cell, so the notebook does not display it).
df_all["Gamma"].unique()
# 'Gamma%' is a plain copy of 'Gamma'.
df_all['Gamma%'] = df_all['Gamma']

In [None]:
# Preview the first four rows.
df_all.head(4)

### Adjust n_str, alpha and seq values

In [None]:
# n as text, left-padded with zeros to at least 3 characters.
df_all['n_str'] = df_all['n'].astype(str).str.zfill(3)
# Short name: the text before '_cmax_inputs' when present, otherwise the text before '.txt'.
# NOTE(review): when neither marker is present, find() returns -1 and the last character is dropped.
df_all['instance_name_short'] = df_all['instance_name'].apply(lambda x: x[:x.find('_cmax_inputs')] if '_cmax_inputs' in x else x[:x.find('.txt')])
# 'tail' rows: seq is the text between 'tail' and the first '_' (or the first '.' when the name has no '_').
df_all.loc[(df_all['instance_type'] == 'tail'), 'seq'] = df_all.loc[(df_all['instance_type'] == 'tail'), 'instance_name'].apply(lambda x: x[x.find('tail')+len('tail'):x.find('_')] if '_' in x else x[x.find('tail')+len('tail'):x.find('.')])
# 'ying' rows: seq is the two characters right before the first '_' (names containing '_cmax_inputs') or before the first '.'.
df_all.loc[(df_all['instance_type'] == 'ying'), 'seq'] = df_all.loc[(df_all['instance_type'] == 'ying'), 'instance_name'].apply(lambda x: x[x.find('_')-2:x.find('_')] if '_cmax_inputs' in x else x[x.find('.')-2:x.find('.')])

# alpha_str: the text after the last '_' of the short name, or the placeholder 'na' when there is no '_'.
df_all['alpha_str'] = df_all['instance_name_short'].apply(lambda x: x[x.rfind('_')+1:] if '_' in x else 'na')
# Rows left with the 'na' placeholder take the value of the 'alpha' column, as text.
df_all.loc[(df_all['alpha_str'] == 'na'), 'alpha_str'] = df_all.loc[(df_all['alpha_str'] == 'na'), 'alpha'].astype(str)
# Remove any '%' sign from alpha_str.
df_all['alpha_str'] = df_all['alpha_str'].apply(lambda x: x.replace('%', ''))


In [None]:
# Distinct alpha_str values, to check the parsing above.
df_all['alpha_str'].unique()

In [None]:
# Distinct seq values, to check the parsing above.
df_all['seq'].unique()

In [None]:
# Distinct short instance names, to check the parsing above.
df_all['instance_name_short'].unique()

### Round columns containing time (in seconds) 

In [None]:
# Round every time measurement (seconds) to two decimal places.
for time_column in ('time_spent', 'time_spent_2', 'time_to_best_sol'):
    df_all[time_column] = df_all[time_column].round(2)

### Check for execution errors 

Exit code != 0

In [None]:
# From here on work on an independent copy; `df` is rebound and no longer refers to the raw loaded data.
df = df_all.copy()
# Solution values of the rows whose exit code is not 0.
display(df[(df['exit_code'] != 0)]['solution_value'])

### Remove rows with execution errors (exit_code != 0) 

In [None]:
# Non-zero exit codes present before filtering.
display('Exit codes: ', df[(df['exit_code'] != 0)]['exit_code'].unique())
# Keep only the rows with exit code 0 (rows where exit_code is NaN are dropped too, since NaN == 0 is False).
df = df[(df['exit_code'] == 0)]
# Exit codes left after filtering.
display('Exit codes: ', df['exit_code'].unique())

### Sort data according to instance_type, n, m, alpha_str, instance_name and Gamma, and set these columns as the index

In [None]:
# The same six columns define both the sort order and the row index.
_key_columns = ['instance_type', 'n', 'm', 'alpha_str', 'instance_name', 'Gamma']
print('Sorting dataset...')
df = df.sort_values(_key_columns)
display(df.dtypes)
df = df.set_index(_key_columns)

In [None]:
# Preview the first six rows of the sorted, indexed dataset.
display(df.head(6))

### Find missing results, for a given instance and a given value of Gamma

For a given instance_name and budget_Gamma, there should be 100 results.

First we will build a dataframe with the instances list and all required budget values.

In [None]:
# Required budget values: 0, 5, 10, ..., 100.
gamma_range = np.arange(0, 101, 5)
gamma_range

In [None]:
# Build the reference grid: one row per (instance, required Gamma value).
# The short name and alpha that were previously derived here were never used,
# so only the two columns that end up in the frame are produced.
filename_list = df.reset_index()['instance_name'].unique()
data = [
    [instance_name.strip(), gamma]
    for instance_name in filename_list
    for gamma in gamma_range
]
df_instances = pd.DataFrame(data, columns=['instance_name', 'Gamma'])
display(df_instances)

Now, let's join the instances dataframe with the results dataframe (left join).

In [None]:
# Left join: every (instance, Gamma) pair of the reference grid is kept,
# with NaN in the result columns where no result row matches.
_results_flat = df.reset_index()
df_joined = df_instances.merge(_results_flat, how='left', on=['instance_name', 'Gamma'])
df_joined

In [None]:
# Count the result rows recorded for every (instance_type, instance_name, alpha_str, n, m, Gamma) group.
df_grouped = df.groupby(['instance_type', 'instance_name', 'alpha_str', 'n', 'm', 'Gamma']).agg({'execution_id' : ['count']}).reset_index()
# Flatten the two-level column labels by joining both levels with '_'
# (this yields names such as 'instance_name_' and 'execution_id_count', used in later cells).
df_grouped.columns = [ '_'.join(str(i) for i in col) for col in df_grouped.columns]
#df_grouped.reset_index(inplace=True)
df_grouped

**For a given instance_name and budget_Gamma, there should be 100 results.**

In [None]:
# Gamma values to report (gamma_range is defined in an earlier cell).
perc_list = gamma_range
df_ = df.reset_index()
# Rows: (instance_type, alpha_str, n, m, seq); columns: Gamma; cells: number of result rows (0 when none).
table = pd.pivot_table(df_[(df_['Gamma'].isin(perc_list))], values='execution_id', index=['instance_type', 'alpha_str', 'n', 'm', 'seq'], columns=['Gamma'], aggfunc='count', fill_value=0)
# Lift the display limits so the whole table is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(table)

### Export missing results

Now we will export to CSV the list of (instance, Gamma) groups that have fewer than 100 results (missing experimental results).

In [None]:
# Groups with fewer than 100 recorded executions still have runs missing.
# .copy() makes missing_df an independent frame, so the column added below is
# not written into a filtered slice of df_grouped (chained assignment).
missing_df = df_grouped[df_grouped['execution_id_count'] < 100].copy()
# Position of the first '_' in the instance name (-1 when there is none).
missing_df['tail_prefix'] = missing_df['instance_name_'].str.find('_')
#missing_df['tail_number'] = missing_df['instance_name_'].str[4:7]
#missing_df['tail_number'] = pd.to_numeric(missing_df['tail_number'], errors='coerce')
missing_df

In [None]:
# Distinct instance names that still have missing results.
missing_df['instance_name_'].unique()

In [None]:
# Keep rows whose first '_' is at position 7 or earlier (this includes -1, i.e. no '_' at all),
# or whose instance name contains 'tail0100'.
missing_df = missing_df[(missing_df['tail_prefix'] <= 7) | (missing_df['instance_name_'].str.find('tail0100') >= 0)]
# The message shows rootfolder while the path below is built from os.getcwd();
# both are the same unless the working directory changed since rootfolder was set.
print('Saving file on folder: ' + rootfolder)
fname = os.path.join(os.getcwd(), 'GRASP_Cmax_missing_results.csv')
# Written with ';' as the field separator.
missing_df.to_csv(fname, sep=';')
print('Saved: ' + fname)

In [None]:
# Number of result rows per (instance_name, alpha, n) and Gamma, 0 where there is none.
# At this point 'instance_name', 'n' and 'Gamma' are index levels of df, while 'alpha' is a regular column.
table = pd.pivot_table(df, values='execution_id', index=['instance_name', 'alpha', 'n'], columns=['Gamma'], aggfunc='count', fill_value=0)
# Lift the display limits so the whole table is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(table)

### Export the dataset to CSV file 

In [None]:
%%time

# Write the consolidated dataset as a ';'-separated CSV and as a pickle.
outputfolder = os.path.join(os.getcwd(), 'results', 'consolidated')
# Create the folder when it is missing; otherwise the first write fails on a fresh checkout.
os.makedirs(outputfolder, exist_ok=True)
print('Saving file on folder: ' + outputfolder)
fname = os.path.join(outputfolder, 'RPFS_Cmax_GRASP_all_results.csv.gz')
df.to_csv(fname, sep=';')
print('Saved: ' + fname)
fname = os.path.join(outputfolder, 'RPFS_Cmax_GRASP_all_results.pkl.gz')
df.to_pickle(fname)
print('Saved: ' + fname)

In [None]:
# Summary (columns, non-null counts, dtypes) of the exported dataset with the index turned back into columns.
df.reset_index().info()