In [1]:
# %load /home/jonathan/.ipython/profile_default/startup/01-setup.py
# start up settings for jupyter notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys

# use plt.style.available() to check out available styles
plt.style.use('seaborn-whitegrid')
plt.rcParams['font.size'] = 15.0
plt.rcParams['axes.labelsize'] = 15.0
plt.rcParams['xtick.labelsize'] = 15.0
plt.rcParams['ytick.labelsize'] = 15.0
plt.rcParams['legend.fontsize'] = 15.0

%matplotlib inline

# set the max column width
pd.options.display.max_colwidth = 1000

# to avoid have warnings from chained assignments
pd.options.mode.chained_assignment = None


In [2]:
# shorthand for building MultiIndex slicers used inside .loc[...]
idx = pd.IndexSlice

import matplotlib.patches as mpatches
import matplotlib.lines as mlines

### Load the data

In [4]:
data_dir = '../prediction-data/'

# pandas labels blank second-level header cells 'Unnamed: N_level_1';
# map those placeholders back to an empty string.
unnamed_to_blank = {'Unnamed: {}_level_1'.format(i): '' for i in range(4)}

df_list = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame (and anything order-dependent downstream, such as
# which duplicate row is kept) reproducible across runs and machines.
for f in sorted(os.listdir(data_dir)):
    # skip the inc0 results and the files this notebook exports itself
    if 'inc0' in f or 'final-data' in f:
        continue

    fp = os.path.join(data_dir, f)
    # sub-directories are not result files
    if not os.path.isfile(fp):
        continue

    # two header rows -> two-level column index
    df = pd.read_csv(fp, header=[0, 1])
    df = df.rename(columns=unnamed_to_blank, level=1)
    df_list.append(df)
    


  interactivity=interactivity, compiler=compiler, result=result)


Base df shape: (245478, 258)
Uniq base df shape: (185848, 258)
K-2 df shape: (86469, 258)
K-2 unique df shape: (74582, 258)


In [None]:
def _invalid_mask(frame, algorithm):
    """Return a boolean Series that is True where ``is_valid`` for
    ``algorithm`` is either False or missing."""
    validity = frame.loc[:, ('is_valid', algorithm)]
    return validity.isnull() | (validity == False)


# Stack the per-file frames and index every trace by its identifying columns.
base_df = pd.concat(df_list, axis=0, sort=False).set_index(
    ['model', 'log', 'decomposition', 'SP label']
)

# --- step 1: drop traces for which all four algorithms are invalid ---
astar_invalid = _invalid_mask(base_df, 'astar')
inc3_invalid = _invalid_mask(base_df, 'inc3')
recomp_astar_invalid = _invalid_mask(base_df, 'recomp-astar')
recomp_inc3_invalid = _invalid_mask(base_df, 'recomp-inc3')
all_invalid = astar_invalid & inc3_invalid & recomp_astar_invalid & recomp_inc3_invalid

before_n_rows = len(base_df)
print('Before filtering all invalid no. of rows: {}'.format(before_n_rows))

base_df = base_df.loc[~all_invalid, :]
after_n_rows = len(base_df)

print('After filtering all invalid no. of rows: {}'.format(after_n_rows))
print('{} traces invalid for all algorithms'.format(before_n_rows - after_n_rows))

# --- step 2: drop traces where either recomposing algorithm reports cost -1 ---
cost_is_neg_recomp_astar = base_df.loc[:, ('Cost of the alignment', 'recomp-astar')] == -1
cost_is_neg_recomp_inc3 = base_df.loc[:, ('Cost of the alignment', 'recomp-inc3')] == -1

before_n_rows = len(base_df)
print('Before filtering negative cost rows: {}'.format(before_n_rows))

# keep a row only when neither cost column is -1
base_df = base_df.loc[~cost_is_neg_recomp_astar & ~cost_is_neg_recomp_inc3, :]
after_n_rows = len(base_df)

print('After filtering negative cost rows: {}'.format(after_n_rows))
print('{} traces invalid'.format(before_n_rows - after_n_rows))

### Load feature data

In [None]:
feature_dir = '../prediction-data/features/'

# Collect one frame per feature file. This has to be a list that is appended
# to: pd.concat needs a sequence of frames and raises TypeError when handed a
# single DataFrame.
feature_df_list = []

# sorted() keeps the row order of the combined frame reproducible, since
# os.listdir returns entries in arbitrary order
for f in sorted(os.listdir(feature_dir)):
    fp = os.path.join(feature_dir, f)

    if not os.path.isfile(fp):
        continue

    # the model name is the file name without its '-feature.csv' suffix
    model = f.replace('-feature.csv', '')
    df = pd.read_csv(fp)
    df['model'] = model
    feature_df_list.append(df)

feature_df = pd.concat(feature_df_list, axis=0, sort=False)

### Merge feature data and performance data

In [None]:
# Inner join: keep only traces present in both the performance data and the
# feature data. base_df carries the four keys as index levels; feature_df is
# assumed to provide 'log', 'decomposition' and 'SP label' from its CSVs
# ('model' is added at load time) -- TODO confirm against the feature files.
full_df = base_df.merge(feature_df, on=('model', 'log', 'decomposition', 'SP label'), how='inner')

### Getting the no-duplicates dataframe
Two rows are considered duplicates if they have the same values in:
- all of the model_trace_features columns
- same cost of the alignment

Note that they do not need the same alignment, i.e., same number of legal move types

In [None]:
# Columns that define a duplicate: every second-level column under these two
# top-level column groups.
dedup_columns = idx[('model_trace_features', 'Cost of the alignment'), slice(None)]

n_rows_all = full_df.shape[0]
print('Number of rows before filtering duplicates: {}'.format(n_rows_all))

# duplicated() flags every repeat after the first occurrence, so the first
# row of each group of identical rows is the one that survives
duplicate_rows = full_df.loc[:, dedup_columns].duplicated()
uniq_full_df = full_df[~duplicate_rows]

n_rows_uniq = uniq_full_df.shape[0]
print('Number of rows after filtering duplicates: {}'.format(n_rows_uniq))
print('Number of rows removed: {}'.format(n_rows_all - n_rows_uniq))

### Export dataframes 

In [None]:
# Both exports are written next to the input files. The loading cell skips
# every file whose name contains 'final-data', so re-running the notebook does
# not read these outputs back in.
out_dir = '../prediction-data/'

# full merged data, duplicates included
out_fp = os.path.join(out_dir, 'final-data.csv')
full_df.reset_index(drop=False).to_csv(out_fp, index=False)

# de-duplicated data; written to the same directory as the full export
# (the earlier '../../prediction-data/' path pointed one level too high)
out_fp = os.path.join(out_dir, 'uniq-final-data.csv')
uniq_full_df.reset_index(drop=False).to_csv(out_fp, index=False)