In [1]:
# !export PYTHONPATH=/Users/ilariasartori/syntheseus:/Users/ilariasartori/syntheseus/tutorials/search

In [2]:
# !echo $PYTHONPATH

In [3]:
import numpy as np
import pandas as pd

class SearchResult:
    """Plain container for the outputs of one search run.

    The result pickles loaded in this notebook deserialize to instances of
    this class. Every constructor argument is stored unchanged on the
    attribute of the same name.

    Args:
        name: Identifier of the run; `create_result_df` asserts it equals the
            key under which the result is stored.
        soln_time_dict: Mapping iterated with `.items()` by
            `create_result_df` to build the `'sol_time'` rows.
        num_different_routes_dict: Mapping iterated with `.items()` by
            `create_result_df` to build the `'diff_routes'` rows.
        final_num_rxn_model_calls_dict: Stored as-is; not interpreted here.
        output_graph_dict: Stored as-is; not interpreted here.
        routes_dict: Stored as-is; not interpreted here.
        final_num_value_function_calls_dict: Optional; stored as-is.
            Defaults to None so existing six-argument callers keep working.
    """

    # Class-level fallback: pickle restores instances without calling
    # __init__, so objects saved without this field still expose it (as None)
    # instead of raising AttributeError on access.
    final_num_value_function_calls_dict = None

    def __init__(self, name, soln_time_dict, num_different_routes_dict, 
                 final_num_rxn_model_calls_dict, output_graph_dict, routes_dict,
                 final_num_value_function_calls_dict=None):
        self.name = name
        self.soln_time_dict = soln_time_dict
        self.num_different_routes_dict = num_different_routes_dict
        self.final_num_rxn_model_calls_dict = final_num_rxn_model_calls_dict
        self.output_graph_dict = output_graph_dict
        self.routes_dict = routes_dict
        self.final_num_value_function_calls_dict = final_num_value_function_calls_dict

# Short display labels for the plots, keyed by algorithm name.
# The insertion order of this dict is also used as the x-axis category order.
labelalias = {
    'constant-0': 'constant-0',
    'Tanimoto-distance': 'Tanimoto',
    'Tanimoto-distance-TIMES10': 'Tanimoto * 10',
    'Tanimoto-distance-TIMES100': 'Tanimoto * 100',
    'Tanimoto-distance-EXP': 'Tanimoto exp',
    'Tanimoto-distance-SQRT': 'Tanimoto sqrt',
    "Tanimoto-distance-NUM_NEIGHBORS_TO_1": "Tanimoto neighb to 1",
    "Embedding-from-fingerprints": "Emb fnps",
    "Embedding-from-fingerprints-TIMES10": "Emb fnps * 10",
    # Was "Emb_fnps * 100"; spelled like the two sibling labels above.
    "Embedding-from-fingerprints-TIMES100": "Emb fnps * 100",
}






In [4]:
# alg_names = [x[0] for x in value_fns]
# alg_names

## Load from pickle

In [5]:
# # Load pickle
import pickle
import os

dataset_str = 'paroutes'
# dataset_str = 'guacamol'

# Event id of the run to analyse, selected by dataset.
if dataset_str == 'paroutes':
    eventid = '202306-2611-1616-0f701fe7-bbee-4d2c-83f7-bba18beb858a'
elif dataset_str == 'guacamol':
    eventid = ''  # TODO: no guacamol event id has been filled in yet
else:
    # Fail here rather than hit a NameError (or silently reuse a stale
    # `eventid` left in the kernel) when `output_folder` is built below.
    raise ValueError(f"Unknown dataset_str: {dataset_str!r}")
    
output_folder = f"CompareTanimotoLearnt/{eventid}"

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point `output_folder` at files you trust.
# Only real '*.pickle' files are loaded, in sorted order so that the content
# of `result` (and everything derived from it) is reproducible across runs.
result = {}
for file_name in sorted(file for file in os.listdir(output_folder) if file.endswith('.pickle')):
    # 'result_<algorithm>.pickle' -> '<algorithm>'
    name = file_name.replace('.pickle','').replace('result_','')
    with open(f'{output_folder}/{file_name}', 'rb') as handle:
        result[name] = pickle.load(handle)



In [6]:
import pandas as pd

def create_result_df(result, name):
    """Flatten one algorithm's search result into a long-format DataFrame.

    Args:
        result: Mapping from algorithm name to a result object exposing
            `name`, `soln_time_dict` and `num_different_routes_dict`.
        name: Key into `result`; must equal `result[name].name`.

    Returns:
        DataFrame with columns `algorithm`, `similes`, `property`, `value`:
        one row per entry of `soln_time_dict` (property `'sol_time'`),
        followed by one row per entry of `num_different_routes_dict`
        (property `'diff_routes'`). Each of the two parts keeps its own
        0-based index, so index labels repeat across the parts.

    Raises:
        AssertionError: If `name` differs from `result[name].name`.
    """
    search_result = result[name]
    assert name == search_result.name, f"name: {name} is different from result[name].name: {search_result.name}"

    # Only the two dictionaries below are needed; the other attributes of the
    # result object are deliberately not touched, so objects that lack them
    # can still be processed.
    columns = ['algorithm', 'similes', 'property', 'value']

    def _long_frame(per_molecule, property_name):
        # One record per (key, value) pair, built in a single DataFrame call
        # instead of growing a frame row by row.
        records = [
            {'algorithm': name, 'similes': smiles, 'property': property_name, 'value': value}
            for smiles, value in per_molecule.items()
        ]
        # `columns=` keeps the four columns even when there are no records.
        return pd.DataFrame(records, columns=columns)

    df_soln_time = _long_frame(search_result.soln_time_dict, 'sol_time')
    df_different_routes = _long_frame(search_result.num_different_routes_dict, 'diff_routes')

    return pd.concat([df_soln_time, df_different_routes], axis=0)



# Stack the long-format frames of every loaded algorithm into one table.
# The empty seed frame keeps the four expected columns even if `result` is empty.
_frames_per_algorithm = [pd.DataFrame({'algorithm': [], 'similes': [], 'property':[], 'value': []})]
for name in result.keys():
    df_results_alg = create_result_df(result, name)
    _frames_per_algorithm.append(df_results_alg)
# Concatenate once, instead of re-copying the growing table on every iteration.
df_results_tot = pd.concat(_frames_per_algorithm, axis=0)
    
    
    
    

In [7]:
# Persist the combined long-format table inside the run's output folder.
results_all_csv_path = f'{output_folder}/results_all.csv'
df_results_tot.to_csv(results_all_csv_path, index=False)

## Load from the saved results CSV (`results_all.csv`)

In [8]:
# # Load csv
# import pandas as pd
# import numpy as np

# # eventid= "202305-1310-3717-7e7e984c-8c3e-4a18-ad67-5c4b29743282"
# # output_folder = f"Results/{eventid}"

# df_results_tot = pd.read_csv(f'{output_folder}/results_all.csv')

## 1. SOLUTIONS

In [9]:
# Algorithms kept for the analysis below. Set this to the string 'all' to
# skip the filtering step, or to an explicit list of algorithm names.
algs_to_consider = [
    'constant-0',
    'Tanimoto-distance',
    'Tanimoto-distance-TIMES10',
    'Tanimoto-distance-TIMES100',
    'Tanimoto-distance-EXP',
    'Tanimoto-distance-SQRT',
    "Tanimoto-distance-NUM_NEIGHBORS_TO_1",
    "Embedding-from-fingerprints",
    "Embedding-from-fingerprints-TIMES10",
    "Embedding-from-fingerprints-TIMES100",
]

if algs_to_consider != 'all':
    # Drop every row whose algorithm is not in the selected list.
    df_results_tot = df_results_tot.loc[df_results_tot['algorithm'].isin(algs_to_consider)]

### 1a. Solution times

In [10]:
# Keep only the solution-time rows of the long-format results table.
is_sol_time_row = df_results_tot['property'].eq('sol_time')
results_solution_times = df_results_tot.loc[is_sol_time_row]

In [11]:
# Independent (deep) copy, so columns added to `df_result` do not touch the
# filtered selection it was taken from.
df_result = results_solution_times.copy(deep=True)

In [12]:
# 1 where the solution time is +inf, 0 otherwise; summed per algorithm in a
# later cell to count molecules that were not solved.
df_result["value_is_inf"] = df_result['value'].eq(np.inf).mul(1)


In [13]:
# Per algorithm, add up the `value_is_inf` flags into `nr_mol_not_solved`.
df_results_grouped = (
    df_result
    .groupby(["algorithm"], as_index=False)
    .agg(nr_mol_not_solved=("value_is_inf", "sum"))
)
df_results_grouped


Unnamed: 0,algorithm,nr_mol_not_solved
0,Embedding-from-fingerprints,65
1,Embedding-from-fingerprints-TIMES10,66
2,Embedding-from-fingerprints-TIMES100,72
3,Tanimoto-distance,66
4,Tanimoto-distance-EXP,65
5,Tanimoto-distance-NUM_NEIGHBORS_TO_1,65
6,Tanimoto-distance-SQRT,65
7,Tanimoto-distance-TIMES10,69
8,Tanimoto-distance-TIMES100,64
9,constant-0,63


In [14]:
# Save the per-algorithm unsolved counts inside the run's output folder.
num_mol_not_solved_csv_path = f'{output_folder}/num_mol_not_solved.csv'
df_results_grouped.to_csv(num_mol_not_solved_csv_path, index=False)

In [16]:
import plotly.express as px
# Box plot of the `value` column (solution time), one box per algorithm.
fig = px.box(
    df_result,
    x="algorithm",
    y="value",
    labels={"value": "Time to first solution"},
    width=1000,
    height=600,
)
# No x-axis title.
fig.update_layout(xaxis_title=None)
# Show the short display labels and order the boxes as listed in `labelalias`.
algorithm_order = list(labelalias)
fig.update_xaxes(labelalias=labelalias, categoryorder='array', categoryarray=algorithm_order)
# Export the figure as a PDF, then render it inline.
boxplot_pdf_path = f'{output_folder}/Boxplot_time_first_solution.pdf'
fig.write_image(boxplot_pdf_path)
fig.show()