In [1]:
# Enable the autoreload extension in mode 2 so that imported modules are
# reloaded before each cell runs (edits to the local package are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split

from gptchem.data import get_polymer_data
from gptchem.evaluator import PolymerKLDivBenchmark, polymer_string2performance, string_distances
from gptchem.formatter import InverseDesignFormatter

In [3]:
# Load the polymer dataset through gptchem's data helper. Later cells slice
# the result with `.iloc`, so it is expected to be DataFrame-like.
data = get_polymer_data()

In [4]:
# Configure the inverse-design formatter: the "string" column supplies the
# polymer representation and "deltaGmin" is the property column.
# NOTE(review): "adsorption_energy" is presumably the human-readable name used
# for that property in the formatted text — confirm against
# InverseDesignFormatter.
formatter = InverseDesignFormatter(
    representation_column="string",
    property_columns=["deltaGmin"],
    property_names=["adsorption_energy"],
)

In [5]:
# Apply the formatter to the full dataset. The result is indexed with `.iloc`
# and exposes a "label" column in the cells that follow.
formatted = formatter(data)

In [6]:
# Evaluate the label of the first formatted row and display the returned dict
# (its keys, as shown in the output, include the composition, SMILES,
# prediction and a feature table).
polymer_string2performance(formatted.iloc[0]["label"])

{'monomer_squence': 'W-A-B-W-W-A-A-A-R-W-B-B-R-R-B-R',
 'composition': {'W': 4, 'A': 4, 'B': 4, 'R': 4},
 'smiles': '[W][Ta][Tr][W][W][Ta][Ta][Ta][R][W][Tr][Tr][R][R][Tr][R]',
 'prediction': array([-6.1970377], dtype=float32),
 'features':    head_tail_[W]  head_tail_[Tr]  head_tail_[Ta]  head_tail_[R]  \
 0              1               0               0              1   
 
    total_clusters  num_[W]  max_[W]  min_[W]  mean_[W]  num_[Tr]  ...   [W]  \
 0               4     0.25        2        2       2.0      0.25  ...  0.25   
 
    [Tr]  [Ta]   [R]  rel_shannon  length  total_solvent  std_solvent  \
 0  0.25  0.25  0.25          0.5      16            480     3.535534   
 
    total_surface  std_surface  
 0            400          5.0  
 
 [1 rows x 31 columns]}

In [39]:
# Build the PolymerKLDivBenchmark from rows 80 onward of the dataset.
# NOTE(review): the meaning of the second positional argument (20) is not
# visible here — check the class signature and consider passing it by keyword.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; verify the notebook with Restart Kernel -> Run All.
kldiv_benchmark = PolymerKLDivBenchmark(data.iloc[80:], 20)

In [40]:
# Score the first 40 rows against the benchmark. This slice does not overlap
# the rows (80 onward) passed to the benchmark constructor.
kldiv_benchmark.score(data.iloc[0:40])

0.5348989420975668

In [7]:
# Interactive help lookup for string_distances (signature and docstring).
# NOTE(review): development-time introspection; it is not needed to reproduce
# the results and can be dropped from a finished notebook.
?string_distances

Signature: string_distances(training_set: Collection[str], query_string: str) -> dict
Docstring:
Calculate the distances between the query string and the training set.

Args:
    training_set (Collection[str]): The training set
    query_string (str): The query string

Returns:
    dict: A dictionary with the distances, the min, max, mean and the expected length

Example:
    >>> training_set = ["AAA", "BBB", "CCC"]
    >>> query_string = "BBB"
    >>> result = string_distances(training_set, query_string)
    assert result["NormalizedLevenshtein_min"] == 0.0
    assert result["NormalizedLevenshtein_max"] == 1.0
File:      ~/git/kjappelbaum/gptchem/src/gptchem/evaluator.py
Type:      function


In [8]:
# Compare the label of row 1 against the labels of the first ten rows.
# The query row is itself part of that reference slice, so a minimum distance
# of 0.0 is expected here rather than informative.
string_distances(formatted.iloc[0:10]["label"], formatted.iloc[1]["label"])

{'Levenshtein_min': 0.0,
 'Levenshtein_max': 18.0,
 'Levenshtein_mean': 12.4,
 'Levenshtein_std': 4.543126676640219,
 'NormalizedLevenshtein_min': 0.0,
 'NormalizedLevenshtein_max': 0.5142857142857142,
 'NormalizedLevenshtein_mean': 0.3734562211981567,
 'NormalizedLevenshtein_std': 0.13521010617991372,
 'LongestCommonSubsequence_min': 0.0,
 'LongestCommonSubsequence_max': 26.0,
 'LongestCommonSubsequence_mean': 19.6,
 'LongestCommonSubsequence_std': 6.916646586316233}