In [None]:
# | default_exp output


In [None]:
# |export
import re
from collections import Counter, defaultdict
from typing import Iterable

import numpy as np
from nbdev.showdoc import *
from strsimpy.levenshtein import Levenshtein
from strsimpy.longest_common_subsequence import LongestCommonSubsequence
from strsimpy.normalized_levenshtein import NormalizedLevenshtein


from sklearn.metrics import r2_score, max_error, mean_absolute_error, mean_squared_error

# Reusable code for analyzing results

> Analyze the outputs of the models


To measure how different our outputs are from the input data, we'll use string distances.


In [None]:
# |export
def string_distances(training_set: Iterable[str], query_string: str):
    """Summarise string distances between `query_string` and a training set.

    For every string in `training_set`, the `distance` method of each metric
    (Levenshtein, NormalizedLevenshtein, LongestCommonSubsequence) is called
    with the training string and `query_string`. The per-metric lists of
    distances are then reduced with min, max, mean and std.

    Returns a dict keyed by ``"<metric>_<aggregation>"``. If `training_set`
    yields no strings, the returned dict is empty.
    """
    scorers = {
        "Levenshtein": Levenshtein(),
        "NormalizedLevenshtein": NormalizedLevenshtein(),
        "LongestCommonSubsequence": LongestCommonSubsequence(),
    }
    reducers = {
        "min": np.min,
        "max": np.max,
        "mean": np.mean,
        "std": np.std,
    }

    # Collect one list of distances per metric; the training set is walked
    # exactly once so that one-shot iterables are supported.
    per_metric = defaultdict(list)
    for reference in training_set:
        for label, scorer in scorers.items():
            per_metric[label].append(scorer.distance(reference, query_string))

    return {
        f"{label}_{stat}": reduce_fn(values)
        for label, values in per_metric.items()
        for stat, reduce_fn in reducers.items()
    }


In [None]:
# |hide
# Smoke test for string_distances: the query equals one training string
# ("BBB") and shares no characters with the other two.
training_set = ["AAA", "BBB", "CCC"]
query_string = "BBB"
result = string_distances(training_set, query_string)

# The exact match is expected to give the smallest normalized distance (0.0) ...
assert result["NormalizedLevenshtein_min"] == 0.0
# ... and the strings with no characters in common the largest (1.0).
assert result["NormalizedLevenshtein_max"] == 1.0


## Polymers

> Code specific to the polymer test case


In [None]:
# |export

def convert2smiles(string):
    """Translate a single-letter monomer string into its bracketed form.

    Each "A", "B", "W" and "R" is replaced by "[Ta]", "[Tr]", "[W]" and "[R]"
    respectively, and every "-" separator is dropped. All other characters
    are passed through unchanged.
    """
    # None of the replacement tokens contain a character that is itself
    # replaced, so a single per-character translation is sufficient.
    table = str.maketrans(
        {
            "A": "[Ta]",
            "B": "[Tr]",
            "W": "[W]",
            "R": "[R]",
            "-": None,
        }
    )
    return string.translate(table)


To train the model, we simply use single letters, without any special characters such as brackets.


In [None]:
# Example: every single-letter monomer is expanded to its bracketed token.
convert2smiles("AWWRRA")


'[Ta][W][W][R][R][Ta]'

To get the composition from the prompt, we read the number that directly precedes each monomer label in the prompt text (e.g. `3 A` means three `A` monomers).


In [None]:
# |export
def get_num_monomer(string, monomer):
    """Return the integer that directly precedes `monomer` in `string`.

    The prompt is searched for the first occurrence of ``"<digits> <monomer>"``
    (digits, one space, then the monomer label) and the digits are returned
    as an int. If no such pattern is present, 0 is returned.

    Parameters:
        string: text to search, e.g. ``"Polymer with 3 A, 5 B and 0 C"``.
        monomer: monomer label to look for, e.g. ``"A"``.
    """
    # `(\d+)` captures the whole run of digits. The previous `([\d+])` was a
    # character class that captured a single character only, so "13 A" was
    # read as 3. The label is escaped so that it is always matched literally.
    pattern = rf"(\d+) {re.escape(str(monomer))}"
    match = re.search(pattern, string)
    if match is None:
        return 0
    return int(match.group(1))


In [None]:
# Example: read the count that precedes the monomer label "A" in a prompt.
get_num_monomer("Polymer with 3 A, 5 B and 0 C", "A")


3

In [None]:
# |export
def get_prompt_compostion(prompt):
    """Build the monomer composition stated in `prompt`.

    Returns a dict with the keys "R", "W", "A" and "B" (in that order), each
    mapped to the value `get_num_monomer` extracts for it from the prompt.
    """
    monomer_labels = ("R", "W", "A", "B")
    return {label: get_num_monomer(prompt, label) for label in monomer_labels}


In [None]:
# |export

def get_target(string, target_name="adsorption"):
    """Return the integer that directly precedes `target_name` in `string`.

    The text is searched for ``"<digits> <target_name>"`` (digits, one space,
    then the target name) and the first hit is returned as an int. Only
    unsigned integers are recognised.

    Parameters:
        string: text to search.
        target_name: label that follows the number of interest.

    Raises:
        IndexError: if the pattern does not occur in `string`.
    """
    # `(\d+)` captures the complete number. The previous `([\d+])` matched a
    # single character only and therefore truncated multi-digit targets.
    pattern = rf"(\d+) {re.escape(str(target_name))}"
    matches = re.findall(pattern, string)
    if not matches:
        # Same exception type as indexing the empty match list, but with a
        # message that says what was being looked for.
        raise IndexError(f"no '<number> {target_name}' found in {string!r}")
    return int(matches[0])


In [None]:
# |export

def get_prompt_data(prompt):
    """Extract the composition and the target value from `prompt`.

    Returns a 2-tuple ``(composition, target)`` where `composition` comes
    from `get_prompt_compostion` and `target` from `get_target` called with
    its default target name.
    """
    return get_prompt_compostion(prompt), get_target(prompt)


In [None]:
# |export

def get_completion_composition(string):
    """Count how often each "-"-separated token occurs in `string`.

    Returns a plain dict mapping token -> number of occurrences, ordered by
    first appearance. An empty string yields ``{"": 1}`` because splitting it
    produces a single empty token.
    """
    tally = {}
    for token in string.split("-"):
        tally[token] = tally.get(token, 0) + 1
    return tally


In [None]:
# |export

def string2performance(string):
    """Parse a model completion and predict its performance.

    Steps:
      1. Take the text before the first "@" and strip surrounding whitespace.
      2. Extract the first run of monomer letters (R, W, A, B) and "-".
      3. Count the monomers in that run and convert the stripped completion
         to its bracketed form.
      4. Featurize that bracketed string and query the model.

    Returns a dict with the keys "monomer_squence" (sic), "composition",
    "smiles" and "prediction". The misspelled key is kept as-is because
    callers may read it.

    Raises:
        IndexError: if the completion contains no monomer letters or "-".
    """
    # NOTE(review): `pd`, `featurize_many`, `DELTA_G_MODEL` and `FEATURES` are
    # not defined or imported in the visible part of this notebook; confirm
    # they are in scope wherever this function is used.

    predicted_monomer_sequence = string.split("@")[0].strip()
    # Character class of the four monomer letters plus the "-" separator.
    # The previous pattern put an alternation inside `[...]`, which made the
    # literal characters "(", "|" and ")" part of the accepted set, and used
    # `\-` in a non-raw string (an invalid escape sequence).
    monomer_sq = re.findall(r"[RWAB\-]+", predicted_monomer_sequence)[0]
    composition = get_completion_composition(monomer_sq)
    # NOTE(review): the SMILES-like string is built from the whole stripped
    # completion, not from `monomer_sq`; confirm that this is intended.
    smiles = convert2smiles(predicted_monomer_sequence)

    features = pd.DataFrame(featurize_many([smiles]))
    prediction = DELTA_G_MODEL.predict(features[FEATURES])
    return {
        "monomer_squence": monomer_sq,
        "composition": composition,
        "smiles": smiles,
        "prediction": prediction,
    }


In [None]:
# |export

def composition_mismatch(composition: dict, found: dict):
    """Compare an expected monomer composition with the one actually found.

    Every key that occurs in either dict is considered; a key missing from
    one side counts as 0 on that side.

    Parameters:
        composition: expected counts, keyed by monomer.
        found: observed counts, keyed by monomer.

    Returns a dict with:
        "distances": absolute per-key count differences, ordered by the
            string form of the key,
        "min", "max", "mean": aggregates over "distances" (0 when both
            inputs are empty),
        "expected_len": sum of the expected counts,
        "found_len": sum of the found counts.
    """
    # Union, not intersection: a key may be present in only one of the two
    # dicts, and such a key must still contribute to the mismatch. Sorting
    # makes the order of "distances" independent of set iteration order.
    all_keys = sorted(set(composition) | set(found), key=str)

    expected_counts = [composition.get(key, 0) for key in all_keys]
    found_counts = [found.get(key, 0) for key in all_keys]
    distances = [
        np.abs(expected - observed)
        for expected, observed in zip(expected_counts, found_counts)
    ]

    # np.min / np.max raise on an empty sequence; two empty compositions
    # simply have no mismatch.
    has_keys = bool(distances)
    return {
        "distances": distances,
        "min": np.min(distances) if has_keys else 0,
        "max": np.max(distances) if has_keys else 0,
        "mean": np.mean(distances) if has_keys else 0,
        "expected_len": sum(expected_counts),
        "found_len": sum(found_counts),
    }


In [None]:
# |export

def get_regression_metrics(y_true, y_pred):
    """Evaluate a set of regression metrics for `y_pred` against `y_true`.

    Returns a dict with the keys "r2", "max_error", "mean_absolute_error"
    and "mean_squared_error", each holding the result of calling the
    corresponding scoring function with ``(y_true, y_pred)``.
    """
    scorers = (
        ("r2", r2_score),
        ("max_error", max_error),
        ("mean_absolute_error", mean_absolute_error),
        ("mean_squared_error", mean_squared_error),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [None]:
# Sanity check: predictions identical to the ground truth.
get_regression_metrics(
    [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]
)

{'r2': 1.0,
 'max_error': 0,
 'mean_absolute_error': 0.0,
 'mean_squared_error': 0.0}