# Analyze inverse predictions for polymers

In [2]:
%reload_ext autoreload
%autoreload 2

import time
import matplotlib.pyplot as plt
plt.style.use(['science', 'nature'])
from pycm import ConfusionMatrix
import wandb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd 
import re

from gpt3forchem.polymers.data import get_data
from gpt3forchem.polymers.constants import TARGETS, FEATURES, CAT_TARGETS
from gpt3forchem.polymers.create_prompts import create_single_property_inverse_prompts
from gpt3forchem.fine_tune import fine_tune
from gpt3forchem.query_model import query_gpt3, extract_prediction

In [3]:
df_train = pd.read_json('run_files/2022-06-25-17-15-56_train_inverse_prompts_polymers_2812.jsonl', lines=True)
df_test = pd.read_json('run_files/2022-06-25-17-15-56_valid_inverse_prompts_polymers_313.jsonl',  lines=True)

In [28]:
test_string = df_test['prompt'].iloc[0]
test_completion = df_test['completion'].iloc[0]

In [29]:
test_string

'what is a polymer with 2 adsorption energy and 8 A, 12 B, 10 W, and 10 R?###'

In [30]:
test_completion

' R-A-A-W-B-R-A-B-R-B-W-A-W-B-W-R-R-R-A-B-A-B-W-A-B-B-W-W-B-R-W-W-B-R-B-R-W-B-A-R@@@'

In [4]:
import re

In [5]:
test_completion = 'R-W-A-R-A-B-B-R-A-W-R-A-B-R-A-R-B-W-R-R-R-B-B-W-W-W-B-B-W'

In [7]:
re.findall("[(R|W|A|B)\-(R|W|A|B)]+", test_completion)

['R-W-A-R-A-B-B-R-A-W-R-A-B-R-A-R-B-W-R-R-R-B-B-W-W-W-B-B-W']

In [24]:
def get_num_monomer(string, monomer): 
    num = re.findall(f'([\d+]) {monomer}', string)
    try: 
        num = int(num[0])
    except Exception: 
        num = 0
    return num

In [26]:
def get_target(string): 
    num = re.findall('([\d+]) adsorption', string)
    return int(num[0])

In [32]:
def get_prompt_data(prompt): 
    composition = {}

    for monomer in ['R', "W", 'A', "B"]: 
        composition[monomer] =get_num_monomer(prompt, monomer)

    return composition, get_target(prompt)

In [33]:
get_prompt_data(test_string)

({'R': 0, 'W': 0, 'A': 8, 'B': 2}, 2)