# Single-output tuning for MOFs

In [2]:
%reload_ext autoreload
%autoreload 2

import os
import time

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pycm import ConfusionMatrix
import wandb

from gpt3forchem.mofs.data import get_data
from gpt3forchem.mofs.baseline.classification import XGBClassificationBaseline
from gpt3forchem.mofs.constants import TARGETS, FEATURES, CAT_TARGETS, TEXT
from gpt3forchem.mofs.create_prompts import create_single_property_forward_prompts
from gpt3forchem.fine_tune import fine_tune
from gpt3forchem.query_model import query_gpt3, extract_prediction

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [3]:
# Load the full dataset through the project's MOF data helper; every later
# cell derives its train/test frames from `df`.
df = get_data()

In [4]:
# Total number of entries returned by get_data(), before any splitting.
len(df)

3375

### Small data (N=100) setting

In [5]:
# Keep 100 points for training and hold out the rest for testing.
# The split is stratified on the first categorical target, and random_state is
# fixed so that Restart & Run All reproduces the same split (and therefore the
# same baseline / fine-tuning numbers) every time.
train_df, test_df = train_test_split(
    df,
    train_size=100,
    stratify=df[CAT_TARGETS[0]],
    random_state=42,
)
train_size = len(train_df)
test_size = len(test_df)
print(f"{train_size} train points and {test_size} test points")


100 train points and 3275 test points


#### Tune and test a baseline

In [7]:
# Open a Weights & Biases run for the baseline and record the split sizes,
# the feature list and the single target being predicted.
baseline_run_config = {
    "train_size": train_size,
    "test_size": test_size,
    "features": FEATURES,
    "targets": [CAT_TARGETS[0]],
    "ds": "mof",
}
run = wandb.init(project="gpt-3", job_type="train-baseline-model", config=baseline_run_config)


[34m[1mwandb[0m: Currently logged in as: [33mkjappelbaum[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
# Instantiate the project's XGBoost classification baseline.
# NOTE(review): 43 is presumably a random seed — confirm against the
# XGBClassificationBaseline constructor and consider naming it in a config cell.
baseline = XGBClassificationBaseline(43)


In [9]:
# Tune the baseline on the training split only (the test split is not passed in).
# NOTE(review): tune() is given a DataFrame here, whereas fit()/predict() further
# down are given `.values` arrays — confirm the baseline accepts both.
baseline.tune(train_df[FEATURES], train_df[CAT_TARGETS[0]])


[I 2022-06-27 09:09:41,564] A new study created in memory with name: no-name-a2b7a4b0-8ba4-4d23-adee-d091500a9e04
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.05),
  "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 1),
  "subsample": trial.suggest_loguniform("subsample", 0.00001, 1),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 10.0),
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
[I 2022-06-27 09:13:11,748] Trial 0 finished with value: 0.07352913752913753 and parameters: {'n_estimators': 1154, 'max_depth': 63, 'learning_rate': 0.0016851060855600127, 'colsample_bytree': 0.2945743468274923, 'subsample': 0.0004322104659169873, 'alpha': 0.5398056408643329, 'lambda': 0.009881250696359372}. Best is trial 0 with value: 0.07352913752913753.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.05),
  "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 1),
  "subsample":

In [None]:
# Fit the baseline on the training features, then predict the target for
# every held-out test point.
train_features = train_df[FEATURES].values
train_labels = train_df[CAT_TARGETS[0]]
baseline.fit(train_features, train_labels)

test_features = test_df[FEATURES].values
predictions = baseline.predict(test_features)


In [None]:
# Log a confusion-matrix plot of the baseline predictions to the W&B run.
# True and predicted labels are both mapped through the baseline's label
# encoder before being handed to wandb.
# NOTE(review): class_names is hard-coded; this is only correct if the encoder
# assigns indices 0..4 in exactly this order. If label_encoder is an sklearn
# LabelEncoder it orders classes alphabetically — confirm.
encoder = baseline.label_encoder
true_ids = encoder.transform(test_df[CAT_TARGETS[0]].values)
predicted_ids = encoder.transform(predictions)
confusion_plot = wandb.plot.confusion_matrix(
    y_true=true_ids,
    preds=predicted_ids,
    class_names=["very small", "small", "medium", "large", "very large"],
)
run.log({"distribution confusion matrix": confusion_plot})


In [None]:
# Build a pycm confusion matrix from the true test labels and the baseline
# predictions; the metric cells below read their values from `cm`.
actual_labels = test_df[CAT_TARGETS[0]].values
cm = ConfusionMatrix(actual_labels, predictions)


In [None]:
# Send the summary accuracy / F1 metrics from the pycm confusion matrix to W&B.
baseline_metrics = {
    "ACC": cm.ACC,
    "ACC_macro": cm.ACC_Macro,
    "F1_micro": cm.F1_Micro,
    "F1_macro": cm.F1_Macro,
}
run.log(baseline_metrics)


In [None]:
 # Display the same metrics that were just logged to W&B, so they are also
 # visible inline in the notebook.
 {"ACC": cm.ACC, "ACC_macro": cm.ACC_Macro, "F1_micro": cm.F1_Micro, "F1_macro": cm.F1_Macro}

In [None]:
# Close the baseline W&B run before moving on to the GPT-3 fine-tuning section.
run.finish()


#### Fine-tune GPT-3

In [None]:
# Turn the train and test frames into prompt frames for the first categorical
# target. The same target and the same TEXT entry are used for both splits.
# NOTE(review): TEXT[1] presumably selects which textual MOF representation goes
# into the prompt — confirm against gpt3forchem.mofs.constants.
prompt_target = CAT_TARGETS[0]
prompt_text_column = TEXT[1]
train_prompts = create_single_property_forward_prompts(train_df, prompt_target, prompt_text_column)
valid_prompts = create_single_property_forward_prompts(test_df, prompt_target, prompt_text_column)


In [None]:
# Write the prompt frames to timestamped JSONL files under run_files/.
# The timestamp prefix keeps repeated runs from overwriting each other.

# to_json does not create missing parent directories, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here).
os.makedirs("run_files", exist_ok=True)

filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = f"run_files/{filename_base}_train_prompts_mof_{train_size}.jsonl"
valid_filename = f"run_files/{filename_base}_valid_prompts_mof_{test_size}.jsonl"
# to save money, just run a small valid frame
valid_small_filename = f"run_files/{filename_base}_validsmall_prompts_mof_{test_size}.jsonl"

train_prompts.to_json(train_filename, orient="records", lines=True)
valid_prompts.to_json(valid_filename, orient="records", lines=True)

# Draw at most 100 validation prompts: sample() raises if asked for more rows
# than exist, and the fixed random_state makes the subset reproducible.
valid_small_prompts = valid_prompts.sample(min(100, len(valid_prompts)), random_state=42)
valid_small_prompts.to_json(valid_small_filename, orient="records", lines=True)


In [None]:
# Show the path of the training prompt file that will be passed to fine_tune.
train_filename

In [None]:
# Launch the fine-tuning job with the training prompts and the validation prompts.
# NOTE(review): this passes the full validation file; the smaller
# `valid_small_filename` written earlier is not used in the cells visible here —
# confirm which file is intended for the fine-tuning job.
fine_tune(train_filename, valid_filename)