In [1]:
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload 
# Mode 2: reload modules automatically before each cell is executed.
%autoreload 2

In [8]:
from matbench.bench import MatbenchBenchmark
from matbench.constants import CLF_KEY
from gptchem.gpt_regressor import GPTRegressor
from gptchem.tuner import Tuner

from loguru import logger
import matplotlib.pyplot as plt
# Turn on loguru output for the "gptchem" package; its DEBUG messages
# (e.g. from gptchem.tuner) then show up in the cell outputs below.
logger.enable("gptchem")

In [4]:
import decorator, time

def retry(howmany, *exception_types, **kwargs):
    """Build a decorator that re-runs a function when it raises.

    Args:
        howmany: Maximum number of attempts. With a value below 1 the wrapped
            function is never called and the wrapper returns ``None``.
        *exception_types: Exception classes that trigger another attempt.
            When none are given, any ``Exception`` does. Exceptions of other
            types propagate immediately.
        **kwargs: Only ``timeout`` is read: seconds to sleep between attempts
            (default ``0.0``); ``None`` disables sleeping.

    Returns:
        A decorator. The decorated function returns the result of the first
        successful attempt.

    Raises:
        The exception of the final attempt, once all ``howmany`` attempts have
        failed. Earlier failures are only printed. (Falling out of the loop and
        returning ``None`` would make an exhausted retry look like success.)
    """
    # Local import so the block is self-contained; functools.wraps keeps the
    # wrapped function's name/docstring and exposes it via __wrapped__.
    import functools

    timeout = kwargs.get('timeout', 0.0) # seconds
    caught = exception_types or (Exception,)

    def tryIt(func):
        @functools.wraps(func)
        def wrapper(*fargs, **fkwargs):
            for attempt in range(1, howmany + 1):
                try:
                    return func(*fargs, **fkwargs)
                except caught as e:
                    print(e)
                    if attempt == howmany:
                        # Out of attempts: surface the failure to the caller.
                        raise
                    # Pause only when another attempt will follow.
                    if timeout is not None:
                        time.sleep(timeout)
        return wrapper
    return tryIt

In [11]:
# Build the benchmark restricted to two tasks instead of the full suite.
# NOTE(review): autoload=True presumably loads the datasets at construction time;
# the later "already loaded; not reloading" log line is consistent with that — confirm.
mb = MatbenchBenchmark(
    autoload=True,
    subset=[
        "matbench_expt_gap",
        "matbench_steels",
    ],
)

2023-01-27 14:51:36 INFO     Initialized benchmark 'matbench_v0.1' with 2 tasks: 
['matbench_expt_gap', 'matbench_steels']


In [12]:
@retry(3, timeout=5)
def train_test_fold(task, fold):
    """Fit a GPT regressor on one fold of ``task`` and record its test predictions.

    The whole body, fitting included, is wrapped in ``retry(3, timeout=5)``,
    so it is attempted up to three times with a 5 second pause in between.

    Args:
        task: Benchmark task providing ``metadata['target']``,
            ``get_train_and_val_data``, ``get_test_data`` and ``record``.
        fold: Fold identifier, passed through unchanged to the task methods.

    Returns:
        None. The predictions are stored via ``task.record(fold, ...)``.
    """
    # Model setup: the tuner carries the fixed fine-tuning hyperparameters.
    tuner = Tuner(n_epochs=8, learning_rate_multiplier=0.02, wandb_sync=False)
    model = GPTRegressor(task.metadata['target'], tuner)

    # Fit on the combined train+validation split of this fold.
    features, targets = task.get_train_and_val_data(fold)
    model.fit(features, targets.values)

    # Predict on the held-out inputs (targets excluded) and store the result.
    held_out = task.get_test_data(fold, include_target=False)
    fold_predictions = model.predict(held_out)
    task.record(fold, fold_predictions)

In [26]:
# Run every fold that has no recorded predictions yet, plot the distribution of
# its training targets, then print the task's mean MAE.
for task in mb.tasks:
    task.load()

    for fold_ind, fold in enumerate(task.folds):
        # Folds that already have predictions are skipped, so re-running this
        # cell does not repeat them.
        if task.is_recorded[fold_ind]:
            print(f"Skipping fold {fold_ind} of {task.dataset_name}")
            continue
        train_test_fold(task, fold)
        if not task.is_recorded[fold_ind]:
            # train_test_fold came back without recording predictions; say so
            # here instead of leaving the gap to be discovered at scoring time.
            print(f"Fold {fold_ind} of {task.dataset_name} was not recorded")
        train_inputs, train_outputs = task.get_train_and_val_data(fold)

        # One histogram per fold; title and axis labels make the figures
        # distinguishable when several are rendered below the cell.
        fig, ax = plt.subplots()
        ax.hist(train_outputs)
        ax.set(
            title=f"{task.dataset_name} fold {fold_ind}: training targets",
            xlabel=task.metadata['target'],
            ylabel="count",
        )

    print(f"{task.dataset_name}: MAE  {task.scores['mae']['mean']}")

2023-01-27 14:57:39 INFO     Dataset matbench_expt_gap already loaded; not reloading dataset.


Upload progress: 100%|██████████| 393k/393k [00:00<00:00, 329Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/04_regression/matbench/out/20230127_145739/train.jsonl: file-ysZj5IeDuZBXSgW0Wtzaw44z


2023-01-27 14:57:42.364 | DEBUG    | gptchem.tuner:tune:186 - Requested fine tuning. {
  "created_at": 1674827862,
  "events": [
    {
      "created_at": 1674827862,
      "level": "info",
      "message": "Created fine-tune: ft-7PRf4QcKSGT0yUWChuZAS7mZ",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.02,
    "n_epochs": 8,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-7PRf4QcKSGT0yUWChuZAS7mZ",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 392808,
      "created_at": 1674827861,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/04_regression/matbench/out/20230127_145739/train.jsonl",
      "id": "file-ysZj5IeDuZBXSgW0Wtzaw44z",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "

internal error {
    "error": {
        "message": "internal error",
        "type": "invalid_request_error",
        "param": null,
        "code": null
    }
}
 500 {'error': {'message': 'internal error', 'type': 'invalid_request_error', 'param': None, 'code': None}} {'Date': 'Fri, 27 Jan 2023 14:54:34 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '147', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': 'd012d34a122aed56a8ee24feec563c94', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}


Upload progress: 100%|██████████| 393k/393k [00:00<00:00, 399Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/04_regression/matbench/out/20230127_155439/train.jsonl: file-myBkZRBJPLAv3i83VtpyRFcG


2023-01-27 15:54:42.349 | DEBUG    | gptchem.tuner:tune:186 - Requested fine tuning. {
  "created_at": 1674831282,
  "events": [
    {
      "created_at": 1674831282,
      "level": "info",
      "message": "Created fine-tune: ft-O6Nxt2mjBm0wMNI7I93B9Rnx",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.02,
    "n_epochs": 8,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-O6Nxt2mjBm0wMNI7I93B9Rnx",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 392808,
      "created_at": 1674831281,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/04_regression/matbench/out/20230127_155439/train.jsonl",
      "id": "file-myBkZRBJPLAv3i83VtpyRFcG",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "

In [None]:
# Persist the benchmark object to disk, relative to the notebook's working directory.
# NOTE(review): the ".json.gz" name suggests gzip-compressed JSON — confirm against matbench.
mb.to_file("gpt_bench.json.gz")