## Split data 

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Path of the HDF5 file that stores the precomputed features and targets.
# The value below is a placeholder and must be replaced with a real path.
file_save_h5 = 'SAVED H5PY FILE'

# Read the "embeddings" (features) and "labels" (targets) datasets in full.
with h5py.File(str(file_save_h5), "r") as hf:
    X = hf["embeddings"][:]
    y = hf["labels"][:]

# One seed for both splits so the whole train/valid/test partition is
# reproducible from run to run.
split_seed = 22

# First split: 80 % for training, 20 % held back.
X_train, X_rem, y_train, y_rem = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=split_seed,
)

# Second split: the held-back 20 % is divided into validation and test parts.
# `test_size` is the fraction of the remainder that goes to the test set
# (0.5 -> 10 % validation / 10 % test of the full data).
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=test_size,
    random_state=split_seed,
)

## Auto-sklearn

In [None]:
import autosklearn.regression
import time
import matplotlib.pyplot as plt

import sklearn
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

from smac.tae import StatusType

In [None]:
def get_runhistory_models_performance(automl):
    """Tabulate the scores of every successfully evaluated run.

    Parameters
    ----------
    automl
        Fitted estimator exposing ``automl_._metric`` (with ``_optimum`` and
        ``_sign`` attributes) and ``automl_.runhistory_.data`` (a mapping whose
        values carry ``status``, ``endtime``, ``cost`` and ``additional_info``).

    Returns
    -------
    pandas.DataFrame
        One row per successful run with the columns ``Timestamp``,
        ``single_best_optimization_score``, ``single_best_test_score`` and
        ``single_best_train_score``. Empty when no run succeeded.

    Notes
    -----
    ``additional_info`` is indexed with ``'test_loss'`` and ``'train_loss'``;
    a run lacking either entry raises at that lookup.
    """
    scorer = automl.automl_._metric
    run_history = automl.automl_.runhistory_.data

    def loss_to_score(loss):
        # Losses are stored relative to the metric's optimum; undo that here.
        return scorer._optimum - (scorer._sign * loss)

    rows = []
    for record in run_history.values():
        # Only runs that finished successfully contribute a row.
        if record.status != StatusType.SUCCESS:
            continue
        # End time rendered in local time at one-second resolution.
        # (``record.starttime`` is available as well if the start is wanted.)
        finished_at = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(record.endtime))
        rows.append({
            'Timestamp': pd.Timestamp(finished_at),
            'single_best_optimization_score': loss_to_score(record.cost),
            'single_best_test_score': loss_to_score(
                record.additional_info['test_loss']),
            'single_best_train_score': loss_to_score(
                record.additional_info['train_loss']),
        })
    return pd.DataFrame(rows)

In [None]:
# Search configuration for the auto-sklearn regression run.
hr = 60 * 60  # seconds per hour
task_name = "yield_TAPE_30min_ensemble2_80training_10test_10valid_noCV"

# Overall budget: half an hour, expressed in whole seconds.
time_left_for_this_task_in_seconds = int(hr / 2)

automl = autosklearn.regression.AutoSklearnRegressor(
    # Working directory is named after the task.
    tmp_folder=f"{task_name}/",
    # Total budget for the search and a cap of 60 s per single run.
    time_left_for_this_task=time_left_for_this_task_in_seconds,
    per_run_time_limit=60,
    # n_jobs=-1 and memory_limit=None: presumably "all cores" and
    # "no memory cap" -- confirm against the auto-sklearn API reference.
    n_jobs=-1,
    memory_limit=None,
)

In [None]:
"""automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=time_left_for_this_task_in_seconds,
    tmp_folder=f"{task_name}/",
    n_jobs=-1,
    memory_limit=None,
    resampling_strategy="cv",
    ensemble_size = 5,
    resampling_strategy_arguments={"folds": 20},
    include_preprocessors=[
        "no_preprocessing",
    ],
)"""

In [None]:
# Run the model search on the training split. The test split is handed over as
# X_test / y_test (presumably what populates the 'test_loss' entries read by
# get_runhistory_models_performance -- confirm against the auto-sklearn docs).
# The run is registered under `task_name`.
automl.fit(X_train, y_train, X_test=X_test, y_test=y_test, dataset_name=task_name)
# Disabled: refit on the training data (presumably only needed when a
# cross-validation resampling strategy is used -- TODO confirm).
#automl.refit(X_train.copy(), y_train.copy())

In [None]:
# --- Ensemble: turn the raw history into a "best so far" curve -------------
# Each row is replaced by the row with the highest optimization score seen up
# to and including that point.
ensemble_performance_frame = pd.DataFrame(automl.automl_.ensemble_performance_history)
best_values = pd.Series({'ensemble_optimization_score': -np.inf,
                         'ensemble_test_score': -np.inf})
for idx in ensemble_performance_frame.index:
    if (
        ensemble_performance_frame.loc[idx, 'ensemble_optimization_score']
        > best_values['ensemble_optimization_score']
    ):
        best_values = ensemble_performance_frame.loc[idx]
    ensemble_performance_frame.loc[idx] = best_values

# --- Single models: same "best so far" transformation ----------------------
individual_performance_frame = get_runhistory_models_performance(automl)
best_values = pd.Series({'single_best_optimization_score': -np.inf,
                         'single_best_test_score': -np.inf,
                         'single_best_train_score': -np.inf})
for idx in individual_performance_frame.index:
    if (
        individual_performance_frame.loc[idx, 'single_best_optimization_score']
        > best_values['single_best_optimization_score']
    ):
        best_values = individual_performance_frame.loc[idx]
    individual_performance_frame.loc[idx] = best_values

# Align both curves on a common time axis. The outer merge leaves gaps where
# only one of the two frames has a row for a timestamp; after sorting by time
# those gaps are filled with the most recent known value.
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in pandas 2.x.
pd.merge(
    ensemble_performance_frame,
    individual_performance_frame,
    on="Timestamp", how='outer'
).sort_values('Timestamp').ffill().plot(
    x='Timestamp',
    kind='line',
    legend=True,
    # The estimator is a regressor, so the curves are metric scores rather
    # than classification accuracy.
    title='Auto-sklearn score over time',
    grid=True,
)
plt.show()

In [None]:
# Score the fitted model on each split with R2. The *_predictions arrays stay
# at module level because the plotting and correlation cells reuse them.
train_predictions = automl.predict(X_train)
train_r2 = sklearn.metrics.r2_score(y_train, train_predictions)
print("Train R2 score:", train_r2)

test_predictions = automl.predict(X_test)
test_r2 = sklearn.metrics.r2_score(y_test, test_predictions)
print("Test R2 score:", test_r2)

valid_predictions = automl.predict(X_valid)
valid_r2 = sklearn.metrics.r2_score(y_valid, valid_predictions)
print("Validation R2 score:", valid_r2)

In [None]:
# Predicted-vs-true scatter, one colour per split, drawn in train/test/valid
# order so later splits are painted on top of earlier ones.
for split_predictions, split_targets, split_label, split_colour in (
    (train_predictions, y_train, "Train", '#00BDE3'),
    (test_predictions, y_test, "Test", '#b734eb'),
    (valid_predictions, y_valid, "Valid", '#fc9003'),
):
    plt.scatter(split_predictions, split_targets, label=split_label, c=split_colour)

plt.xlabel("Predicted yield (log(ug/ml))")
plt.ylabel("True yield (log(ug/ml))")
plt.legend()

# Optional reference line and fixed axis limits, currently disabled:
#plt.plot([1.5, 5.0], [0.5, 6.0], c='k', zorder=0)
#plt.xlim([1.5, 5.0])
#plt.ylim([0.5, 6.0])

plt.tight_layout()
# NOTE(review): the file name mentions "8hrs", "ESM" and "ensemble5" while the
# task name configured for this run mentions "30min", "TAPE" and "ensemble2"
# -- confirm the name is still the intended one.
plt.savefig('yield_log_autoML_best_ensemble_8hrs_ESM_ensemble5.png', dpi=300)
plt.show()

In [None]:
# Correlation between predictions and targets for each split, read from the
# off-diagonal entry [0, 1] of the matrix returned by np.corrcoef.
train_corr_matrix = np.corrcoef(train_predictions, y_train)
print('Train correlation:', train_corr_matrix[0, 1])

test_corr_matrix = np.corrcoef(test_predictions, y_test)
print('Test correlation:', test_corr_matrix[0, 1])

valid_corr_matrix = np.corrcoef(valid_predictions, y_valid)
print('Validation correlation:', valid_corr_matrix[0, 1])

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between predictions and targets; the p-value returned
# alongside the coefficient is discarded.
corr_train, _ = pearsonr(train_predictions, y_train)
print('Pearsons correlation for training set: %.3f' % corr_train)
corr_test, _ = pearsonr(test_predictions, y_test)
# Label corrected: this value belongs to the test split, not the training split.
print('Pearsons correlation for test set: %.3f' % corr_test)

In [None]:
# Build a new ensemble with ensemble_size=50 (presumably from the models
# already evaluated during the search -- confirm against the auto-sklearn docs).
# The scores and plots in the cells above were produced before this call, so
# they describe the earlier ensemble.
automl.fit_ensemble(y_train, ensemble_size=50)

In [None]:
# Predict the test split again (the result is not used by any later cell in
# view) and print auto-sklearn's textual run summary.
predictions = automl.predict(X_test)
print(automl.sprint_statistics())

### Save the model and load it afterwards

In [None]:
import pickle as pkl


# Destination of the pickled estimator (relative to the working directory).
dump_file = 'property_best_ensemble_8hrs_ESM.pkl'

# Serialise the whole fitted estimator object; binary mode is required by pickle.
with open(dump_file, 'wb') as f:
    pkl.dump(automl, f)

In [None]:
# Pickle file written by the save cell.
file = 'property_best_ensemble_8hrs_ESM.pkl'

# Read from `file` (defined in this cell) rather than the save cell's
# `dump_file`, so loading does not depend on the save cell having been run.
# SECURITY: unpickling executes arbitrary code -- only load files you created
# yourself or otherwise trust.
with open(file, 'rb') as f:
    restored_automl = pkl.load(f)

# Re-run inference with the restored estimator on the train and test splits.
restored_train_pred = restored_automl.predict(X_train)
restored_test_pred = restored_automl.predict(X_test)

In [None]:
# Pearson correlation of the restored model's predictions; the p-value returned
# alongside the coefficient is discarded.
corr_train, _ = pearsonr(restored_train_pred, y_train)
print('Pearsons correlation for training set: %.3f' % corr_train)
corr_test, _ = pearsonr(restored_test_pred, y_test)
# Label corrected: this value belongs to the test split, not the training split.
print('Pearsons correlation for test set: %.3f' % corr_test)

In [None]:
# Predicted-vs-true scatter for the model restored from disk.
plt.scatter(restored_train_pred, y_train, label="Restored train samples", c='#d95f02')
plt.scatter(restored_test_pred, y_test, label="Restored test samples", c='#7570b3')
plt.xlabel("Predicted property")
plt.ylabel("True property")
plt.legend()

# Reference line for perfect prediction (true == predicted), drawn behind the
# points. It spans the displayed x-range, which lies inside the y-range, so the
# whole segment is visible. The previous end points (1.75, 0.5) -> (4, 5) traced
# the diagonal of the axes box, which is not the y = x line.
plt.plot([1.75, 4],
         [1.75, 4],
         c='k',
         zorder=0)

# Fixed axis limits; points outside these ranges are not shown.
plt.xlim([1.75, 4])
plt.ylim([0.5, 5])
plt.tight_layout()
plt.show()

In [None]:
# Show the restored model's raw predictions for the test split.
print(restored_test_pred)