In [52]:
import pandas as pd
# Load in data 
train_split = pd.read_pickle("pandas_dumps/compounds_split.pkl")
all_data = pd.read_csv("activity_data.csv")

In [53]:
all_data = all_data.merge(right=train_split,how="right")
# drop any record that has nan for f_avg_
all_data = all_data[all_data['f_avg_IC50'].notna()]
import numpy as np
all_data['pIC50'] = - all_data.f_avg_IC50.apply(np.log10)

In [54]:
all_data.test

0       False
1       False
2       False
3        True
4        True
        ...  
2032    False
2033    False
2034    False
2035    False
2036    False
Name: test, Length: 1901, dtype: bool

# Training on molecular fingerprints


In [55]:
from flaml import AutoML
from sklearn.datasets import fetch_california_housing
X_train_list = all_data[  ~all_data['test']]['fingerprint']
y_train = all_data[  ~all_data.test]['pIC50']
y_train = y_train.to_numpy()
X_train = []

from rdkit.DataStructs import cDataStructs
for fp in X_train_list:
    array = np.zeros((0, ), dtype=np.int8)
    cDataStructs.ConvertToNumpyArray(fp, array)
    X_train.append(array)
X_train = np.array(X_train)


In [56]:
# Initialize an AutoML instance
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    "time_budget": 1,  # in seconds
    "metric": 'r2',
    "task": 'regression',
    "log_file_name": "california.log",
}
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
           **automl_settings)
# Predict
print(automl.predict(X_train))
# Print the best model
print(automl.model.estimator)

[flaml.automl: 01-28 09:24:12] {2007} INFO - task = regression
[flaml.automl: 01-28 09:24:12] {2009} INFO - Data split method: uniform
[flaml.automl: 01-28 09:24:12] {2013} INFO - Evaluation method: holdout
[flaml.automl: 01-28 09:24:12] {2113} INFO - Minimizing error metric: 1-r2
[flaml.automl: 01-28 09:24:12] {2170} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 01-28 09:24:12] {2437} INFO - iteration 0, current learner lgbm
[flaml.automl: 01-28 09:24:13] {2551} INFO - Estimated sufficient time budget=9093s. Estimated necessary time budget=64s.
[flaml.automl: 01-28 09:24:13] {2603} INFO -  at 1.0s,	estimator lgbm's best error=0.9121,	best estimator lgbm's best error=0.9121
[flaml.automl: 01-28 09:24:13] {2437} INFO - iteration 1, current learner xgboost
[flaml.automl: 01-28 09:24:13] {2603} INFO -  at 1.6s,	estimator xgboost's best error=2.2656,	best estimator lgbm's best error=0.9121
[flaml.automl: 01-28 09:24:13] {

[-0.66429708 -0.66429708 -0.66429708 ... -1.08702136 -0.66429708
 -0.66429708]
LGBMRegressor(learning_rate=0.09999999999999995, max_bin=255, n_estimators=4,
              num_leaves=4, reg_alpha=0.0009765625, reg_lambda=1.0, verbose=-1)


# Training on features


In [92]:
# list of feature columns
featurised = pd.read_pickle("pandas_dumps/featurised_compounds.pkl")
featurised = featurised.merge(right=train_split,how="left")
feature_columns = featurised.columns[9:-4]
# X_train = featurised[  ~featurised['test']][feature_columns]
# X_train = X_train.to_numpy()
# y_train = featurised[  ~featurised['test']]['pIC50']
# y_train = y_train.to_numpy()


In [93]:
featurised.columns[9:-4]

Index(['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex',
       'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt',
       'NumValenceElectrons', 'MaxPartialCharge',
       ...
       'fr_pyridine', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone',
       'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=194)

In [50]:
X_train.max()

17.0

In [51]:
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from flaml import AutoML

set_config(display='diagram')

imputer = SimpleImputer()
standardizer = StandardScaler()
automl = AutoML()

automl_pipeline = Pipeline([
    ("imputuer",imputer),
    ("standardizer", standardizer),
    ("automl", automl)
])
automl_pipeline

# Initialize an AutoML instance
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    "time_budget": 120,  # in seconds
    "metric": 'r2',
    "task": 'regression',
    "log_file_name": "flaml.log",
}
# Train with labeled input data
automl.fit(X_train=X_train, y_train=y_train,
           **automl_settings)
# Predict
# print(automl.predict(X_train))
# Print the best model
print(automl.model.estimator)

[flaml.automl: 01-27 18:19:51] {2007} INFO - task = regression
[flaml.automl: 01-27 18:19:51] {2009} INFO - Data split method: uniform
[flaml.automl: 01-27 18:19:51] {2013} INFO - Evaluation method: cv
[flaml.automl: 01-27 18:19:51] {2113} INFO - Minimizing error metric: 1-r2
[flaml.automl: 01-27 18:19:51] {2170} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 01-27 18:19:51] {2437} INFO - iteration 0, current learner lgbm


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').