In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn in a single call. `sns.set` re-applies its own defaults for
# every argument that is not passed (context="notebook", font_scale=1), so
# calling it after `set_style`/`set_context` would discard the 'talk' context
# and the 1.5x font scale requested for this notebook.
sns.set(context='talk', style='darkgrid', font_scale=1.5, color_codes=True)

%matplotlib inline

## Load fitted regression and outlier models

In [3]:
import warnings

# Silence any warnings raised while the model module is imported and
# `bagging_model()` is called; the previous warning filters are restored when
# the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")

    from ysi_utils.models import (
        bagging_model,
        ensemble_predict,
        outlier_model,
    )

    # Attach the object returned by `bagging_model()` to `ensemble_predict`
    # as its `bagging_model` attribute.
    ensemble_predict.bagging_model = bagging_model()

## Load ChEBI data

In [4]:
from ysi_utils.descriptors import dragon_chebi
from ysi_utils.data import chebi, low
from ysi_utils.tools.descriptors import get_element_dict, get_DBE

# Keep one descriptor row per index key. The result has to be rebound because
# pandas returns a new frame instead of modifying `dragon_chebi` in place.
# De-duplicating on the index (rather than on row values, which is what
# `drop_duplicates()` compares while ignoring the index) guarantees that no key
# is removed, so the `.loc[chebi_pred.index]` lookups in later cells stay
# one-to-one. When the index has no repeated keys this is a no-op.
dragon_chebi = dragon_chebi.loc[~dragon_chebi.index.duplicated(keep='first')]

# Key the ChEBI table by SMILES; later cells use this index to look up rows
# of `dragon_chebi`.
chebi_pred = chebi.set_index('SMILES')

Restrict to just the inlier molecules

In [5]:
# Keep only the molecules for which the outlier model predicts 1.
inlier_mask = outlier_model.predict(dragon_chebi.loc[chebi_pred.index]) == 1

# `.copy()` makes the filtered table an independent frame, so adding columns
# to it in later cells does not trigger pandas' SettingWithCopyWarning.
chebi_pred = chebi_pred[inlier_mask].copy()

In [6]:
# Run the ensemble on the descriptor rows matching `chebi_pred`; the result is
# indexed like `chebi_pred` (columns are presumably the individual ensemble
# members — confirm against `ensemble_predict`).
selected_descriptors = dragon_chebi.loc[chebi_pred.index]
predictions = pd.DataFrame(
    ensemble_predict(selected_descriptors),
    index=chebi_pred.index,
)

# Summarise each row by its mean and standard deviation across columns.
chebi_pred['YSI'] = predictions.mean(axis=1)
chebi_pred['YSI_std'] = predictions.std(axis=1)

In [7]:
# Flag the rows of `chebi_pred` whose SMILES key also appears in `low`.
chebi_pred['measured'] = chebi_pred.index.isin(low['SMILES'])

# YSI values from `low` for the SMILES present in `chebi_pred`, keyed by SMILES.
present_in_chebi = low['SMILES'].isin(chebi_pred.index)
measured_ysi = low.loc[present_in_chebi].set_index('SMILES')['YSI']

In [8]:
# For rows flagged as measured, overwrite the YSI with the value from
# `measured_ysi`; pandas aligns the assigned Series on the index labels.
is_measured = chebi_pred['measured']
chebi_pred.loc[is_measured, 'YSI'] = measured_ysi

# NOTE(review): 0.58 is presumably the uncertainty of the measured values —
# confirm its source and consider naming it as a constant.
chebi_pred.loc[is_measured, 'YSI_std'] = 0.58

In [9]:
# Write the selected columns to CSV. The SMILES index is discarded, no row
# index is written, and floats are formatted with three decimals.
export_columns = ['ChEBI', 'Name', 'CAS', 'YSI', 'YSI_std', 'measured']
(
    chebi_pred
    .reset_index(drop=True)
    .loc[:, export_columns]
    .to_csv('chebi_predictions.csv', index=False, float_format='%.3f')
)