In [None]:
import deepchem as dc
import numpy as np
import torch
import torch.nn.functional as F
from deepchem.metalearning.torch_maml import MetaLearner, MAML
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt
import random
from torch_geometric import seed_everything
from torch_geometric.nn import GCNConv, Linear
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.data import Data
from torch_geometric.datasets import QM9
from tabulate import tabulate
import pandas as pd
import matplotlib.pyplot as plt
from permetrics.regression import RegressionMetric
from rdkit import Chem
from rdkit.Chem import MolFromSmiles, SDWriter, Draw
from rdkit.Chem import AllChem, AddHs
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.MolStandardize import rdMolStandardize
from tqdm import tqdm

# Fix the global RNG seed through torch_geometric's helper so repeated runs
# are comparable.
# NOTE(review): presumably this seeds Python `random`, NumPy and PyTorch
# together — confirm against the installed torch_geometric version.
seed_everything(42)

def get_key_by_value(dictionary, value):
    """Return the first key of ``dictionary`` whose value equals ``value``.

    Entries are examined in the dictionary's iteration order. ``None`` is
    returned when no entry matches.
    """
    matching_keys = (key for key, val in dictionary.items() if val == value)
    return next(matching_keys, None)

In [None]:
class MetaMFLearner(MetaLearner):
    """Meta-learner wrapping a fully connected regressor for use with MAML.

    Samples are grouped into tasks through ``dataset.ids``: a sample belongs
    to the task whose integer id equals its entry in that array.

    Parameters
    ----------
    layer_sizes : sequence of int
        Layer widths, input width first and output width last.
    activation : callable
        Non-linearity applied after every layer except the last one.
    dataset : object exposing ``X``, ``y`` and ``ids`` arrays
        Source of the training samples.
    batch_size : int
        Maximum number of samples returned by ``get_batch``.
    tasks_dict : dict
        Maps task name -> integer task id.
    test_tasks : iterable of task ids, optional
        Task ids that ``select_task`` must never draw. ``None`` means that
        no task is excluded.
    """

    def __init__(self, layer_sizes=[1, 40, 20, 1], activation=F.relu, dataset=None, batch_size=10, tasks_dict=None, test_tasks=None):
        self.batch_size = batch_size
        # Copy so the shared default list (or a caller's list) is never aliased.
        self.layer_sizes = list(layer_sizes)
        self.activation = activation
        self.dataset = dataset
        self.task_index = None  # id of the currently selected task
        self.tasks_dict = tasks_dict
        self.test_tasks = test_tasks
        self.layers = self._create_layers()
        self.train_indices = []  # dataset rows used by the latest get_batch()

    def _create_layers(self):
        """Build one ``torch.nn.Linear`` per consecutive pair of layer sizes."""
        return torch.nn.ModuleList(
            torch.nn.Linear(n_in, n_out)
            for n_in, n_out in zip(self.layer_sizes[:-1], self.layer_sizes[1:])
        )

    def compute_model(self, inputs, variables, training):
        """Evaluate the network with the supplied weights.

        Parameters
        ----------
        inputs : sequence ``[x, y]`` of tensors
            Features and regression targets.
        variables : list of tensors or None
            Weights to evaluate with, laid out like ``self.variables``
            (weight, bias, weight, bias, ...). ``None`` falls back to the
            learner's own parameters.
        training : bool
            Accepted for interface compatibility; not used.

        Returns
        -------
        (loss, [prediction]) where ``loss`` is the mean squared error.
        """
        # The forward pass must go through `variables`, not `self.layers`:
        # MAML calls this method with gradient-updated copies of the weights,
        # and evaluating the stored layers instead would discard that update.
        if variables is None:
            variables = self.variables
        device = variables[0].device
        x, y = inputs
        x = x.to(device)
        y = y.to(device)
        n_layers = len(variables) // 2
        for i in range(n_layers):
            x = F.linear(x, variables[2 * i], variables[2 * i + 1])
            if i < n_layers - 1:
                x = self.activation(x)
        loss = torch.mean(torch.square(x - y))
        return loss, [x]

    @property
    def variables(self):
        """Flat list of all layer parameters (weight then bias, per layer)."""
        return [param for layer in self.layers for param in layer.parameters()]

    def select_task(self):
        """Pick a random task id that is not listed in ``test_tasks``."""
        excluded = self.test_tasks if self.test_tasks is not None else ()
        candidates = [task_id for task_id in self.tasks_dict.values() if task_id not in excluded]
        self.task_index = random.choice(candidates)

    def select_task_by_name(self, task_name):
        """Select the task whose name is ``task_name`` in ``tasks_dict``."""
        self.task_index = self.tasks_dict[task_name]

    # Fingerprints
    def get_batch(self):
        """Draw up to ``batch_size`` samples (without replacement) of the current task.

        Returns
        -------
        ``[x, y]`` as float32 tensors; ``y`` has shape ``(n, 1)``.

        Raises
        ------
        ValueError
            If the selected task has no samples in the dataset. An empty
            batch would otherwise yield a NaN loss.
        """
        task_indices = np.where(self.dataset.ids == self.task_index)[0]
        if len(task_indices) == 0:
            raise ValueError(f"No samples found for task id {self.task_index!r}")
        n_samples = min(len(task_indices), self.batch_size)
        batch_indices = np.random.choice(task_indices, n_samples, replace=False)
        self.train_indices = batch_indices
        x = torch.tensor(self.dataset.X[batch_indices], dtype=torch.float32)
        y = torch.tensor(self.dataset.y[batch_indices], dtype=torch.float32).view(-1, 1)
        return [x, y]

    def parameters(self):
        """Yield every trainable parameter, in the order of ``variables``."""
        yield from self.variables

In [None]:
# Batch size passed to the learner and, as meta-batch size, to MAML.
hiper_batch_size = 32

# Load the spreadsheet and discard the exported index column.
df = pd.read_excel('./henry.xlsx').drop(columns=['Unnamed: 0'])

# Retain only rows whose 'temps' value lies strictly within 5 of 298.
near_reference_temperature = (df['temps'] - 298).abs() < 5
df = df[near_reference_temperature].reset_index(drop=True)

df['Hcs'] = df['Hcs'].mul(1000)  # units adjustment

# Sample count per solute.
df['smiles_solutes'].value_counts()

In [None]:
# Remove CO2 rows (if any) from the multi-solute table; CO2 data comes from
# the dedicated file loaded below.
df = df[df['smiles_solutes'] != 'O=C=O'].reset_index(drop=True)

# Load the CO2 table (a CSV file) and align its column names with `df`.
df2 = pd.read_csv('carbon-dioxide.csv').rename(
    columns={"Henry's law constant": 'Hcs', "Temperature": 'temps',
             "SMILES": "smiles"})

# Keep rows whose 'Split' flag is non-zero. The explicit copy makes df2 an
# independent frame, so the column assignments below do not write into a
# slice of the original (SettingWithCopyWarning on pandas < 3).
df2 = df2[df2['Split'] != 0].copy()
df2['smiles_solutes'] = 'O=C=O'

# Hold out 20% of the CO2 rows as a separate 'O=C=O_test' task.
test_indices = train_test_split(df2.index, test_size=0.2, random_state=0)[1]
df2.loc[test_indices, 'smiles_solutes'] = 'O=C=O_test'

df2 = df2[['smiles', 'smiles_solutes', 'temps', 'Hcs']].copy()

# Each source gets its own scaler object. Previously one `scaler` was fitted
# on df2 and then re-fitted on df, which discarded the CO2 statistics needed
# to map CO2 predictions back to physical units.
# NOTE(review): the CO2 statistics are computed on all CO2 rows, including
# the held-out 'O=C=O_test' rows — confirm this is acceptable.
scaler_co2 = StandardScaler()
df2['Hcs'] = scaler_co2.fit_transform(df2['Hcs'].values.reshape(-1, 1))

# `scaler` ends up fitted on the multi-solute table, exactly as before.
scaler = StandardScaler()
df['Hcs'] = scaler.fit_transform(df['Hcs'].values.reshape(-1, 1))

# Drop rows of df whose standardised 'Hcs' lies outside [-3, 3].
df = df[(df['Hcs'] >= -3) & (df['Hcs'] <= 3)]
df = df.reset_index(drop=True)

# Join df and df2.
df = pd.concat([df, df2])
df = df.reset_index(drop=True)
df

In [None]:
# Split every 'smiles' entry at the dot into its two components. The
# component containing '+' in first position is taken as the cation;
# otherwise the second component is the cation.
cation_parts, anion_parts = [], []
for first_part, second_part in (entry.split('.') for entry in df['smiles']):
    if '+' in first_part:
        cation, anion = first_part, second_part
    else:
        cation, anion = second_part, first_part
    cation_parts.append(cation)
    anion_parts.append(anion)

df['smiles_cation'] = cation_parts
df['smiles_anion'] = anion_parts
df = df.reset_index(drop=True)

df

In [None]:
df3 = pd.read_excel('./cosmo-predicted-hcs.xlsx')
df3['ils'] = df3['IL_cation'] + ' ' + df3['IL_anion']

# Remove rows where henrycnodim == 0 or henryc == 0 (the latter would divide
# by zero below). reset_index returns a fresh frame, so the following column
# assignments do not write into a filtered slice.
has_nonzero_prediction = (df3['henrycnodim'] != 0) & (df3['henryc'] != 0)
df3 = df3[has_nonzero_prediction].reset_index(drop=True)

target_column_cosmo = 'henryc'
# Units alignment (factor 101325), then take the reciprocal.
# NOTE(review): 101325 is presumably Pa per atm — confirm the source units.
df3[target_column_cosmo] = 1 / (df3[target_column_cosmo] * 101325)
df3['smiles'] = df3['smiles_cation'] + '.' + df3['smiles_anion']
df3['temps'] = 298
df3['smiles_solutes'] = df3['task'] + '_cosmo'  # mark rows as COSMO-derived tasks
df3 = df3.dropna(subset=['smiles_anion']).reset_index(drop=True)

# Keep the columns shared with `df`; non-inplace rename avoids mutating a
# column subset of the previous frame.
df3 = df3[['smiles', 'smiles_cation', 'smiles_anion', 'temps', target_column_cosmo, 'ils', 'smiles_solutes']]
df3 = df3.rename(columns={target_column_cosmo: 'Hcs'})

# Discard outliers beyond 1.5 * IQR from the quartiles.
q1 = df3['Hcs'].quantile(0.25)
q3 = df3['Hcs'].quantile(0.75)
iqr = q3 - q1
df3 = df3[(df3['Hcs'] >= q1 - 1.5 * iqr) & (df3['Hcs'] <= q3 + 1.5 * iqr)]
df3 = df3.reset_index(drop=True)

# Standardise the COSMO targets with their own scaler, then append to df.
scaler3 = StandardScaler()
df3['Hcs'] = scaler3.fit_transform(df3['Hcs'].values.reshape(-1, 1))
df = pd.concat([df, df3])
df = df.reset_index(drop=True)
df

In [None]:
nrm = rdMolStandardize.Normalizer()

def normalize_smiles(smile):
    """Return the RDKit-normalised SMILES for ``smile``.

    A '_cosmo' marker inside ``smile`` is removed before parsing and appended
    again to the result, so marked names stay marked.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES. Without this check the ``None``
        molecule would be passed to the normalizer and fail there.
    """
    cosmo_flag = '_cosmo' in smile
    if cosmo_flag:
        smile = smile.replace('_cosmo', '')
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        raise ValueError(f"Cannot parse SMILES: {smile!r}")
    mol_norm = nrm.normalize(mol)
    smile_norm = Chem.MolToSmiles(mol_norm, True)
    if cosmo_flag:
        smile_norm = smile_norm + '_cosmo'
    return smile_norm

# Normalise each distinct ion once, then map the result back onto every row.
smiles_cation_unique = df['smiles_cation'].unique()
smiles_anion_unique = df['smiles_anion'].unique()

smiles_cation_norm_dict = {
    raw_smiles: normalize_smiles(raw_smiles) for raw_smiles in smiles_cation_unique
}
smiles_anion_norm_dict = {
    raw_smiles: normalize_smiles(raw_smiles) for raw_smiles in smiles_anion_unique
}

df['smiles_cation'] = df['smiles_cation'].map(smiles_cation_norm_dict)
df['smiles_anion'] = df['smiles_anion'].map(smiles_anion_norm_dict)
df

In [None]:
tasks_names_to_test = ['O=C=O']

# Strip any existing '_valid' suffix so the marking below starts clean.
df['smiles_solutes'] = df['smiles_solutes'].str.replace('_valid', '')

kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Index of the fold (out of 5) whose rows become the validation task.
fold_to_use = 4

for task in tasks_names_to_test:
  # Rows of this task; one KFold split of them is relabelled '<task>_valid'.
  df_task = df[df['smiles_solutes'] == task]
  train_index, valid_index = list(kf.split(df_task))[fold_to_use]
  df.loc[df_task.index[valid_index], 'smiles_solutes'] = task + '_valid'

# Extract SMILES, integer task ids, and property values.
smiles = df['smiles'].values
task_names = df['smiles_solutes'].values
tasks_dict = {name: task_id for task_id, name in enumerate(np.unique(task_names))}
tasks = np.array([tasks_dict[name] for name in task_names])

y = df['Hcs'].values

In [None]:
nBITS = 4096


def _morgan_fingerprints(smiles_values, generator, additional_output):
    """Fingerprint every SMILES in ``smiles_values`` with ``generator``.

    Returns three lists:
      * the fingerprint array of each molecule,
      * the bit-info map of each molecule,
      * one ``(mol, bit, bit_info_map)`` tuple per set bit of each molecule
        (the form consumed by ``Draw.DrawMorganBits``).

    Raises ``ValueError`` when a SMILES cannot be parsed.
    """
    fingerprints, bit_infos, bit_examples = [], [], []
    for smi in smiles_values:
        mol = MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smi!r}")
        fingerprints.append(
            generator.GetFingerprintAsNumPy(mol, additionalOutput=additional_output))
        bit_info = additional_output.GetBitInfoMap()
        bit_infos.append(bit_info)
        bit_examples.extend((mol, bit, bit_info) for bit in bit_info.keys())
    return fingerprints, bit_infos, bit_examples


# One generator / bit-info collector per molecule role: cation, anion, solute.
mfpgen_c = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=nBITS)
ao_c = rdFingerprintGenerator.AdditionalOutput()
ao_c.AllocateBitInfoMap()
mfpgen_a = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=nBITS)
ao_a = rdFingerprintGenerator.AdditionalOutput()
ao_a.AllocateBitInfoMap()
mfpgen_s = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=nBITS)
ao_s = rdFingerprintGenerator.AdditionalOutput()
ao_s.AllocateBitInfoMap()

fps_c, aos_c, tpls_c = _morgan_fingerprints(df['smiles_cation'].values, mfpgen_c, ao_c)
fps_a, aos_a, tpls_a = _morgan_fingerprints(df['smiles_anion'].values, mfpgen_a, ao_a)

# Solute names may carry task suffixes; strip them to recover the SMILES.
solute_smiles = [
    name.replace('_test', '').replace('_valid', '').replace('_cosmo', '')
    for name in df['smiles_solutes'].values
]
fps_s, aos_s, tpls_s = _morgan_fingerprints(solute_smiles, mfpgen_s, ao_s)

def _first_example_per_bit(sorted_bit_examples):
    """Keep the first ``(mol, bit, info)`` tuple of every distinct bit.

    ``sorted_bit_examples`` must already be sorted by bit id (element 1).
    """
    reduced = []
    for example in sorted_bit_examples:
        if not reduced or example[1] != reduced[-1][1]:
            reduced.append(example)
    return reduced


# Legends use bit id + 1 so they match the 1-based feature names
# ('c_fp_1' ... / 'a_fp_1' ...) given to the fingerprint columns; previously
# the image labels were shifted by one relative to the column names.
tpls_c.sort(key=lambda example: example[1])
tpls_c_reduced = _first_example_per_bit(tpls_c)
print(f"{len(tpls_c_reduced) = }")
p_cations_fps = Draw.DrawMorganBits(
    tpls_c_reduced, molsPerRow=8,
    legends=[f'c_fp_{example[1] + 1}' for example in tpls_c_reduced])
p_cations_fps.save('./cations_fps.png')

tpls_a.sort(key=lambda example: example[1])
tpls_a_reduced = _first_example_per_bit(tpls_a)
print(f"{len(tpls_a_reduced) = }")
p_anions_fps = Draw.DrawMorganBits(
    tpls_a_reduced, molsPerRow=8,
    legends=[f'a_fp_{example[1] + 1}' for example in tpls_a_reduced])
p_anions_fps.save('./anions_fps.png')

# Feature names are 1-based: fingerprint position i is named '<role>_fp_{i+1}'.
bit_numbers = range(1, nBITS + 1)
fps_c_columns = np.array([f'c_fp_{n}' for n in bit_numbers])
fps_a_columns = np.array([f'a_fp_{n}' for n in bit_numbers])
fps_s_columns = np.array([f's_fp_{n}' for n in bit_numbers])

# Stack the per-sample fingerprints into integer matrices (rows = samples).
fps_c = np.array(fps_c, dtype=int)
fps_a = np.array(fps_a, dtype=int)
fps_s = np.array(fps_s, dtype=int)

# Boolean masks of the bits that actually vary across samples.
non_zero_var_cols_c = fps_c.var(axis=0) != 0
non_zero_var_cols_a = fps_a.var(axis=0) != 0
non_zero_var_cols_s = fps_s.var(axis=0) != 0

# Drop constant bits from both the data and the matching name arrays.
fps_c_filtered, fps_c_filtered_columns = fps_c[:, non_zero_var_cols_c], fps_c_columns[non_zero_var_cols_c]
fps_a_filtered, fps_a_filtered_columns = fps_a[:, non_zero_var_cols_a], fps_a_columns[non_zero_var_cols_a]
fps_s_filtered, fps_s_filtered_columns = fps_s[:, non_zero_var_cols_s], fps_s_columns[non_zero_var_cols_s]

# Final feature matrix: cation, anion and solute bits side by side.
ecfp = np.hstack((fps_c_filtered, fps_a_filtered, fps_s_filtered))
print(f'{ecfp.shape = }')

# Feature names in the same column order as `ecfp`.
columns_names_ecfp = np.hstack((fps_c_filtered_columns, fps_a_filtered_columns, fps_s_filtered_columns))
print(f'{columns_names_ecfp.shape = }')

In [None]:
# Bundle features, targets and per-sample task ids into one dataset.
# `ids` holds the integer task id of each row; MetaMFLearner.get_batch
# compares it with the selected task id to pick that task's rows.
dataset = dc.data.NumpyDataset(X=ecfp, y=np.array(y), ids=tasks)

In [None]:
# Distribution of the regression targets across all tasks. The figure is
# labelled so it can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(y, bins=100)
ax.set(title='Distribution of target values', xlabel='Hcs (standardised)',
       ylabel='Number of samples')
plt.show()

In [None]:
# Show the task-name -> integer-id mapping used as dataset ids.
print(tasks_dict)

In [None]:
# Width of the network input = number of fingerprint features.
input_dim = dataset.X.shape[1]

# For every evaluation task, exclude the task itself and its '_valid' and
# '_test' variants from the tasks drawn during meta-training.
full_tasks_names_to_test = [
    f"{task_name}{suffix}"
    for task_name in tasks_names_to_test
    for suffix in ("", "_valid", "_test")
]
tasks_to_test = [tasks_dict[task_name] for task_name in full_tasks_names_to_test]

learner = MetaMFLearner(
    layer_sizes=[input_dim, 64, 32, 1],
    dataset=dataset,
    batch_size=hiper_batch_size,
    tasks_dict=tasks_dict,
    test_tasks=tasks_to_test,
)
optimizer = dc.models.optimizers.Adam(learning_rate=5e-3)
maml = MAML(learner, meta_batch_size=hiper_batch_size, optimizer=optimizer)

# Run 2500 meta-training steps.
maml.fit(2500)

In [None]:
def _predict_rows(model, features, targets, row_mask):
    """Predict the rows selected by the boolean ``row_mask``.

    Rows are addressed by position (``np.flatnonzero``), so the result does
    not depend on the DataFrame index labels.

    Returns ``(y_true, y_pred)`` as ``(n, 1)`` float32 NumPy arrays.
    """
    rows = np.flatnonzero(row_mask)
    y_rows = torch.tensor(targets[rows], dtype=torch.float32).view(-1, 1)
    x_rows = torch.tensor(features[rows, :], dtype=torch.float32)
    _, outputs = model.predict_on_batch([x_rows, y_rows])
    return y_rows.cpu().detach().numpy(), outputs[0].cpu().detach().numpy()


def _print_metrics(title, y_true, y_pred, metric_names):
    """Print ``title`` followed by one labelled line per regression metric."""
    evaluator = RegressionMetric(y_true, y_pred)
    print(title)
    for metric in metric_names:
        print(f"{metric}: {evaluator.get_metric_by_name(metric)[metric]: .4f}")
    print()


list_metrics = ["R2", "R2S", "A10", "RMSE", "MAE", "MAPE", "NSE"]
solute_names = df['smiles_solutes'].to_numpy()

for val_task in tasks_names_to_test:
    print(f"Testing on task: {val_task}")
    maml.restore()
    learner.select_task_by_name(val_task)
    batch = learner.get_batch()
    print(f"{batch[1].shape = }")
    # Loss on one batch of the task before fine-tuning ...
    loss, outputs = maml.predict_on_batch(batch)
    print(loss)
    # ... and on the same batch after 25 fine-tuning steps on this task.
    maml.train_on_current_task(25, restore=False)
    loss, outputs = maml.predict_on_batch(batch)
    print(loss)
    current_task_name = get_key_by_value(tasks_dict, learner.task_index)
    print(f"Task index: {learner.task_index}, task name: {current_task_name}")

    # Validation set
    y_true_val, y_pred_val = _predict_rows(
        maml, ecfp, y, solute_names == f"{val_task}_valid")
    _print_metrics('Validation Set', y_true_val, y_pred_val, list_metrics)

    # Test set
    y_true, y_pred = _predict_rows(
        maml, ecfp, y, solute_names == f"{val_task}_test")
    _print_metrics('Test Set', y_true, y_pred, list_metrics)

    # First few test predictions next to their targets.
    for i in range(min(len(y_true), 10)):
        print(f"True: {float(y_true[i][0]): 3.4f} \tPredicted: {float(y_pred[i][0]): 3.4f}")

    fig, ax = plt.subplots()
    ax.scatter(y_true, y_pred, c='red', label='Test')
    ax.scatter(y_true_val, y_pred_val, c='blue', label='Validation')

    # Identity line spanning the true values of both plotted sets.
    all_true = np.concatenate([y_true, y_true_val])
    bounds = [np.min(all_true), np.max(all_true)]
    ax.plot(bounds, bounds, 'k--', lw=2)

    ax.set(xlabel='True', ylabel='Predicted', title=f"Task: {val_task}")
    ax.legend()
    plt.show()