In [None]:
import os
import shutil

import chemprop
import deepchem as dc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw, AllChem, rdFingerprintGenerator
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from spectrae import Spectra, SpectraDataset
from tqdm import tqdm

In [None]:
class molnet_dataset(SpectraDataset):
  """SpectraDataset subclass backed by an in-memory sequence of samples.

  All accessors read ``self.samples``.
  NOTE(review): ``self.samples`` is presumably populated by the base class
  from the value returned by ``parse`` -- confirm against spectrae.
  """

  def parse(self, dataset):
    """Return ``dataset`` unchanged; no parsing is performed."""
    return dataset

  def __len__(self):
    """Return the number of entries in ``self.samples``."""
    return len(self.samples)

  def __getitem__(self, idx):
    """Return the sample stored at position ``idx``."""
    return self.samples[idx]

  def sample_to_index(self, sample):
    """Return the index under which ``sample`` is stored.

    The sample -> index table is built on first use and cached on the
    instance as ``index_to_sequence``. Samples are used as dictionary keys,
    so when two keys compare equal the later index overwrites the earlier.
    """
    # Fast path: the lookup table already exists.
    if hasattr(self, 'index_to_sequence'):
      return self.index_to_sequence[sample]

    print('Generating index to sequence')
    self.index_to_sequence = {}
    for position in tqdm(range(len(self.samples))):
      self.index_to_sequence[self[position]] = position
    return self.index_to_sequence[sample]

In [None]:
class molnet_tanimoto_spectra(Spectra):
  """Spectra subclass whose spectral property is the Tanimoto similarity."""

  def spectra_properties(self, sample_one, sample_two):
    """Return ``TanimotoSimilarity(sample_one, sample_two)``."""
    return TanimotoSimilarity(sample_one, sample_two)

  def cross_split_overlap(self, train, test):
    """Return the mean pairwise similarity between ``train`` and ``test``.

    Every (train sample, test sample) pair is scored with
    ``spectra_properties`` and the scores are averaged with ``np.mean``.
    """
    pairwise_scores = [
      self.spectra_properties(train_sample, test_sample)
      for train_sample in train
      for test_sample in test
    ]
    return np.mean(pairwise_scores)

In [None]:
# Keyword arguments shared by every `generate_spectra_splits(**spectra_parameters)`
# call in this notebook. The spectral parameters are the two-decimal strings
# produced from np.arange(0, 1.05, 0.05).
spectra_parameters = dict(
    number_repeats=3,
    random_seed=[42, 44, 46],
    spectral_parameters=[f"{value:.2f}" for value in np.arange(0, 1.05, 0.05)],
    force_reconstruct=True,
)

In [None]:
# Load BACE (classification) without splitting and convert the feature matrix
# of the first returned dataset into nested Python lists.
tasks, dataset, transformers = dc.molnet.load_bace_classification(splitter=None)
bace_data = dataset[0].X.tolist()
# Preview only the first rows; echoing the full list floods the notebook output.
bace_data[:5]

In [None]:
# Load BACE (unsplit) and keep the `ids` of the first returned dataset; they
# are parsed as SMILES strings by the fingerprinting cell.
tasks, datasets, transformers = dc.molnet.load_bace_classification(splitter=None)
bace_smiles = datasets[0].ids


In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every BACE SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_bace = []
for smiles in bace_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse BACE SMILES: {smiles!r}')
  mfp_bace.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the BACE fingerprints in the SPECTRA dataset container named 'bace'.
bace_dataset = molnet_dataset(mfp_bace, 'bace')
# Sanity checks: report the size and a short preview rather than printing
# every fingerprint object in the dataset.
print(len(bace_dataset.samples))
print(bace_dataset.samples[:3])
print(type(bace_dataset.samples))
print(bace_dataset.name)

In [None]:
# Build the Tanimoto-based SPECTRA object for BACE and confirm which dataset
# type it holds. `binary=False` is forwarded to the Spectra constructor.
bace_spectra = molnet_tanimoto_spectra(bace_dataset, binary=False)
print(type(bace_spectra.dataset))

In [None]:
# Pre-compute the spectral properties for BACE, forcing a fresh calculation.
# NOTE(review): presumably this writes the `bace_precalculated_spectra_properties`
# file that the save cell copies -- confirm against spectrae.
bace_spectra.pre_calculate_spectra_properties('bace', force_recalculate=True)

In [None]:
# Generate the SPECTRA splits for BACE with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
bace_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Fetch the statistics of all BACE splits and order them by SPECTRA parameter.
stats_bace = bace_spectra.return_all_split_stats()
stats_bace_df = pd.DataFrame(stats_bace).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
print('SPECTRA Splits on BACE', '----------------------', sep='\n')
for index, row in stats_bace_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for BACE.
fig, ax = plt.subplots()
ax.scatter(stats_bace['SPECTRA_parameter'], stats_bace['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of BACE')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for BACE.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_bace['SPECTRA_parameter'], stats_bace[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of BACE')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Copy the BACE split folders and the precalculated-properties file into
# `save_path_bace`.
# NOTE(review): this path uses 'Tanimoto' while the ClinTox, SIDER, Delaney and
# FreeSolv cells use 'tanimoto' -- confirm which directory is intended.
save_path_bace = '/content/drive/MyDrive/SAGELab/spectra_project/spectra_splits/Tanimoto/BACE'
# exist_ok / dirs_exist_ok make this cell safe to re-run: without them a second
# run raises FileExistsError because the copytree destination already exists.
# Existing files are overwritten; files left over from older runs are not removed.
os.makedirs(save_path_bace, exist_ok=True)
shutil.copytree('bace_SPECTRA_splits', os.path.join(save_path_bace, 'bace_SPECTRA_splits'), dirs_exist_ok=True)
shutil.copytree('bace_spectral_property_graphs', os.path.join(save_path_bace, 'bace_spectral_property_graphs'), dirs_exist_ok=True)
shutil.copy('bace_precalculated_spectra_properties', os.path.join(save_path_bace, 'bace_precalculated_spectra_properties'))

In [None]:
# Load BBBP (unsplit) and keep the `ids` of the first returned dataset; they
# are parsed as SMILES strings by the fingerprinting cell.
tasks, bbbp_datasets, transformers = dc.molnet.load_bbbp(splitter=None)
bbbp_smiles = bbbp_datasets[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every BBBP SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_bbbp = []

for smiles in bbbp_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse BBBP SMILES: {smiles!r}')
  mfp_bbbp.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the BBBP fingerprints in the SPECTRA dataset container named 'bbbp'.
bbbp_dataset = molnet_dataset(mfp_bbbp, 'bbbp')

In [None]:
# Build the Tanimoto-based SPECTRA object for BBBP; `binary=False` is
# forwarded to the Spectra constructor.
bbbp_spectra = molnet_tanimoto_spectra(bbbp_dataset, binary=False)

In [None]:
# Pre-compute the spectral properties for BBBP, forcing a fresh calculation.
bbbp_spectra.pre_calculate_spectra_properties('bbbp', force_recalculate=True)

In [None]:
# Generate the SPECTRA splits for BBBP with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
bbbp_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Copy the BBBP split folders and the precalculated-properties file into
# `save_path_bbbp`.
# NOTE(review): this path uses 'Tanimoto' while the ClinTox, SIDER, Delaney and
# FreeSolv cells use 'tanimoto' -- confirm which directory is intended.
save_path_bbbp = '/content/drive/MyDrive/SAGELab/spectra_project/spectra_splits/Tanimoto/BBBP'
# exist_ok / dirs_exist_ok make this cell safe to re-run: without them a second
# run raises FileExistsError because the copytree destination already exists.
# Existing files are overwritten; files left over from older runs are not removed.
os.makedirs(save_path_bbbp, exist_ok=True)
shutil.copytree('bbbp_SPECTRA_splits', os.path.join(save_path_bbbp, 'bbbp_SPECTRA_splits'), dirs_exist_ok=True)
shutil.copytree('bbbp_spectral_property_graphs', os.path.join(save_path_bbbp, 'bbbp_spectral_property_graphs'), dirs_exist_ok=True)
# The other datasets' save cells also keep the precalculated properties file;
# BBBP was the only one that skipped it.
shutil.copy('bbbp_precalculated_spectra_properties', os.path.join(save_path_bbbp, 'bbbp_precalculated_spectra_properties'))

In [None]:
# Fetch the statistics of all BBBP splits and order them by SPECTRA parameter.
stats_bbbp = bbbp_spectra.return_all_split_stats()
stats_bbbp_df = pd.DataFrame(stats_bbbp).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
print('SPECTRA Splits on BBBP', '----------------------', sep='\n')
for index, row in stats_bbbp_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for BBBP.
fig, ax = plt.subplots()
ax.scatter(stats_bbbp['SPECTRA_parameter'], stats_bbbp['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of BBBP')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for BBBP.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_bbbp['SPECTRA_parameter'], stats_bbbp[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of BBBP')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Load ClinTox (unsplit) and keep the `ids` of the first returned dataset;
# they are parsed as SMILES strings by the fingerprinting cell.
tasks, clintox_datasets, transformers = dc.molnet.load_clintox(splitter=None)
clintox_smiles = clintox_datasets[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every ClinTox SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_clintox = []

for smiles in clintox_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse ClinTox SMILES: {smiles!r}')
  mfp_clintox.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the ClinTox fingerprints, build the Tanimoto-based SPECTRA object and
# pre-compute its spectral properties from scratch.
clintox_dataset = molnet_dataset(mfp_clintox, 'clintox')
clintox_spectra = molnet_tanimoto_spectra(clintox_dataset, binary=False)
clintox_spectra.pre_calculate_spectra_properties('clintox', force_recalculate=True)

In [None]:
# Generate the SPECTRA splits for ClinTox with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
clintox_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Copy the ClinTox split folders and the precalculated-properties file into
# `save_path_clintox`.
save_path_clintox = '/content/drive/MyDrive/SAGELab/spectra_project/spectra_splits/tanimoto/ClinTox'
# exist_ok / dirs_exist_ok make this cell safe to re-run: without them a second
# run raises FileExistsError because the copytree destination already exists.
# Existing files are overwritten; files left over from older runs are not removed.
os.makedirs(save_path_clintox, exist_ok=True)
shutil.copytree('clintox_SPECTRA_splits', os.path.join(save_path_clintox, 'clintox_SPECTRA_splits'), dirs_exist_ok=True)
shutil.copytree('clintox_spectral_property_graphs', os.path.join(save_path_clintox, 'clintox_spectral_property_graphs'), dirs_exist_ok=True)
shutil.copy('clintox_precalculated_spectra_properties', os.path.join(save_path_clintox, 'clintox_precalculated_spectra_properties'))

In [None]:
# Fetch the statistics of all ClinTox splits and order them by SPECTRA parameter.
stats_clintox = clintox_spectra.return_all_split_stats()
stats_clintox_df = pd.DataFrame(stats_clintox).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
print('SPECTRA Splits on ClinTox', '----------------------', sep='\n')
for index, row in stats_clintox_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for ClinTox.
fig, ax = plt.subplots()
ax.scatter(stats_clintox['SPECTRA_parameter'], stats_clintox['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of ClinTox')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for ClinTox.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_clintox['SPECTRA_parameter'], stats_clintox[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of ClinTox')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Load SIDER (unsplit) and keep the `ids` of the first returned dataset.
# The loader output is stored as `sider_datasets` because `sider_dataset` is
# later rebound to the molnet_dataset wrapper; sharing one name made this cell
# fail when re-run after the wrapper had been created.
tasks, sider_datasets, transformers = dc.molnet.load_sider(splitter=None)
sider_smiles = sider_datasets[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every SIDER SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_sider = []

for smiles in sider_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse SIDER SMILES: {smiles!r}')
  mfp_sider.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the SIDER fingerprints, build the Tanimoto-based SPECTRA object and
# pre-compute its spectral properties from scratch.
sider_dataset = molnet_dataset(mfp_sider, 'sider')
sider_spectra = molnet_tanimoto_spectra(sider_dataset, binary=False)
sider_spectra.pre_calculate_spectra_properties('sider', force_recalculate=True)

In [None]:
# Generate the SPECTRA splits for SIDER with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
sider_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Copy the SIDER split folders and the precalculated-properties file into
# `save_path_sider`.
save_path_sider = '/content/drive/MyDrive/SAGELab/spectra_project/spectra_splits/tanimoto/SIDER'
# exist_ok / dirs_exist_ok make this cell safe to re-run: without them a second
# run raises FileExistsError because the copytree destination already exists.
# Existing files are overwritten; files left over from older runs are not removed.
os.makedirs(save_path_sider, exist_ok=True)
shutil.copytree('sider_SPECTRA_splits', os.path.join(save_path_sider, 'sider_SPECTRA_splits'), dirs_exist_ok=True)
shutil.copytree('sider_spectral_property_graphs', os.path.join(save_path_sider, 'sider_spectral_property_graphs'), dirs_exist_ok=True)
shutil.copy('sider_precalculated_spectra_properties', os.path.join(save_path_sider, 'sider_precalculated_spectra_properties'))

In [None]:
# Fetch the statistics of all SIDER splits and order them by SPECTRA parameter.
stats_sider = sider_spectra.return_all_split_stats()
stats_sider_df = pd.DataFrame(stats_sider).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
print('SPECTRA Splits on SIDER', '----------------------', sep='\n')
for index, row in stats_sider_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for SIDER.
fig, ax = plt.subplots()
ax.scatter(stats_sider['SPECTRA_parameter'], stats_sider['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of SIDER')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for SIDER.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_sider['SPECTRA_parameter'], stats_sider[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of SIDER')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Load Delaney (unsplit) and keep the `ids` of the first returned dataset.
# The loader output is stored as `delaney_datasets` because `delaney_dataset`
# is later rebound to the molnet_dataset wrapper; sharing one name made this
# cell fail when re-run after the wrapper had been created.
tasks, delaney_datasets, transformers = dc.molnet.load_delaney(splitter=None)
delaney_smiles = delaney_datasets[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every Delaney SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_delaney = []

for smiles in delaney_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse Delaney SMILES: {smiles!r}')
  mfp_delaney.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the Delaney fingerprints, build the Tanimoto-based SPECTRA object and
# pre-compute its spectral properties from scratch.
delaney_dataset = molnet_dataset(mfp_delaney, 'delaney')
delaney_spectra = molnet_tanimoto_spectra(delaney_dataset, binary=False)
delaney_spectra.pre_calculate_spectra_properties('delaney', force_recalculate=True)

In [None]:
# Generate the SPECTRA splits for Delaney with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
delaney_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Copy the Delaney split folders and the precalculated-properties file into
# `save_path_delaney`.
save_path_delaney = '/content/drive/MyDrive/SAGELab/spectra_project/spectra_splits/tanimoto/Delaney'
# exist_ok / dirs_exist_ok make this cell safe to re-run: without them a second
# run raises FileExistsError because the copytree destination already exists.
# Existing files are overwritten; files left over from older runs are not removed.
os.makedirs(save_path_delaney, exist_ok=True)
shutil.copytree('delaney_SPECTRA_splits', os.path.join(save_path_delaney, 'delaney_SPECTRA_splits'), dirs_exist_ok=True)
shutil.copytree('delaney_spectral_property_graphs', os.path.join(save_path_delaney, 'delaney_spectral_property_graphs'), dirs_exist_ok=True)
shutil.copy('delaney_precalculated_spectra_properties', os.path.join(save_path_delaney, 'delaney_precalculated_spectra_properties'))

In [None]:
# Fetch the statistics of all Delaney splits and order them by SPECTRA parameter.
stats_delaney = delaney_spectra.return_all_split_stats()
stats_delaney_df = pd.DataFrame(stats_delaney).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
print('SPECTRA Splits on Delaney', '----------------------', sep='\n')
for index, row in stats_delaney_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for Delaney.
fig, ax = plt.subplots()
ax.scatter(stats_delaney['SPECTRA_parameter'], stats_delaney['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of Delaney')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for Delaney.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_delaney['SPECTRA_parameter'], stats_delaney[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of Delaney')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Load FreeSolv (unsplit) and keep the `ids` of the first returned dataset;
# they are parsed as SMILES strings by the fingerprinting cell.
tasks, freesolve_dataset, transformers = dc.molnet.load_freesolv(splitter=None)
freesolve_smiles = freesolve_dataset[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every FreeSolv SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_freesolve = []

for smiles in freesolve_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse FreeSolv SMILES: {smiles!r}')
  mfp_freesolve.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the FreeSolv fingerprints (dataset name 'freesolve'), build the
# Tanimoto-based SPECTRA object and pre-compute its spectral properties from scratch.
freesolve = molnet_dataset(mfp_freesolve, 'freesolve')
freesolve_spectra = molnet_tanimoto_spectra(freesolve, binary=False)
freesolve_spectra.pre_calculate_spectra_properties('freesolve', force_recalculate=True)

In [None]:
# Generate the SPECTRA splits for FreeSolv with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
freesolve_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Fetch the statistics of all FreeSolv splits and order them by SPECTRA parameter.
stats_freesolve = freesolve_spectra.return_all_split_stats()
stats_freesolve_df = pd.DataFrame(stats_freesolve).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
# NOTE(review): the header spells 'FreeSolve' while the plot titles use 'FreeSolv'.
print('SPECTRA Splits on FreeSolve', '----------------------', sep='\n')
for index, row in stats_freesolve_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for FreeSolv.
fig, ax = plt.subplots()
ax.scatter(stats_freesolve['SPECTRA_parameter'], stats_freesolve['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of FreeSolv')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for FreeSolv.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_freesolve['SPECTRA_parameter'], stats_freesolve[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of FreeSolv')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Copy the FreeSolv split folders and the precalculated-properties file into
# `save_path_freesolv`. The local artifacts carry the 'freesolve' prefix (the
# dataset name used in this notebook) and are stored under 'freesolv_*' names.
save_path_freesolv = '/content/drive/MyDrive/SAGELab/spectra_project/spectra_splits/tanimoto/FreeSolv'
# exist_ok / dirs_exist_ok make this cell safe to re-run: without them a second
# run raises FileExistsError because the copytree destination already exists.
# Existing files are overwritten; files left over from older runs are not removed.
os.makedirs(save_path_freesolv, exist_ok=True)
shutil.copytree('freesolve_SPECTRA_splits', os.path.join(save_path_freesolv, 'freesolv_SPECTRA_splits'), dirs_exist_ok=True)
shutil.copytree('freesolve_spectral_property_graphs', os.path.join(save_path_freesolv, 'freesolv_spectral_property_graphs'), dirs_exist_ok=True)
shutil.copy('freesolve_precalculated_spectra_properties', os.path.join(save_path_freesolv, 'freesolv_precalculated_spectra_properties'))

In [None]:
# Load HIV (unsplit) and keep the `ids` of the first returned dataset.
# The loader output is stored as `hiv_datasets` because `hiv_dataset` is later
# rebound to the molnet_dataset wrapper; sharing one name made this cell fail
# when re-run after the wrapper had been created.
tasks, hiv_datasets, transformers = dc.molnet.load_hiv(splitter=None)
hiv_smiles = hiv_datasets[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every HIV SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_hiv = []

for smiles in hiv_smiles:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for unparsable input; report the offending
        # SMILES instead of an opaque argument error from GetFingerprint.
        raise ValueError(f'Could not parse HIV SMILES: {smiles!r}')
    mfp_hiv.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the HIV fingerprints, build the Tanimoto-based SPECTRA object and
# pre-compute its spectral properties.
# NOTE(review): force_recalculate=False here, unlike the smaller datasets --
# presumably to reuse an existing precalculated file; confirm against spectrae.
hiv_dataset = molnet_dataset(mfp_hiv, 'hiv')
hiv_spectra = molnet_tanimoto_spectra(hiv_dataset, binary=False)
hiv_spectra.pre_calculate_spectra_properties('hiv', force_recalculate=False)

In [None]:
# Generate the SPECTRA splits for HIV with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
hiv_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Load Lipo (unsplit) and keep the `ids` of the first returned dataset.
# The loader output is stored as `lipo_datasets` because `lipo_dataset` is later
# rebound to the molnet_dataset wrapper; sharing one name made this cell fail
# when re-run after the wrapper had been created.
tasks, lipo_datasets, transformers = dc.molnet.load_lipo(splitter=None)
lipo_smiles = lipo_datasets[0].ids

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for every Lipo SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_lipo = []

for smiles in lipo_smiles:
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles returns None for unparsable input; report the offending
    # SMILES instead of an opaque argument error from GetFingerprint.
    raise ValueError(f'Could not parse Lipo SMILES: {smiles!r}')
  mfp_lipo.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the Lipo fingerprints, build the Tanimoto-based SPECTRA object and
# pre-compute its spectral properties.
# NOTE(review): force_recalculate=False here, unlike the smaller datasets --
# presumably to reuse an existing precalculated file; confirm against spectrae.
lipo_dataset = molnet_dataset(mfp_lipo, 'lipo')
lipo_spectra = molnet_tanimoto_spectra(lipo_dataset, binary=False)
lipo_spectra.pre_calculate_spectra_properties('lipo', force_recalculate=False)

In [None]:
# Generate the SPECTRA splits for Lipo with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
lipo_spectra.generate_spectra_splits(**spectra_parameters)

In [None]:
# Fetch the statistics of all Lipo splits and order them by SPECTRA parameter.
stats_lipo = lipo_spectra.return_all_split_stats()
stats_lipo_df = pd.DataFrame(stats_lipo).sort_values(by='SPECTRA_parameter', ascending=True)

# Text report: one three-line entry per split, followed by a blank line.
print('SPECTRA Splits on Lipo', '----------------------', sep='\n')
for index, row in stats_lipo_df.iterrows():
  print(
    f'SPECTRA parameter: {row["SPECTRA_parameter"]}',
    f'Train size: {row["train_size"]} | Test size: {row["test_size"]}',
    f'Cross split overlap: {row["cross_split_overlap"]} \n',
    sep='\n',
  )

In [None]:
# Cross-split overlap as a function of the SPECTRA parameter for Lipo.
# `plt` comes from the notebook's top import cell; the mid-notebook import that
# used to live here left every earlier plotting cell without `plt` on a fresh run.
fig, ax = plt.subplots()
ax.scatter(stats_lipo['SPECTRA_parameter'], stats_lipo['cross_split_overlap'])
ax.set(xlabel='SPECTRA parameter', ylabel='Cross split overlap', title='SPECTRA Splits of Lipo')
plt.show()

In [None]:
# Train and test split sizes as a function of the SPECTRA parameter for Lipo.
fig, ax = plt.subplots()
for size_column in ('train_size', 'test_size'):
  ax.scatter(stats_lipo['SPECTRA_parameter'], stats_lipo[size_column])
ax.set(xlabel='SPECTRA parameter', ylabel='Split size', title='SPECTRA Splits of Lipo')
# Legend labels follow the plotting order above: train first, then test.
ax.legend(['Train', 'Test'])
plt.show()

In [None]:
# Load Tox21 (unsplit, reload=False) and keep the `ids` of the first returned
# dataset. The loader output is stored as `tox21_datasets` because
# `tox21_dataset` is later rebound to the molnet_dataset wrapper; sharing one
# name made this cell fail when re-run after the wrapper had been created.
tasks, tox21_datasets, transformers = dc.molnet.load_tox21(splitter=None, reload=False)
tox21_smiles = tox21_datasets[0].ids

# Morgan fingerprints (radius 2, 1024 bits) for every Tox21 SMILES, in input order.
# The generator does not depend on the molecule, so it is built once instead
# of once per loop iteration.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
mfp_tox21 = []
for smiles in tox21_smiles:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for unparsable input; report the offending
        # SMILES instead of an opaque argument error from GetFingerprint.
        raise ValueError(f'Could not parse Tox21 SMILES: {smiles!r}')
    mfp_tox21.append(morgan_generator.GetFingerprint(mol))

In [None]:
# Wrap the Tox21 fingerprints, build the Tanimoto-based SPECTRA object and
# pre-compute its spectral properties (force_recalculate left at its default).
tox21_dataset = molnet_dataset(mfp_tox21, 'tox21')
tox21_spectra = molnet_tanimoto_spectra(tox21_dataset, binary=False)
tox21_spectra.pre_calculate_spectra_properties('tox21')

In [None]:
# Generate the SPECTRA splits for Tox21 with the shared `spectra_parameters`
# keyword arguments (repeats, seeds, spectral-parameter grid, force_reconstruct).
tox21_spectra.generate_spectra_splits(**spectra_parameters)