In [175]:
import mordred
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from mordred import Calculator, descriptors
from rdkit import Chem

In [176]:
# Load the full training table; later cells read its `solvent_name`,
# `solute_smiles` and `solvent_smiles` columns.
main = pd.read_csv("./data/train_dm300_full.csv")

In [139]:
# Short solvent codes: they key `solvent_dict` and appear in the per-solvent
# CSV file names used further down.
solvents = [
    "MeOH", "EtOH", "IPA", "Tol", "Hept", "Acet",
    "MeCN", "MTBE", "MeTHF", "DMF", "EtOAc",
]

# Keys are matched against main['solvent_name'] below, so their spelling
# (including "2-Methly ..." and "Etyl ...") must stay exactly as written.
# Values are line-wrapped display labels; they are not used in this cell.
solvent_names = {
    "Methanol": "Methanol",
    "Ethanol": "Ethanol",
    "2-Propanol": "2-Propanol",
    "Toluene": "Toluene",
    "Heptane": "Heptane",
    "Acetone": "Acetone",
    "Acetonitrile": "Acetonitrile",
    "Methyl tert butyl ether": "Methyl\ntert-butyl ether",
    "2-Methly tetrahydrofurane": "2-Methyl\ntetrahydrofurane",
    "Dimethyl formamide": "Dimethyl\nformamide",
    "Etyl acetate": "Ethyl\nacetate",
}

# `solvents` and `solvent_names` are paired purely by position, so both must
# list the solvents in the same order.
solvent_dict = {}
for solvent, name in zip(solvents, solvent_names):
    solvent_dict[solvent] = main.loc[main['solvent_name'] == name]

In [140]:
def descriptors_calculator(key: str, dframe: "dict[str, pd.DataFrame]"):
    """Compute Mordred descriptors for one solvent subset and save them to CSV.

    The solutes of ``dframe[key]`` are parsed with RDKit and run through the
    Mordred calculator. The result is cast to float32 and every descriptor
    column containing a NaN is dropped. The surviving columns are joined onto
    the subset and written to ``./data/pls/raw/<key>_descriptors.csv``.

    Parameters
    ----------
    key : str
        Entry of ``dframe`` to process; also the prefix of the output file name.
    dframe : dict of str -> pandas.DataFrame
        Per-solvent tables. Despite the parameter name this is a mapping, not a
        single frame; ``dframe[key]`` must provide a ``solute_smiles`` column.

    Returns
    -------
    None
        The joined table is written to disk and ``"<key>: <n>"`` is printed,
        where ``n`` is the number of descriptor columns kept.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the solute SMILES strings.
    """
    subset = dframe[key]
    smiles = subset["solute_smiles"]

    mol_lst = [Chem.MolFromSmiles(smile) for smile in smiles]
    # MolFromSmiles reports a parse failure by returning None. Stop here and
    # name the offending strings instead of handing None to the calculator.
    unparsed = [smile for smile, mol in zip(smiles, mol_lst) if mol is None]
    if unparsed:
        raise ValueError(
            f"{key}: RDKit could not parse {len(unparsed)} solute SMILES, "
            f"e.g. {unparsed[:5]}"
        )

    calc = Calculator(descriptors, ignore_3D=False)
    raw = calc.pandas(mol_lst)

    # Round-trip through a float32 array so the table is purely numeric, and
    # re-label the rows with the subset's index so the join below pairs each
    # molecule with the row it came from.
    numeric = pd.DataFrame(
        raw.values.astype('float32'),
        columns=raw.columns.tolist(),
        index=subset.index,
    )
    descriptors_lst_solute = numeric.dropna(axis=1)

    total_descriptors = subset.join(descriptors_lst_solute)
    total_descriptors.to_csv(f'./data/pls/raw/{key}_descriptors.csv', sep=',')
    print(f"{key}: {len(descriptors_lst_solute.columns)}")


In [141]:
# Write one descriptor CSV per solvent subset.
for key in solvents:
    descriptors_calculator(key=key, dframe=solvent_dict)

100%|██████████| 417/417 [00:11<00:00, 36.12it/s]


MeOH: 1243


100%|██████████| 56/56 [00:02<00:00, 19.96it/s]


EtOH: 1292


100%|██████████| 57/57 [00:02<00:00, 20.99it/s]

IPA: 1292



100%|██████████| 48/48 [00:01<00:00, 26.62it/s]

Tol: 1295



100%|██████████| 49/49 [00:01<00:00, 28.43it/s]

Hept: 1243



100%|██████████| 96/96 [00:03<00:00, 28.09it/s]


Acet: 1292


100%|██████████| 146/146 [00:04<00:00, 30.07it/s]


MeCN: 1292


100%|██████████| 51/51 [00:02<00:00, 19.12it/s]

MTBE: 1292



100%|██████████| 59/59 [00:02<00:00, 20.77it/s]

MeTHF: 1292



100%|██████████| 123/123 [00:04<00:00, 26.61it/s]


DMF: 1243


100%|██████████| 65/65 [00:02<00:00, 25.81it/s]

EtOAc: 1292





In [136]:
# One-off run for the 'MeCN' subset that keeps every Mordred column
# (no float32 cast and no NaN-column drop in this cell).
calc = Calculator(descriptors, ignore_3D=False)
mol_lst = list(map(Chem.MolFromSmiles, solvent_dict['MeCN']["solute_smiles"]))
descriptors_lst_solute = calc.pandas(mol_lst)

# Give the descriptor rows the same index labels as the subset so the join
# pairs each molecule with its source row.
indx = pd.Series(solvent_dict['MeCN'].index)
descriptors_lst_solute.index = pd.Index(indx)
total_descriptors = solvent_dict['MeCN'].join(descriptors_lst_solute)

100%|██████████| 148/148 [00:05<00:00, 29.45it/s]


In [137]:
# Persist the unfiltered MeCN table built in the previous cell.
total_descriptors.to_csv(path_or_buf="./data/pls/raw/mecn_full.csv")

In [145]:
# Despite its name this is a dict: solvent code -> frame read from
# ./data/pls/removed/<code>.csv.
reduced_solvent_list = dict()
for solvent in solvents:
    reduced_path = f"./data/pls/removed/{solvent}.csv"
    reduced_solvent_list[solvent] = pd.read_csv(reduced_path)

In [161]:
# Empty placeholder carrying the column layout of the merged table; the name
# is rebound to the concatenated per-solvent frames in a later cell.
merged_solvents = pd.DataFrame(
    columns=[
        "name",
        "container",
        "cas",
        "mw",
        "solute_smiles",
        "solvent_smiles",
        "dm300",
        "dm300_error",
    ]
)

In [166]:
# Keep only the shared bookkeeping/target columns from every reduced
# per-solvent frame.
merg = [
    df.loc[
        :,
        [
            "name",
            "container",
            "cas",
            "mw",
            "solute_smiles",
            "solvent_smiles",
            "dm300",
            "dm300_error",
        ],
    ]
    for df in reduced_solvent_list.values()
]

In [171]:
# Stack the per-solvent frames row-wise; each frame keeps its own index
# labels (no re-numbering).
merged_solvents = pd.concat(merg, axis=0)

In [174]:
# Save the merged, filtered table.
merged_solvents.to_csv(path_or_buf="./data/filtered_dm300.csv")

In [177]:
# Mordred descriptors for every solute in `main`.
calc = Calculator(descriptors, ignore_3D=False)
mol_lst = list(map(Chem.MolFromSmiles, main["solute_smiles"]))
descriptors_lst_solute = calc.pandas(mol_lst)
cols = list(descriptors_lst_solute.columns)

# Cast the whole table to float32, then discard every descriptor column that
# contains a NaN for any molecule.
descriptors_lst_solute = pd.DataFrame(
    descriptors_lst_solute.values.astype('float32'),
    columns=cols,
).dropna(axis="columns")



100%|██████████| 1167/1167 [00:39<00:00, 29.25it/s]
100%|██████████| 1167/1167 [00:39<00:00, 29.23it/s]


In [182]:

# Mordred descriptors for the solvent of every row in `main`.
calc = Calculator(descriptors, ignore_3D=False)
mol_lst = list(map(Chem.MolFromSmiles, main["solvent_smiles"]))
descriptors_lst_solvent = calc.pandas(mol_lst)
cols = list(descriptors_lst_solvent.columns)

# Cast the whole table to float32, then discard every descriptor column that
# contains a NaN for any row.
descriptors_lst_solvent = pd.DataFrame(
    descriptors_lst_solvent.values.astype('float32'),
    columns=cols,
).dropna(axis="columns")


100%|██████████| 1167/1167 [00:19<00:00, 60.23it/s]


In [184]:
# Number of solute descriptor columns that survived the NaN filter.
descriptors_lst_solute.shape[1]

1243

In [185]:
# Number of solvent descriptor columns that survived the NaN filter.
descriptors_lst_solvent.shape[1]


1183

In [191]:
# Place solute and solvent descriptors side by side (solute columns first).
# NOTE(review): both frames are built with the same Mordred descriptor set, so
# the combined frame most likely holds repeated column names. Consider
# prefixing them (e.g. "solute_" / "solvent_"), but confirm first that nothing
# downstream relies on the current names.
total_descriptors = pd.concat(
    [descriptors_lst_solute, descriptors_lst_solvent],
    axis="columns",
)

In [197]:
# Attach the descriptor columns to the original table, pairing rows by index
# label. Note that this rebinds `total_descriptors`.
total_descriptors = main.join(total_descriptors, how="left")

In [199]:
# Save the combined table (original columns + solute + solvent descriptors).
total_descriptors.to_csv(path_or_buf="./data/total_descriptors.csv")

In [200]:
# Total column count of the combined table.
total_descriptors.shape[1]

2437