In [5]:
import pandas as pd
import numpy as np
import os
# from rdkit import Chem
# from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
# from gensim.models import word2vec

import warnings
warnings.filterwarnings("ignore")


In [4]:
def getMolDescriptors(mol, missingVal=None):
    res = {}
    for nm,fn in Chem.Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [2]:
path = '/Users/samueljon/Desktop/ODP/2024/Winter/drug-exp-forecast/drug_exp_compatibility'
data = pd.read_csv(os.path.join(path,'dataset.csv'))
w2vec_model = word2vec.Word2Vec.load(os.path.join(path,'model_300dim.pkl'))

In [None]:
#### 2D Descriptors Features

API = []
exp = []
for smiles in data["API_Smiles"]: API.append(getMolDescriptors(Chem.MolFromSmiles(smiles)))    
for smiles in data["Excipient_Smiles"]: exp.append(getMolDescriptors(Chem.MolFromSmiles(smiles)))
data_2D = pd.concat([pd.DataFrame(API), pd.DataFrame(exp), data], axis=1)
print(data_2D.shape)
#%%
# Save the concatenated data to a CSV file
data_2D.to_csv('2D_data.csv', index=False)

In [8]:
#### Mol2Vec Features

data['mol_API'] = data['API_Smiles'].apply(lambda x: Chem.MolFromSmiles(x))
data['mol_Excipient'] = data['Excipient_Smiles'].apply(lambda x: Chem.MolFromSmiles(x)) 
data['mol_Excipient'] = data['mol_Excipient'].apply(lambda x: Chem.AddHs(x))
data['mol_API'] = data['mol_API'].apply(lambda x: Chem.AddHs(x))

data['sentence_API'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['mol_API'], 1)), axis=1)
data['mol2vec_API'] = [DfVec(x) for x in sentences2vec(data['sentence_API'], w2vec_model, unseen='UNK')]
data['sentence_Excipient'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['mol_Excipient'], 1)), axis=1)
data['mol2vec_Excipient'] = [DfVec(x) for x in sentences2vec(data['sentence_Excipient'], w2vec_model, unseen='UNK')]

X1 = np.array([x.vec for x in data['mol2vec_API']])  
X2 = np.array([y.vec for y in data['mol2vec_Excipient']])
X = pd.concat((pd.DataFrame(X1), pd.DataFrame(X2), data.drop(['mol2vec_API','mol2vec_Excipient', 'sentence_Excipient', 
                                                                'mol_API', 'mol_Excipient','sentence_API'], axis=1)), axis=1)

X.to_csv('mol2vec_data.csv', sep=',', encoding='utf-8', index=False)

In [11]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,API_CID,Excipient_CID,Outcome1,API_Smiles,Excipient_Smiles
0,-2.884008,2.712215,-5.476322,0.577703,-2.726916,-6.762939,5.555508,-1.646141,-6.55519,-13.648162,...,-4.932941,1.45099,2.979944,-2.0252,0.308887,196,24434,1,C(CCC(=O)O)CC(=O)O,[N+](=O)([O-])[O-].[K+]
1,-2.884008,2.712215,-5.476322,0.577703,-2.726916,-6.762939,5.555508,-1.646141,-6.55519,-13.648162,...,-3.780702,2.85756,1.646724,-2.370362,-1.741092,196,516875,1,C(CCC(=O)O)CC(=O)O,[O-][Mn](=O)(=O)=O.[K+]
2,-2.884008,2.712215,-5.476322,0.577703,-2.726916,-6.762939,5.555508,-1.646141,-6.55519,-13.648162,...,-2.16777,1.297471,0.978477,-0.781948,-0.193472,196,24823,1,C(CCC(=O)O)CC(=O)O,[O-][O+]=O
3,-2.884008,2.712215,-5.476322,0.577703,-2.726916,-6.762939,5.555508,-1.646141,-6.55519,-13.648162,...,-1.864838,1.561008,1.121663,-1.217376,-1.022901,196,14793,1,C(CCC(=O)O)CC(=O)O,O=[Pb]=O
4,-3.841235,2.74777,-4.950537,-0.578358,-2.076167,-3.756751,3.710935,-1.331131,-4.122436,-10.464413,...,-2.292982,1.159315,-0.214513,-1.163031,-0.769442,243,14798,1,C1=CC=C(C=C1)C(=O)O,[OH-].[Na+]


In [6]:
mol2vec_df = pd.read_csv("mol2vec_data.csv")
_2d_df = pd.read_csv("2D_data.csv")

## For now, simple way to remove NaN is just dropping columns that have it
## In the future when using other data, will want to improve this as there will be a lot of NaN most likely 
remove_nan = set([i.split('.1')[0] for i in _2d_df.loc[:, _2d_df.isnull().any()].columns])
remove_nan = list(remove_nan) + [i + ".1" for i in remove_nan]
_2d_df = _2d_df.drop(columns=remove_nan)

api_features_mol2vec = mol2vec_df.iloc[:, :50].values
excipient_features_mol2vec = mol2vec_df.iloc[:, 50:100].values
outcomes_mol2vec = mol2vec_df['Outcome1'].values

mol2vec_data = [(api_features, excipient_features, outcome) 
                for api_features, excipient_features, outcome in zip(api_features_mol2vec, excipient_features_mol2vec, outcomes_mol2vec)]

# The number of feature columns for API (and similarly for excipients) is (total columns - 5 metadata columns) / 2
num_feature_columns = (_2d_df.shape[1] - 5) // 2

# Extract API features, excipient features, and outcomes from 2D dataset
api_features_2d = _2d_df.iloc[:, :num_feature_columns].values
excipient_features_2d = _2d_df.iloc[:, num_feature_columns:2*num_feature_columns].values
outcomes_2d = _2d_df['Outcome1'].values

# Combine into tuples for the dataset
_2d_data = [(api_features, excipient_features, outcome) 
            for api_features, excipient_features, outcome in zip(api_features_2d, excipient_features_2d, outcomes_2d)]


In [40]:
_2d_df.head(5)

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_thiazole.1,fr_thiocyan.1,fr_thiophene.1,fr_unbrch_alkane.1,fr_urea.1,API_CID,Excipient_CID,Outcome1,API_Smiles,Excipient_Smiles
0,9.896552,-0.87001,9.896552,0.062778,0.560077,146.142,136.062,146.057909,58,0,...,0,0,0,0,0,196,24434,1,C(CCC(=O)O)CC(=O)O,[N+](=O)([O-])[O-].[K+]
1,9.896552,-0.87001,9.896552,0.062778,0.560077,146.142,136.062,146.057909,58,0,...,0,0,0,0,0,196,516875,1,C(CCC(=O)O)CC(=O)O,[O-][Mn](=O)(=O)=O.[K+]
2,9.896552,-0.87001,9.896552,0.062778,0.560077,146.142,136.062,146.057909,58,0,...,0,0,0,0,0,196,24823,1,C(CCC(=O)O)CC(=O)O,[O-][O+]=O
3,9.896552,-0.87001,9.896552,0.062778,0.560077,146.142,136.062,146.057909,58,0,...,0,0,0,0,0,196,14793,1,C(CCC(=O)O)CC(=O)O,O=[Pb]=O
4,10.200926,-0.879074,10.200926,0.331019,0.610604,122.123,116.075,122.036779,46,0,...,0,0,0,0,0,243,14798,1,C1=CC=C(C=C1)C(=O)O,[OH-].[Na+]


In [31]:
combined_features = [
    (np.concatenate([api_m, api_2d]), np.concatenate([exc_m, exc_2d]), outcome) 
    for (api_m, exc_m, outcome), (api_2d, exc_2d, _) in zip(mol2vec_data, _2d_data)
]

In [39]:
len(combined_features[0][1])

258

In [41]:
print(_2d_df.head())

   MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0        9.896552       -0.870010           9.896552           0.062778   
1        9.896552       -0.870010           9.896552           0.062778   
2        9.896552       -0.870010           9.896552           0.062778   
3        9.896552       -0.870010           9.896552           0.062778   
4       10.200926       -0.879074          10.200926           0.331019   

        qed    MolWt  HeavyAtomMolWt  ExactMolWt  NumValenceElectrons  \
0  0.560077  146.142         136.062  146.057909                   58   
1  0.560077  146.142         136.062  146.057909                   58   
2  0.560077  146.142         136.062  146.057909                   58   
3  0.560077  146.142         136.062  146.057909                   58   
4  0.610604  122.123         116.075  122.036779                   46   

   NumRadicalElectrons  ...  fr_thiazole.1  fr_thiocyan.1  fr_thiophene.1  \
0                    0  ...      