In [1]:
import pandas as pd
import numpy as np
import rdkit
import duckdb
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

In [53]:
#Reading only 100,000 rows for the initial training and playground stuff
train_df = pd.read_csv(r"C:\dta_genes\train.csv", nrows=100000)

In [3]:
#Converting to RDKit molecules
train_df['molecule'] = train_df['molecule_smiles'].apply(Chem.MolFromSmiles)

In [4]:
# Generate ECFPs
def generate_ecfp(molecule, radius=2, bits=1024):
    if molecule is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

train_df['ecfp'] = train_df['molecule'].apply(generate_ecfp)

In [5]:
train_df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,ecfp
0,0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x00000279CBB...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x00000279CBB...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x00000279CBB...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x00000279CBB...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x00000279CBB...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
# One hot encoding protein name
one_hot_encoded = pd.get_dummies(train_df['protein_name'], prefix='Protein: ')
one_hot_encoded = one_hot_encoded.astype(int)
ndfn = train_df.drop('protein_name', axis=1)
ndf = pd.concat([ndfn, one_hot_encoded], axis=1)

In [7]:
print(ndf.columns)

Index(['id', 'buildingblock1_smiles', 'buildingblock2_smiles',
       'buildingblock3_smiles', 'molecule_smiles', 'binds', 'molecule', 'ecfp',
       'Protein: _BRD4', 'Protein: _HSA', 'Protein: _sEH'],
      dtype='object')


In [11]:
list_lengths = ndf['ecfp'].apply(len)
print(list_lengths.unique())

[1024]


In [12]:
# Convert the list column into separate columns
expanded_df = pd.DataFrame(ndf['ecfp'].to_list(), columns=[f'ecfp_{i+1}' for i in range(1024)])

# Combine the expanded DataFrame with the original DataFrame
result_df = pd.concat([ndf, expanded_df], axis=1)

# Drop the original 'ecfp' column
result_df.drop(columns=['ecfp'], inplace=True)


In [30]:
# Filter numeric columns
numeric_columns = result_df.select_dtypes(include='number').columns

# Keep only numeric columns
df_numeric = result_df[numeric_columns]

# Alternatively, drop non-numeric columns
df_numeric = result_df.select_dtypes(include='number')

# If you want to modify the original DataFrame in-place, you can use the inplace parameter
# df.drop(df.select_dtypes(exclude='number').columns, axis=1, inplace=True)

# Display the DataFrame with only numeric columns
print(df_numeric)

          id  binds  Protein: _BRD4  Protein: _HSA  Protein: _sEH  ecfp_1  \
0          0      0               1              0              0       0   
1          1      0               0              1              0       0   
2          2      0               0              0              1       0   
3          3      0               1              0              0       0   
4          4      0               0              1              0       0   
...      ...    ...             ...            ...            ...     ...   
99995  99995      0               0              0              1       0   
99996  99996      0               1              0              0       0   
99997  99997      0               0              1              0       0   
99998  99998      0               0              0              1       0   
99999  99999      0               1              0              0       0   

       ecfp_2  ecfp_3  ecfp_4  ecfp_5  ...  ecfp_1015  ecfp_1016  ecfp_1017

In [31]:
X = df_numeric.drop(columns=['id','binds'])
y = df_numeric[['binds']]

In [32]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
print(X_test.columns)

Index(['Protein: _BRD4', 'Protein: _HSA', 'Protein: _sEH', 'ecfp_1', 'ecfp_2',
       'ecfp_3', 'ecfp_4', 'ecfp_5', 'ecfp_6', 'ecfp_7',
       ...
       'ecfp_1015', 'ecfp_1016', 'ecfp_1017', 'ecfp_1018', 'ecfp_1019',
       'ecfp_1020', 'ecfp_1021', 'ecfp_1022', 'ecfp_1023', 'ecfp_1024'],
      dtype='object', length=1027)


In [35]:
# 3. Define the Model
model = Sequential()

# Input layer
model.add(Dense(128, input_dim=1027, activation='relu'))
model.add(Dropout(0.3))

# Hidden layers
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# 4. Compile the Model
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# 5. Train the Model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(X_train, y_train, 
                    validation_split=0.2, 
                    epochs=100, 
                    batch_size=32, 
                    callbacks=[early_stopping])

# 6. Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9963 - loss: 0.0350 - val_accuracy: 0.9974 - val_loss: 0.0161
Epoch 2/100
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9977 - loss: 0.0185 - val_accuracy: 0.9974 - val_loss: 0.0172
Epoch 3/100
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9974 - loss: 0.0159 - val_accuracy: 0.9974 - val_loss: 0.0223
Epoch 4/100
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9976 - loss: 0.0158 - val_accuracy: 0.9974 - val_loss: 0.0188
Epoch 5/100
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9978 - loss: 0.0147 - val_accuracy: 0.9974 - val_loss: 0.0189
Epoch 6/100
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9974 - loss: 0.0154 - val_accuracy: 0.9974 - val_loss: 0.0326
Epoch 7/10

In [40]:
# Make predictions on the test set
y_pred_proba = model.predict(X_test)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 827us/step


In [44]:
print(y_pred_proba)
print(y_test)

[[4.7577636e-05]
 [3.1310352e-04]
 [2.1947371e-03]
 ...
 [5.5799028e-05]
 [2.4714967e-04]
 [4.4312347e-02]]
       binds
75721      0
80184      0
19864      0
76699      0
92991      0
...      ...
32595      0
29313      0
37862      0
53421      0
42410      1

[20000 rows x 1 columns]


In [43]:
# Calculate the mean average precision
map_score = average_precision_score(y_test, y_pred_proba)
print(f"Mean Average Precision (mAP): {map_score:.2f}")

Mean Average Precision (mAP): 0.06
