In [106]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [107]:
df_data = pd.read_csv('EGFR_Feature_Extraction.csv')
df_data.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,41.0,active
1,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,9300.0,inactive
2,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,500000.0,inactive
3,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,3000000.0,inactive
4,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,96000.0,inactive


In [108]:
df = df_data.drop(df_data.columns[-2], axis=1, inplace=True)
df = df_data.iloc[:, 2:]
print("Null values: " + str(df.isnull().values.any()))
print(df['class'].value_counts())
df.head()

Null values: False
class
active      3810
inactive    3524
Name: count, dtype: int64


Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,class
0,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,active
1,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,inactive
2,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,inactive
3,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,inactive
4,215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,inactive


In [109]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [110]:
le = LabelEncoder()
y = le.fit_transform(y)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.values
X_test = X_test.values

In [112]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [113]:
num_features = X_train.shape[1]
num_classes = len(le.classes_)
print("Features: " + str(num_features))
print("Classes: " + str(num_classes))

Features: 13
Classes: 2


In [114]:
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [115]:
class TabTransformer(nn.Module):
    def __init__(self, num_features, num_classes, dim_embedding=128, num_heads=8, num_layers=4):
        super(TabTransformer, self).__init__()
        self.embedding = nn.Linear(num_features, dim_embedding)
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(dim_embedding, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Adding a sequence length dimension
        x = self.transformer(x)
        x = torch.mean(x, dim=1)  # Pooling
        x = self.classifier(x)
        return x

In [116]:
model = TabTransformer(num_features, num_classes).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [117]:
X_train = torch.tensor(X_train, dtype=torch.float32).clone().detach()
X_train = X_train.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
y_train = torch.tensor(y_train, dtype=torch.long).clone().detach()
y_train = y_train.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

for epoch in range(100):
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 0.8249430656433105
Epoch 10, Loss: 0.6376112699508667
Epoch 20, Loss: 0.603241503238678
Epoch 30, Loss: 0.5557702779769897
Epoch 40, Loss: 0.5320354700088501
Epoch 50, Loss: 0.5194011926651001
Epoch 60, Loss: 0.5054837465286255
Epoch 70, Loss: 0.4937035143375397
Epoch 80, Loss: 0.47485655546188354
Epoch 90, Loss: 0.44559887051582336


In [118]:
model.eval()
X_test = torch.tensor(X_test, dtype=torch.float32)
X_test = X_test.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
y_test = torch.tensor(y_test, dtype=torch.long)
y_test = y_test.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

with torch.no_grad():
    predictions = model(X_test)
    _, predicted_classes = torch.max(predictions, 1)
    accuracy = (predicted_classes == y_test).float().mean()
    print(f'Test Accuracy: {accuracy.item()}')

  X_test = torch.tensor(X_test, dtype=torch.float32)


Test Accuracy: 0.7552829384803772


  y_test = torch.tensor(y_test, dtype=torch.long)
