In [1]:
import numpy as np
import pandas as pd
import json
from torchtext.transforms import CLIPTokenizer
from torchtext.transforms import PadTransform
import torch
from torch.utils.data import Dataset, DataLoader
from ast import literal_eval
from sklearn import preprocessing
import sqlite3
import torch.nn as nn
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')



In [2]:
df_drug = pd.read_pickle('/kaggle/input/datasetname/df.pkl')
conn = sqlite3.connect("/kaggle/input/datasetname/event.db")
extraction = pd.read_sql('select * from extraction;', conn)
extraction.drop(columns=['index'], inplace = True)
df_drug.drop(columns=['id', 'index'], inplace = True)

def feature_extractor(df, f_list):
    for feature in f_list:
        unique = set('|'.join(df[feature].values.tolist()).split('|'))

        for side in unique:
            df[side] = 0

        for index, row in df.iterrows():
            for side in row[feature].split('|'):
                df.at[index, side] = 1
    df.drop(columns=f_list, inplace=True)

feature_extractor(df_drug, ['side', 'target', 'enzyme', 'pathway', 'smile'])

extraction['side'] = extraction['mechanism'] + extraction['action']
extraction.drop(columns=['mechanism', 'action'], inplace=True)
le = preprocessing.LabelEncoder()
extraction['side'] = le.fit_transform(extraction['side'])

In [3]:
class DDIDataset(Dataset):
    def __init__(self, df, extraction):
        self.extraction = extraction
        self.df = df

    def __len__(self):
        return len(self.extraction)

    def __getitem__(self, idx):
        drugA = torch.tensor(self.df[self.df['name'] == self.extraction.loc[idx]['drugA']].drop(columns=['name']).values.astype('float32'))
        drugB = torch.tensor(self.df[self.df['name'] == self.extraction.loc[idx]['drugB']].drop(columns=['name']).values.astype('float32'))
        return torch.cat([(drugA), (drugB)]).flatten(), self.extraction.loc[idx]['side']

In [4]:
class AdversarialAutoencoder(nn.Module):
    def __init__(self):
        super(AdversarialAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(25658, 4096),
            nn.BatchNorm1d(4096),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(4096, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2048, 4096),
            nn.BatchNorm1d(4096),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(4096, 25658),
            nn.Sigmoid()
        )
        self.discriminator = nn.Sequential(
            nn.Linear(2048, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

        self.classifier = nn.Sequential(
            nn.Linear(2048, 512 + 256),
            nn.BatchNorm1d(512 + 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512 + 256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 65),
            nn.LogSoftmax()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        classification = self.classifier(encoded)

        random_latent = torch.randn_like(encoded)

        real_output = self.discriminator(encoded)
        fake_output = self.discriminator(random_latent)

        return decoded, classification, real_output, fake_output

In [5]:
def train_adversarial_autoencoder(model, train_loader, test_loader, decoder_criterion, classifier_criterion, adversarial_criterion, optimizer, num_epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    for epoch in range(num_epochs):
        model.train() # Set model to train mode
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (training)", leave=False)
        running_loss = 0.0
        running_decoder_loss = 0.0
        running_classifier_loss = 0.0
        running_correct = 0
        running_total = 0
        for i, (inputs, labels) in enumerate(train_pbar):
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            reconstructed_inputs, predicted_labels, real_output, fake_output = model(inputs)
            decoder_loss = decoder_criterion(reconstructed_inputs, inputs)
            classifier_loss = classifier_criterion(predicted_labels, labels)
            
            real_labels = torch.ones_like(real_output)
            fake_labels = torch.zeros_like(fake_output)
            adversarial_loss = adversarial_criterion(real_output, real_labels) + adversarial_criterion(fake_output, fake_labels)
            
            loss = 0.5 * decoder_loss + classifier_loss - adversarial_loss
            
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            running_decoder_loss += decoder_loss.item()
            running_classifier_loss += classifier_loss.item()
            _, predicted = torch.max(predicted_labels.data, 1)
            running_total += labels.size(0)
            running_correct += (predicted == labels).sum().item()
            train_pbar.set_postfix({"loss": running_loss / (i+1),
                                    "decoder_loss": running_decoder_loss / (i+1),
                                    "classifier_loss": running_classifier_loss / (i+1),
                                    "accuracy": 100 * running_correct / running_total})
            
            optimizer.zero_grad()
            random_latent = torch.randn_like(real_output)
            real_loss = adversarial_criterion(real_output, real_labels)
            fake_loss = adversarial_criterion(fake_output, fake_labels)
            discriminator_loss = real_loss + fake_loss
            
            optimizer.step()

        epoch_loss = running_loss / len(train_loader)
        epoch_decoder_loss = running_decoder_loss / len(train_loader)
        epoch_classifier_loss = running_classifier_loss / len(train_loader)
        epoch_train_accuracy = 100 * running_correct / running_total
        train_pbar.set_postfix({"loss": epoch_loss,
                                "decoder_loss": epoch_decoder_loss,
                                "classifier_loss": epoch_classifier_loss,
                                "accuracy": epoch_train_accuracy})
        
        model.eval() 
        test_running_loss = 0.0
        test_running_correct = 0
        test_running_total = 0
        test_pbar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} (testing)", leave=False)
        with torch.no_grad():
            for inputs, labels in test_pbar:
                inputs = inputs.to(device)
                labels = labels.to(device)
                reconstructed_inputs, predicted_labels, _, _ = model(inputs)
                decoder_loss = decoder_criterion(reconstructed_inputs, inputs)
                classifier_loss = classifier_criterion(predicted_labels, labels)
                loss = decoder_loss + classifier_loss
                test_running_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(predicted_labels.data, 1)
                test_running_total += labels.size(0)
                test_running_correct += (predicted == labels).sum().item()
            test_accuracy = 100 * test_running_correct / test_running_total
            test_loss = test_running_loss / len(test_loader.dataset)
        test_pbar.set_postfix({"loss": test_loss,
                               "accuracy": test_accuracy})
        
        print('Epoch [{}/{}], Train Loss: {:.4f}, Train Autoencoder Loss: {:.4f}, Train Classification Loss: {:.4f}, Train Accuracy: {:.2f}%, Test Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(epoch+1, num_epochs, epoch_loss, epoch_decoder_loss, epoch_classifier_loss, epoch_train_accuracy, test_loss, test_accuracy))

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import KFold

class AdversarialAutoencoderTrainer:
    def __init__(self, df_drug, extraction, num_folds=5, batch_size=64, lr=0.001, num_epochs=20):
        self.df_drug = df_drug
        self.extraction = extraction
        self.num_folds = num_folds
        self.batch_size = batch_size
        self.lr = lr
        self.num_epochs = num_epochs
        
    def train(self):
        dataset = DDIDataset(self.df_drug, self.extraction)
        kf = KFold(n_splits=self.num_folds, shuffle=True, random_state=42)
        fold = 1
        for train_index, test_index in kf.split(dataset):
            train_dataset = torch.utils.data.Subset(dataset, train_index)
            test_dataset = torch.utils.data.Subset(dataset, test_index)
            train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
            model = AdversarialAutoencoder()
            decoder_criterion = nn.MSELoss() 
            adversarial_criterion = nn.MSELoss()
            classifier_criterion = nn.CrossEntropyLoss() 
            optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
            print("Training fold {}...".format(fold))
            train_adversarial_autoencoder(model, train_loader, test_loader, decoder_criterion, classifier_criterion, adversarial_criterion, optimizer, self.num_epochs)
            fold += 1

In [None]:
trainer = AdversarialAutoencoderTrainer(df_drug, extraction)
trainer.train()

Training fold 1...


                                                                                                                                                

Epoch [1/20], Train Loss: -0.3980, Train Autoencoder Loss: 0.0208, Train Classification Loss: 1.3480, Train Accuracy: 63.61%, Test Loss: 0.9629, Test Accuracy: 71.90%


                                                                                                                                               

Epoch [2/20], Train Loss: -1.1230, Train Autoencoder Loss: 0.0165, Train Classification Loss: 0.7941, Train Accuracy: 75.32%, Test Loss: 0.7517, Test Accuracy: 77.02%


                                                                                                                                               

Epoch [3/20], Train Loss: -1.3046, Train Autoencoder Loss: 0.0147, Train Classification Loss: 0.6204, Train Accuracy: 80.08%, Test Loss: 0.6443, Test Accuracy: 79.94%


                                                                                                                                               

Epoch [4/20], Train Loss: -1.4370, Train Autoencoder Loss: 0.0133, Train Classification Loss: 0.5087, Train Accuracy: 83.04%, Test Loss: 0.5971, Test Accuracy: 81.09%


                                                                                                                                               

Epoch [5/20], Train Loss: -1.5254, Train Autoencoder Loss: 0.0122, Train Classification Loss: 0.4249, Train Accuracy: 85.57%, Test Loss: 0.5756, Test Accuracy: 82.11%


                                                                                                                                               

Epoch [6/20], Train Loss: -1.5915, Train Autoencoder Loss: 0.0113, Train Classification Loss: 0.3624, Train Accuracy: 87.66%, Test Loss: 0.5547, Test Accuracy: 82.80%


                                                                                                                                               

Epoch [7/20], Train Loss: -1.6413, Train Autoencoder Loss: 0.0106, Train Classification Loss: 0.3141, Train Accuracy: 89.36%, Test Loss: 0.5346, Test Accuracy: 83.62%


                                                                                                                                                

Epoch [8/20], Train Loss: -1.6759, Train Autoencoder Loss: 0.0099, Train Classification Loss: 0.2833, Train Accuracy: 90.33%, Test Loss: 0.5224, Test Accuracy: 84.60%


                                                                                                                                                

Epoch [9/20], Train Loss: -1.7090, Train Autoencoder Loss: 0.0094, Train Classification Loss: 0.2503, Train Accuracy: 91.39%, Test Loss: 0.5240, Test Accuracy: 84.57%


                                                                                                                                                 

Epoch [10/20], Train Loss: -1.7408, Train Autoencoder Loss: 0.0088, Train Classification Loss: 0.2217, Train Accuracy: 92.22%, Test Loss: 0.5018, Test Accuracy: 85.87%


                                                                                                                                                 

Epoch [11/20], Train Loss: -1.7650, Train Autoencoder Loss: 0.0084, Train Classification Loss: 0.1953, Train Accuracy: 93.25%, Test Loss: 0.5386, Test Accuracy: 84.60%


                                                                                                                                                 

Epoch [12/20], Train Loss: -1.7792, Train Autoencoder Loss: 0.0081, Train Classification Loss: 0.1826, Train Accuracy: 93.80%, Test Loss: 0.5247, Test Accuracy: 85.98%


                                                                                                                                                 

Epoch [13/20], Train Loss: -1.8009, Train Autoencoder Loss: 0.0077, Train Classification Loss: 0.1640, Train Accuracy: 94.44%, Test Loss: 0.5310, Test Accuracy: 85.99%


                                                                                                                                                 

Epoch [14/20], Train Loss: -1.8147, Train Autoencoder Loss: 0.0074, Train Classification Loss: 0.1542, Train Accuracy: 94.81%, Test Loss: 0.5147, Test Accuracy: 86.62%


                                                                                                                                                 

Epoch [15/20], Train Loss: -1.8332, Train Autoencoder Loss: 0.0070, Train Classification Loss: 0.1361, Train Accuracy: 95.39%, Test Loss: 0.5386, Test Accuracy: 86.68%


                                                                                                                                                 

Epoch [16/20], Train Loss: -1.8348, Train Autoencoder Loss: 0.0069, Train Classification Loss: 0.1313, Train Accuracy: 95.59%, Test Loss: 0.5481, Test Accuracy: 86.82%


                                                                                                                                                 

Epoch [17/20], Train Loss: -1.8494, Train Autoencoder Loss: 0.0066, Train Classification Loss: 0.1209, Train Accuracy: 95.96%, Test Loss: 0.5310, Test Accuracy: 86.78%


                                                                                                                                                  

Epoch [18/20], Train Loss: -1.8629, Train Autoencoder Loss: 0.0064, Train Classification Loss: 0.1076, Train Accuracy: 96.50%, Test Loss: 0.5440, Test Accuracy: 87.29%


                                                                                                                                                 

Epoch [19/20], Train Loss: -1.8564, Train Autoencoder Loss: 0.0062, Train Classification Loss: 0.1152, Train Accuracy: 96.23%, Test Loss: 0.5691, Test Accuracy: 86.82%


                                                                                                                                                  

Epoch [20/20], Train Loss: -1.8779, Train Autoencoder Loss: 0.0059, Train Classification Loss: 0.0944, Train Accuracy: 96.85%, Test Loss: 0.5471, Test Accuracy: 87.15%
Training fold 2...


                                                                                                                                                 

Epoch [1/20], Train Loss: -0.2067, Train Autoencoder Loss: 0.0214, Train Classification Loss: 1.3420, Train Accuracy: 64.14%, Test Loss: 0.9528, Test Accuracy: 72.62%


                                                                                                                                               

Epoch [2/20], Train Loss: -1.1182, Train Autoencoder Loss: 0.0170, Train Classification Loss: 0.8115, Train Accuracy: 74.95%, Test Loss: 0.7546, Test Accuracy: 77.31%


                                                                                                                                               

Epoch [3/20], Train Loss: -1.3205, Train Autoencoder Loss: 0.0152, Train Classification Loss: 0.6219, Train Accuracy: 80.13%, Test Loss: 0.6750, Test Accuracy: 79.67%


Epoch 4/20 (training):  33%|███▎      | 156/466 [01:01<01:58,  2.61it/s, loss=-1.45, decoder_loss=0.014, classifier_loss=0.496, accuracy=83.4] 

In [None]:
# from torch.utils.data import random_split

# dataset = DDIDataset(df_drug, extraction)
# train_size = int(0.8 * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# model = AdversarialAutoencoder()
# decoder_criterion = nn.MSELoss() 
# adversarial_criterion = nn.MSELoss()
# classifier_criterion = nn.CrossEntropyLoss() 
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# num_epochs = 100

# train_adversarial_autoencoder(model, train_loader, test_loader, decoder_criterion, classifier_criterion, adversarial_criterion, optimizer, num_epochs)