In [None]:
import csv
import sqlite3
import time
import numpy as np
import pandas as pd
import os
import pickle
from pandas import DataFrame
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

import torch
from torch import nn

In [None]:
def prepare_data(df_drug, feature_list, df_interaction):
    """Build per-drug feature arrays and write one sample file per interaction.

    Parameters
    ----------
    df_drug : pandas.DataFrame
        Drug table. Must provide a ``name`` column plus one column for every
        label obtained by iterating ``feature_list``.
    feature_list : iterable of str
        Column labels of ``df_drug`` to convert into similarity matrices.
        Iterating a DataFrame yields its column labels, so a column subset of
        ``df_drug`` is accepted as well.
    df_interaction : pandas.DataFrame
        Interaction table with ``mechanism``, ``action``, ``drugA`` and
        ``drugB`` columns. It is left untouched.

    Returns
    -------
    None
        All output is written to disk by ``DDE``.
    """
    # Derive a new frame instead of adding/dropping columns on the caller's
    # object: mutating it in place made a second run of this function fail
    # with ``KeyError: 'mechanism'``.
    interactions = df_interaction.assign(
        event=df_interaction['mechanism'] + ' ' + df_interaction['action']
    ).drop(columns=['mechanism', 'action'])

    # One similarity matrix per feature, stacked along a new leading axis.
    drug_names = np.array(df_drug['name']).tolist()
    feature_vectors = np.array([create_feature_vector(feature, df_drug) for feature in feature_list])

    # For drug number ``index`` keep its column from every similarity matrix.
    drug_features = {name: feature_vectors[:, :, index] for index, name in enumerate(drug_names)}

    # Generate the on-disk dataset.
    DDE(drug_features, interactions)

def DDE(drug_features, df_interaction):
    """Save the stacked feature arrays of every interacting drug pair.

    For each row of ``df_interaction`` the arrays stored in ``drug_features``
    under the row's ``drugA`` and ``drugB`` values are concatenated along
    axis 0 and written to ``ddset/<event>/<row index>.npy`` (relative to the
    current working directory). The ``ddset/<event>`` folder is created when
    it does not exist yet.
    """
    row_labels = df_interaction.index
    event_column = df_interaction['event']
    first_column = df_interaction['drugA']
    second_column = df_interaction['drugB']

    for row_label, event_name, name_a, name_b in zip(row_labels, event_column, first_column, second_column):
        # Look both drugs up first so a missing drug fails before any folder is made.
        pair_array = np.concatenate((drug_features[name_a], drug_features[name_b]), axis=0)

        # One folder per event; the file name is the row's index label.
        os.makedirs('ddset/' + event_name, exist_ok=True)
        np.save(f'ddset/{event_name}/{row_label}.npy', pair_array)


def create_feature_vector(feature_name, df):
    """Return the pairwise Jaccard similarity of drugs for one feature column.

    Every cell of ``df[feature_name]`` is expected to be a ``'|'``-separated
    string of feature tokens. The tokens are one-hot encoded per row and the
    Jaccard similarity ``|A & B| / |A | B|`` is computed for every pair of rows.
    No dimensionality reduction (PCA) is applied here.

    Parameters
    ----------
    feature_name : str
        Column of ``df`` holding the ``'|'``-separated tokens.
    df : pandas.DataFrame
        Table with one row per drug.

    Returns
    -------
    numpy.matrix
        ``(n_rows, n_rows)`` similarity matrix; entry ``(i, j)`` compares row
        ``i`` with row ``j``.
    """
    def jaccard_similarity(matrix):
        """Jaccard similarity between all pairs of rows of a 0/1 matrix."""
        binary = np.asarray(matrix, dtype=float)
        intersection = binary @ binary.T
        set_sizes = binary.sum(axis=1)
        # |A| + |B| - |A & B| for every pair, via broadcasting.
        union = set_sizes[:, None] + set_sizes[None, :] - intersection
        return intersection / union

    token_sets = [cell.split('|') for cell in np.array(df[feature_name]).tolist()]

    # Map each distinct token to a column, in first-seen order. A dict gives
    # O(1) membership instead of scanning a list for every token.
    column_of = {}
    for tokens in token_sets:
        for token in tokens:
            column_of.setdefault(token, len(column_of))

    # Fill the binary matrix directly. The previous chained
    # ``frame[col].iloc[i] = 1`` assignment writes to a temporary under pandas
    # Copy-on-Write and would leave the matrix all zeros.
    feature_matrix = np.zeros((len(token_sets), len(column_of)), dtype=float)
    for row, tokens in enumerate(token_sets):
        feature_matrix[row, [column_of[token] for token in tokens]] = 1

    # ``np.mat`` no longer exists in NumPy 2.0; ``np.asmatrix`` keeps the
    # original ``numpy.matrix`` return type for existing callers.
    return np.asmatrix(jaccard_similarity(feature_matrix))

In [None]:
# Load the drug table and the drug-drug interaction table from the Kaggle input dataset.
df_drug = pd.read_csv('/kaggle/input/ddi-data-sets/drug_information_1258.csv')
df_event = pd.read_csv('/kaggle/input/ddi-data-sets/drug_interaction.csv')
# Column subset of df_drug (a DataFrame, not a list). prepare_data only iterates
# this object, and iterating a DataFrame yields its column labels.
feature_list = df_drug[["target", "enzyme","smile"]]

In [None]:
# Build the per-interaction .npy files on disk (slow: one file per interaction row).
prepare_data(df_drug=df_drug, feature_list=feature_list, df_interaction=df_event)

In [None]:
import torch
import torch.nn as nn
from torchvision import transforms, datasets, models
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from tqdm import tqdm

In [None]:
class NpyFolder(Dataset):
    """Dataset of ``.npy`` arrays stored as ``root/<class name>/<file>.npy``.

    Each sub-directory of ``root`` is one class; class indices follow the
    sorted directory names. Every ``.npy`` file inside a class directory is
    one sample.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, root):
        """Index all ``.npy`` files below ``root`` without loading them.

        Parameters
        ----------
        root : str
            Directory containing one sub-directory per class.
        """
        self.root = root
        # Only directories are classes. A stray file in ``root`` would
        # otherwise be treated as a class and crash the listdir call below.
        self.classes = sorted(
            entry for entry in os.listdir(root)
            if os.path.isdir(os.path.join(root, entry))
        )
        # class name -> integer label
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        # list of (file path, integer label)
        self.samples = []
        for cls in self.classes:
            cls_dir = os.path.join(root, cls)
            # os.listdir order is arbitrary; sort so sample indices are
            # stable from one run to the next.
            for fname in sorted(os.listdir(cls_dir)):
                if fname.endswith('.npy'):
                    path = os.path.join(cls_dir, fname)
                    self.samples.append((path, self.class_to_idx[cls]))

    def __getitem__(self, index):
        """Load sample ``index`` from disk and return ``(array, label)``."""
        path, target = self.samples[index]
        data = np.load(path)
        return data, target

    def __len__(self):
        """Number of indexed ``.npy`` files."""
        return len(self.samples)

# Index the generated sample files and wrap them in a shuffling loader.
batch_size = 32
dataset = NpyFolder(root='/kaggle/working/ddset')
data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [None]:
# dataset.classes holds the sorted entries found in the dataset root, one per class.
num_classes = len(dataset.classes)
print(f'Number of classes: {num_classes}')

In [None]:
class DCFNN(nn.Module):
    """Two convolutions followed by a three-layer classifier head.

    The forward pass expects a 4-D input ``(batch, 1, 6, in_features)``:
    ``conv1`` (kernel and stride ``(3, 1)``) reduces the 6 rows to 2 with 3
    channels, and ``conv2`` (kernel ``(3, 2, 1)``) collapses those 3 channels
    and 2 rows, leaving exactly ``in_features`` values per sample for the
    linear layers.

    Parameters
    ----------
    in_features : int, default 1258
        Size of the last input dimension, which is also the input width of
        the first linear layer.
    num_classes : int, default 100
        Number of output classes.
    """

    def __init__(self, in_features=1258, num_classes=100):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=(3, 1), stride=(3, 1))
        self.dropout1 = nn.Dropout2d(p=0.5)
        self.conv2 = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(3, 2, 1), stride=(1, 1, 1))
        self.dropout2 = nn.Dropout3d(p=0.5)
        self.fnn1 = nn.Linear(in_features=in_features, out_features=512)
        self.fnn2 = nn.Linear(in_features=512, out_features=256)
        self.fnn3 = nn.Linear(in_features=256, out_features=num_classes)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, num_classes)``."""
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        # Treat conv1's channels as a depth axis so Conv3d can mix them.
        x = x.unsqueeze(1)
        x = self.conv2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        # Flatten everything except the batch dimension.
        x = x.flatten(start_dim=1)
        x = self.fnn1(x)
        x = torch.relu(x)
        x = self.fnn2(x)
        x = torch.relu(x)
        x = self.fnn3(x)
        x = torch.log_softmax(x, dim=1)
        return x

In [None]:
from tqdm import tqdm
from sklearn.model_selection import KFold

def to_device(obj):
    """Move ``obj`` to the GPU when CUDA is available, otherwise to the CPU.

    ``obj`` must provide a ``.to(device)`` method; whatever that call returns
    is handed back to the caller.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return obj.to(torch.device(backend))

def train_kfold(model, data_loader, k=5, epochs=20, batch_size=64, lr=0.001):
    """Train and validate ``model`` with k-fold cross-validation.

    Every fold starts from the weights ``model`` had when this function was
    called and uses a fresh Adam optimizer, so no fold is validated on data
    the model has already been trained on. After each fold the weights are
    saved to ``model_fold_<n>.pt``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; it is moved to the device chosen by ``to_device``.
    data_loader : torch.utils.data.DataLoader
        Only ``data_loader.dataset`` is used; per-fold loaders are built here.
    k : int, default 5
        Number of folds.
    epochs : int, default 20
        Training epochs per fold.
    batch_size : int, default 64
        Batch size of the per-fold train/validation loaders.
    lr : float, default 0.001
        Adam learning rate.

    Returns
    -------
    (list of float, list of float)
        Validation loss and validation accuracy, one entry per epoch of every
        fold, in chronological order.
    """
    import copy  # local import: only needed to snapshot the initial weights

    dataset = data_loader.dataset
    # Shuffle before splitting: consecutive dataset indices may all belong to
    # the same class, which would give folds with disjoint label sets.
    kfold = KFold(n_splits=k, shuffle=True)
    losses = []
    accuracies = []
    # CrossEntropyLoss applies log-softmax internally. Log-softmax is
    # idempotent, so models that already emit log-probabilities work too.
    criterion = nn.CrossEntropyLoss()

    # Move model to device
    model = to_device(model)
    # state_dict() returns references to the live tensors, hence the deep copy.
    initial_state = copy.deepcopy(model.state_dict())

    for fold, (train_idx, val_idx) in enumerate(kfold.split(np.arange(len(dataset)))):
        print(f'Fold {fold + 1}')

        # Reset weights and optimizer state so folds are independent.
        model.load_state_dict(initial_state)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_subsampler)
        n_val = len(val_idx)

        for epoch in range(epochs):
            # Training. Re-enter train mode every epoch, because the
            # validation pass below switches the model to eval mode.
            model.train()
            for inputs, labels in tqdm(train_loader):
                optimizer.zero_grad()
                inputs = to_device(inputs.float()).unsqueeze(1)  # add channel axis
                labels = to_device(labels)
                loss = criterion(model(inputs), labels)
                loss.backward()
                optimizer.step()

            # Validation
            model.eval()
            val_loss = 0.0
            val_correct = 0
            with torch.no_grad():
                for inputs, labels in tqdm(val_loader):
                    inputs = to_device(inputs.float()).unsqueeze(1)
                    labels = to_device(labels)
                    outputs = model(inputs)
                    # criterion returns the batch mean; weight it by the batch
                    # size so the final figure is a per-sample average.
                    val_loss += criterion(outputs, labels).item() * labels.size(0)
                    preds = outputs.argmax(dim=1)
                    val_correct += (preds == labels).sum().item()

            # Normalise by the validation subset size. The loader's
            # ``.dataset`` is the full dataset, not the sampled subset.
            avg_loss = val_loss / n_val
            accuracy = val_correct / n_val

            losses.append(avg_loss)
            accuracies.append(accuracy)

            print(f'Epoch {epoch + 1}, Average loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

        # Save model weights
        torch.save(model.state_dict(), f'model_fold_{fold + 1}.pt')

    return losses, accuracies

In [None]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def to_device(obj):
    """Return ``obj.to(device)`` where device is CUDA if available, else CPU.

    Behaves the same as the ``to_device`` defined in the k-fold training cell;
    it is declared again here so this cell can be run on its own.
    """
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    return obj.to(device)

def train(model, data_loader, epochs=100, batch_size=64, lr=0.0001, test_size=0.2):
    """Train ``model`` on a random train/validation split and save its weights.

    The model is expected to output log-probabilities, because the loss is
    ``nn.NLLLoss``. After the last epoch the weights are written to
    ``model.pt``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; it is moved to the device chosen by ``to_device``.
    data_loader : torch.utils.data.DataLoader
        Only ``data_loader.dataset`` is used; train/validation loaders are
        built here.
    epochs : int, default 100
        Number of training epochs.
    batch_size : int, default 64
        Batch size of the train/validation loaders.
    lr : float, default 0.0001
        Adam learning rate.
    test_size : float, default 0.2
        Fraction of the samples held out for validation.

    Returns
    -------
    (list of float, list of float)
        Validation loss and validation accuracy, one entry per epoch.
    """
    losses = []
    accuracies = []
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()

    # Move model to device
    model = to_device(model)

    # Split data into train and validation sets
    dataset = data_loader.dataset
    train_idx, val_idx = train_test_split(range(len(dataset)), test_size=test_size)
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_subsampler)
    val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_subsampler)
    n_val = len(val_idx)

    for epoch in range(epochs):
        # Training. Re-enter train mode every epoch, because the validation
        # pass below switches the model to eval mode (which disables dropout).
        model.train()
        for inputs, labels in tqdm(train_loader):
            optimizer.zero_grad()
            inputs = to_device(inputs.float()).unsqueeze(1)  # add channel axis
            labels = to_device(labels)
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader):
                inputs = to_device(inputs.float()).unsqueeze(1)
                labels = to_device(labels)
                outputs = model(inputs)
                # criterion returns the batch mean; weight it by the batch
                # size so the final figure is a per-sample average.
                val_loss += criterion(outputs, labels).item() * labels.size(0)
                preds = outputs.argmax(dim=1)
                val_correct += (preds == labels).sum().item()

        # Normalise by the validation subset size. The loader's ``.dataset``
        # is the full dataset, not the sampled subset.
        avg_loss = val_loss / n_val
        accuracy = val_correct / n_val

        losses.append(avg_loss)
        accuracies.append(accuracy)

        print(f'Epoch {epoch + 1}, Average loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Save model weights
    torch.save(model.state_dict(), 'model.pt')

    return losses, accuracies

In [None]:
# Instantiate the network with its default sizes and run the single-split training.
# The call is the cell's last expression, so the returned (losses, accuracies) is displayed.
dcfnn = DCFNN()
train(model=dcfnn, data_loader=data_loader)