# Setting up the Data

In [7]:
!pip install rdkit
!pip install dgllife
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html

Looking in links: https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html


In [11]:
# Sanity-check the environment: confirm the core packages import and print
# which torch build this kernel is using.
import rdkit
import torch
print(torch.__version__)

# NOTE(review): the recorded output of this cell shows a DGL graphbolt
# FileNotFoundError, presumably raised while importing dgllife -- confirm that
# the installed DGL wheel matches the torch version printed above.
import dgllife

2.4.1+cu121


FileNotFoundError: Cannot find DGL C++ graphbolt library at /home/tristan/Documents/Github/self/chem/myenv/lib/python3.12/site-packages/dgl/graphbolt/libgraphbolt_pytorch_2.4.1.so

In [None]:
import pandas as pd
import torch
import dgl
from dgl.nn.pytorch import GraphConv
from tensorflow.keras.utils import to_categorical

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Build the working dataframe: the 12 tox21 assay columns plus one smiles column.

# Attach the SMILES strings to the assay table (pandas aligns the assignment
# on the row index of the two frames).
df = pd.read_csv("../data/tox21_compoundData.csv")
df2 = pd.read_csv("../data/x_tr_smiles.csv.gz")
df["smiles"] = df2["SMILES"]

# Drop the columns that are not needed downstream. Reassigning instead of
# using inplace=True keeps the cell safe to re-run.
df = df.drop(columns=["ID", "inchikey", 'sdftitle', 'order', "CVfold", "set"])

# Put smiles last explicitly, so the layout does not depend on where the
# column happens to sit in the CSV.
cols = [col for col in df.columns if col != "smiles"] + ["smiles"]
df = df[cols]

# Rows without a SMILES string cannot be used; remove them and persist the
# cleaned table.
df = df.dropna(subset=["smiles"])
df.to_csv("../data/bioact-het-cleaned.csv", index=False)

# Examining the Data

In [None]:
# Show the cleaned dataframe as this cell's rich output.
df

In [None]:
# The task names are the first 12 column labels of the dataframe.
tasks = list(df.columns[:12])

In [None]:
# For every task, tally how many rows are labelled 1, labelled 0, or missing.
# The three lists are kept in the same order as `tasks`.
one, zero, nan = [], [], []
for task in tasks:
    labels = df[task]
    n_one = int((labels == 1).sum())
    n_zero = int((labels == 0).sum())
    n_nan = int(labels.isna().sum())
    one.append(n_one)
    zero.append(n_zero)
    nan.append(n_nan)
    print(f"{task:<15}: {n_one:5d} ones, {n_zero:5d} zeros, {n_nan:5d} nans")
    

In [None]:
# Grouped bar chart of the number of ones (active), zeros (inactive) and
# NaNs (no label) for each task.
import matplotlib.pyplot as plt
import numpy as np

X = np.arange(len(tasks))

fig, ax = plt.subplots(figsize=(20, 15))
# Three bars per task, side by side; the labels feed the legend directly.
ax.bar(X + 0.2, one, label='Active', color='g', width=0.2)
ax.bar(X + 0.4, zero, label='Inactive', color='b', width=0.2)
ax.bar(X + 0.6, nan, label='NA', color='r', width=0.2)

# Place each tick under the middle bar of its group (X + 0.4) so the task
# name lines up with the bars it describes instead of sitting left of them.
ax.set_xticks(X + 0.4)
ax.set_xticklabels(tasks, rotation=45, ha='right')
ax.set_xlabel('Tasks')
ax.set_ylabel('Count')
ax.set_title('Tox21 Assays')
ax.legend();

In [None]:
def create_dataset_with_gcn(ds, class_embed_vector, GCN, tasks):
    """Embed every graph in ``ds`` with ``GCN`` and pair it with its task info.

    Args:
        ds: Iterable of per-task datasets, ordered like ``tasks``. Every item
            of a dataset must unpack as ``(smiles, graph, label, mask)``.
        class_embed_vector: Indexable by task position; the entry for the
            current task is copied into each record unchanged.
        GCN: Callable invoked as ``GCN(graph, node_feats)`` that returns a
            tensor. NOTE(review): presumably already placed on ``device`` --
            confirm at the call site.
        tasks: Sequence of task names; its length sets the one-hot width.

    Returns:
        A list with one tuple per graph:
        ``(embedding, task_one_hot, class_embed_vector[t], label, tasks[t])``
        where ``embedding`` is a NumPy array and ``t`` is the task position.
    """
    created_data = []
    # One one-hot row per task, indexed by task position.
    onehot_encoded = to_categorical(np.arange(len(tasks)))

    # The embeddings are only exported as NumPy arrays, so there is no need to
    # record an autograd graph for the forward passes.
    with torch.no_grad():
        for task_idx, dataset in enumerate(ds):
            for _smiles, g, label, _mask in dataset:
                g = dgl.add_self_loop(g.to(device))
                # Node features are stored under 'h'; pop them off the
                # self-looped graph and pass them to the model explicitly.
                graph_feats = g.ndata.pop('h')
                embbed = GCN(g, graph_feats).to('cpu').detach().numpy()
                created_data.append((
                    embbed,
                    onehot_encoded[task_idx],
                    class_embed_vector[task_idx],
                    label,
                    tasks[task_idx],
                ))
    print('Data created!!')
    return created_data