## Load HIV dataset

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import re
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [None]:
# Read the HIV dataset (HIV.csv is resolved relative to the working directory)
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer="HIV.csv")
df.head(n=5)

In [None]:
# Look at the first SMILES string and at its fourth character.
string = df.loc[0, 'smiles']
print(string)
print(string[3])

In [None]:
# Number of rows per value of the 'activity' column.
df.value_counts(subset="activity")

In [None]:
# Number of rows per value of the 'HIV_active' column.
df.value_counts(subset="HIV_active")

## Transform into SELFIES representation

In [None]:
import selfies as sf

In [None]:
# Encode every SMILES string as SELFIES.
# The column is still called 'smiles' at this point.
df_sf = df.copy()
encoded = []     # new column values, one per row
len_sf = []      # SELFIES symbol counts of the successfully encoded rows only
failed_idx = []  # positional row numbers whose SMILES could not be encoded

for pos, smiles in enumerate(df['smiles']):
    try:
        selfies_str = sf.encoder(smiles)
    except sf.EncoderError:
        # Only encoder failures are tolerated; any other exception is a real
        # problem and is allowed to surface. The row keeps its SMILES text so
        # the frame stays aligned with df, but it is recorded in failed_idx and
        # left out of the length statistics (a SMILES string has no meaningful
        # SELFIES length).
        failed_idx.append(pos)
        encoded.append(smiles)
        continue
    encoded.append(selfies_str)
    len_sf.append(sf.len_selfies(selfies_str))

# Assign the whole column at once instead of one .loc write per row.
df_sf['smiles'] = encoded
print(f"SELFIES encoding failed for {len(failed_idx)} of {len(df_sf)} molecules")

In [None]:
# The column now holds SELFIES strings, so name it accordingly.
df_sf = df_sf.rename(columns={'smiles': 'selfies'})
df_sf.head()

In [None]:
# Show the first molecule in its SELFIES form.
string_sf = df_sf.loc[0, 'selfies']
print(string_sf)

In [None]:
# Mean and standard deviation (numpy default, ddof=0) of the SELFIES lengths.
len_arr = np.asarray(len_sf)
print(len_arr.mean())
print(len_arr.std())

In [None]:
# Histogram of SELFIES lengths with exactly one bar per integer length.
# Edges are placed on half-integers (min - 0.5, ..., max + 0.5). With integer
# edges ending at max, matplotlib's closed last bin would merge the counts of
# max - 1 and max, and a single distinct length would yield only one edge.
bin_edges = np.arange(min(len_sf), max(len_sf) + 2) - 0.5

plt.figure(figsize=(12, 6))
plt.hist(len_sf, bins=bin_edges, alpha=0.75, edgecolor='black')
plt.title('Histogram of SELFIES String Lengths')
plt.xlabel('Length of SELFIES String')
plt.ylabel('Count')
plt.grid(axis='y', alpha=0.75)
plt.show()

## Padding, tokenization, and embedding

In [None]:
def split_selfies(string_sf):
    """Split a SELFIES string into its bracketed symbols.

    Returns, in order of appearance, every shortest ``[...]`` span found in
    ``string_sf``; characters outside brackets are ignored.
    """
    return [match.group(0) for match in re.finditer(r'\[.*?]', string_sf)]

'''
# Example usage:
string_sf = df_sf.loc[0, 'selfies']
selfies_list = split_selfies(string_sf)
print(selfies_list)
'''

def pad_start_end_token(dataset):
    """
    Split each SELFIES string into symbols and wrap it in "[SOS]" / "[EOS]".

    Parameters
    ----------
    dataset: DataFrame or mapping with a 'selfies' entry
        ``dataset['selfies']`` is iterated and must yield SELFIES strings.

    Returns
    ------
    padded: list of list of str
        One symbol list per input string, in input order.
        Example: [['[SOS]', '[C]', '[EOS]'], ...]
    """
    # Read from the argument itself. The previous version indexed the global
    # df_sf by position, so any other dataset was silently ignored.
    return [
        ["[SOS]"] + split_selfies(selfies) + ["[EOS]"]
        for selfies in dataset['selfies']
    ]

In [None]:
padded_sf = pad_start_end_token(df_sf)

# Build the symbol -> index vocabulary.
# Index 0 is reserved for a dedicated padding symbol, so that sequences padded
# with 0 cannot be confused with a real SELFIES symbol. Real symbols are
# numbered from 1 in sorted order.
PAD_TOKEN = "[PAD]"
symbols = sorted({symbol for sequence in padded_sf for symbol in sequence} - {PAD_TOKEN})
vocab = {PAD_TOKEN: 0}
vocab.update({symbol: idx for idx, symbol in enumerate(symbols, start=1)})
vocab

In [None]:
def tokenize_selfies(selfies_list, vocab):
    """Map SELFIES symbols to their integer indices in ``vocab``.

    Symbols that are not keys of ``vocab`` are skipped. The result is a list
    of vocabulary indices (an index encoding, not one-hot vectors).
    """
    indices = []
    for element in selfies_list:
        if element in vocab:
            indices.append(vocab[element])
    return indices



class SelfiesEmbedding(nn.Module):
    """Learnable lookup table from token indices to dense vectors.

    Wraps a single ``nn.Embedding`` with ``vocab_size`` rows of size
    ``embedding_dim``, stored as the ``embedding`` attribute.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        """Return the embedding vectors for the index tensor ``x``."""
        embedded = self.embedding(x)
        return embedded

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Turn every symbol list into a list of vocabulary indices.
tokenized_selfies = [tokenize_selfies(selfies, vocab) for selfies in padded_sf]

# Choose a padding index that can never collide with a real symbol: use the
# vocabulary's "[PAD]" entry when it has one, otherwise the first index past
# the vocabulary. Padding with a real symbol's index (e.g. a hard-coded 0)
# would make padded positions indistinguishable from that symbol.
pad_idx = vocab.get("[PAD]", len(vocab))

# Instantiate the embedding layer; it must have a row for pad_idx as well.
vocab_size = max(len(vocab), pad_idx + 1)
embedding_dim = 64  # size of each embedding vector (free choice)
selfies_embedding_layer = SelfiesEmbedding(vocab_size, embedding_dim)

# Pad all sequences to the length of the longest one so they form one tensor.
# dtype is set explicitly because an empty token list would otherwise become a
# float tensor, which nn.Embedding rejects.
padded_tokenized_selfies = pad_sequence(
    [torch.tensor(ts, dtype=torch.long) for ts in tokenized_selfies],
    batch_first=True,
    padding_value=pad_idx,
)

# Embeddings for the whole batch: (num_molecules, max_len, embedding_dim).
embedded_selfies = selfies_embedding_layer(padded_tokenized_selfies)


In [None]:
# Number of tokenized molecules, then the shape of the embedded batch.
print(len(tokenized_selfies), embedded_selfies.shape, sep="\n")

The shape `[41127, 399, 64]` of `embedded_selfies` means: <br>
41127: the number of molecules in the dataset <br>
399: the length of the longest tokenized SELFIES sequence (including the `[SOS]` and `[EOS]` tokens); after `pad_sequence`, every sequence is padded to this length <br>
64: the embedding dimension (`embedding_dim` in the cell above) <br>

## Transform Selfies to Graph

In [None]:
import torch
from torch_geometric.data import Data

def create_graph_data_object(tokenized_selfies, embedded_selfies):
    """
    Creates a path-graph data object for each molecule.

    Each token of a molecule becomes one node, and consecutive tokens are
    connected in both directions.

    Parameters:
    - tokenized_selfies: List of lists, where each sublist is a sequence of token indices for a molecule.
    - embedded_selfies: Tensor indexed as [molecule, position, feature] holding the embedding of
      every (possibly padded) position of every molecule.

    Returns:
    - List of Data objects, one for each molecule, with `x` of shape
      (num_tokens, feature_dim) and `edge_index` of shape (2, 2 * (num_tokens - 1)).
    """
    graph_data_list = []

    for i, token_list in enumerate(tokenized_selfies):
        num_nodes = len(token_list)

        # Keep only the rows of real tokens. The embedding tensor is padded to
        # a common length, and padded rows would otherwise become extra,
        # unconnected nodes.
        node_features = embedded_selfies[i, :num_nodes]

        # Connect each node to the next one; the reverse edge directly follows
        # its forward edge so the graph is undirected.
        edge_pairs = []
        for j in range(num_nodes - 1):
            edge_pairs.append([j, j + 1])
            edge_pairs.append([j + 1, j])

        if edge_pairs:
            edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
        else:
            # Zero or one node: keep the (2, num_edges) layout instead of the
            # shape-(0,) tensor that torch.tensor([]) would produce.
            edge_index = torch.empty((2, 0), dtype=torch.long)

        # Create a graph data object
        graph_data_list.append(Data(x=node_features, edge_index=edge_index))

    return graph_data_list

# Build one graph per molecule; embedded_selfies is expected to hold one row of
# (padded) token embeddings per molecule.
graph_data_list = create_graph_data_object(tokenized_selfies, embedded_selfies)

# Show the summary of the first graph object.
first_graph = graph_data_list[0]
print(first_graph)


In [3]:
import networkx as nx

def plot_graph(data):
    """
    Plot a graph from a PyTorch Geometric Data object.

    Parameters:
    - data: PyTorch Geometric Data object; only its `edge_index` is used.
      Nodes without any edge are not drawn.
    """
    # reshape(2, -1) keeps an empty edge_index usable as a (2, 0) array.
    edge_index = data.edge_index.cpu().numpy().reshape(2, -1)

    # nx.Graph stores every undirected edge once, so all entries can be added
    # as they are. Skipping every second column would only be correct if
    # forward and reverse edges were stored as adjacent pairs.
    G = nx.Graph()
    G.add_edges_from(zip(edge_index[0].tolist(), edge_index[1].tolist()))

    # Draw the graph
    plt.figure(figsize=(8, 8))
    nx.draw(G, with_labels=True, node_color='skyblue', node_size=700, edge_color='#FF5733', font_size=15, font_weight='bold')
    plt.show()

# Example usage: plot the first graph object, if any graph was created.
if graph_data_list:
    plot_graph(graph_data_list[0])


NameError: name 'graph_data_list' is not defined

In [1]:
# Add an integer activity label to each graph object.
# torch.tensor cannot hold strings, so the 'CI' / 'CM' / 'CA' values of the
# 'activity' column are mapped to fixed class ids first.
ACTIVITY_TO_LABEL = {'CI': 0, 'CM': 1, 'CA': 2}

activity_labels = df_sf['activity'].map(ACTIVITY_TO_LABEL)
if activity_labels.isna().any():
    unknown = sorted(df_sf.loc[activity_labels.isna(), 'activity'].astype(str).unique())
    raise ValueError(f"Unexpected activity values: {unknown}")

# graph_data_list and df_sf are assumed to be aligned row by row; fail loudly
# if their lengths differ, since labels would otherwise be attached to the
# wrong molecules.
if len(graph_data_list) != len(activity_labels):
    raise ValueError("graph_data_list and df_sf have different lengths")

for data, label in zip(graph_data_list, activity_labels):
    data.y = torch.tensor([int(label)], dtype=torch.long)

NameError: name 'graph_data_list' is not defined

## GNN

In [32]:
# Draw a fixed-size random sample from each activity class.
SAMPLES_PER_CLASS = {'CI': 1000, 'CA': 300, 'CM': 700}

class_samples = [
    df_sf[df_sf['activity'] == label].sample(n=n_samples, random_state=42)
    for label, n_samples in SAMPLES_PER_CLASS.items()
]

# Combine the per-class samples into one DataFrame.
# The original row labels of df_sf are kept on purpose (no ignore_index=True):
# they are what links a sampled row to its entry in graph_data_list. Resetting
# the index to 0..N-1 would make index-based lookups return the first N graphs
# instead of the sampled ones.
sampled_df = pd.concat(class_samples)

# Check the sampling result.
print(sampled_df['activity'].value_counts())

activity
CI    1000
CM     700
CA     300
Name: count, dtype: int64


In [33]:
# Keep only the graph objects that belong to the sampled rows.
# NOTE(review): graph_data_list is indexed with sampled_df.index, so this picks
# the sampled molecules only if that index still holds each row's position in
# df_sf. Confirm the index was not reset when sampled_df was built.
sampled_positions = list(sampled_df.index)
filtered_graph_data_list = [graph_data_list[pos] for pos in sampled_positions]

# filtered_graph_data_list now holds the graphs selected through sampled_df.
print(f"Total graphs in the sampled dataset: {len(filtered_graph_data_list)}")


Total graphs in the sampled dataset: 2000
