In [1]:
from __future__ import division
from __future__ import print_function
import matplotlib; matplotlib.use('agg')
import numpy as np
from scipy.spatial.distance import pdist, squareform

import time
import argparse
import numpy as np
import math

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from utils import *
from models import *
from torch.utils.data import Dataset

from sklearn import metrics
from sklearn.utils import shuffle, resample
from sklearn.model_selection import train_test_split, KFold
import os

import matplotlib.pyplot as plt
from time import gmtime, strftime

from rdkit import DataStructs

# Seed passed to sklearn's shuffle() and train_test_split() so the splits are reproducible.
random_state = 11
# Fingerprint lengths (in bits) that the benchmark loop evaluates one after another.
nBits = [64, 128, 256, 512, 1024]
# NOTE(review): the four settings below are not read by any executable code visible
# in this notebook (batch_size only appears in commented-out DataLoader code);
# presumably they belong to neural-network training code elsewhere — confirm before removing.
batch_size = 100
num_epochs = 80
weight_decay = 0.0001  # presumably the L2 regularisation strength — confirm
learning_rate = 0.0005

In [2]:
def split_data(data, labels):
    """Split ``data``/``labels`` into stratified train, validation and test parts.

    Two successive ``train_test_split`` calls are made, both seeded with the
    notebook-level ``random_state`` and both stratified on the labels:

    1. 10% of all samples are held out as the test part.
    2. 10% of the remaining samples are held out as the validation part.

    Returns
    -------
    tuple
        ``((x_train, y_train), (x_val, y_val), (x_test, y_test))`` holding the
        raw split outputs of ``train_test_split`` (no DataLoader is built).
    """
    # First cut: test part versus everything that is still available for fitting.
    remaining_x, x_test, remaining_y, y_test = train_test_split(
        data,
        labels,
        test_size=0.1,
        random_state=random_state,
        stratify=labels,
    )

    # Second cut: carve the validation part out of what is left.
    x_train, x_val, y_train, y_val = train_test_split(
        remaining_x,
        remaining_y,
        test_size=0.1,
        random_state=random_state,
        stratify=remaining_y,
    )

    train_part = (x_train, y_train)
    val_part = (x_val, y_val)
    test_part = (x_test, y_test)
    return (train_part, val_part, test_part)

In [3]:
def test_model(dist_mat, train_size, train_labels, true_labels):
# We assume that the dist matrix is 4 sub matrices, train vs. train train vs. something else.
# So the first train_size X train_size entries are the similarity of the train vs. itself.
    
    sub_mat = dist_mat[train_size:, :train_size]
    correct = [0]*4
    total = dist_mat.shape[0] - train_size
    top_ks = [1, 5, 10, 30]
#     top_ks = np.array(top_ks) - 1
    for i, topk in enumerate(top_ks):
        nn = np.partition(sub_mat, topk, axis=1)
        nn = sub_mat < nn[:,topk].reshape(-1, 1)
        nn_labels = np.matmul(nn, train_labels)
        correct[i] = (np.multiply(nn_labels, true_labels) > 0).sum()
    return np.true_divide(correct,total).tolist()

In [4]:
# Load the 'small_batch_test' dataset. load_data is not defined in this notebook;
# presumably it is pulled in by the star imports of `utils` / `models` — confirm.
# Of the eight returned values, the cells visible here only use x_all and y_all.
x_all, y_all, target, sizes, mol_to_graph_transform, parameter_holder, edge_vocab, node_vocab = \
    load_data('small_batch_test')

Loading small_batch_test dataset...
Done.


In [5]:
# Nearest-neighbour baseline: for every fingerprint length, build Morgan
# fingerprints, split them, and print the top-k hit rates on the validation
# and test parts.
for nbits in nBits:

    # One bit-vector fingerprint per record; the molecule is parsed from the
    # last field of each x_all entry (passed to MolFromInchi).
    fps = []
    for mol_dat in x_all:
        mol = MolFromInchi(mol_dat[-1])
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits))
    len_train = len(fps)
    labels = y_all

    # Convert each RDKit bit vector into an int8 NumPy row. The buffer is
    # expected to be resized by ConvertToNumpyArray to the fingerprint length
    # (the reshape below relies on every row contributing nbits entries).
    fp_rows = []
    for fp in fps:
        arr = np.zeros((1,), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fp_rows.append(arr)
    np_fps = np.concatenate(fp_rows, axis=0).reshape(-1, nbits)

    np_fps, labels = shuffle(np_fps, labels, random_state=random_state)
    train_loader, validation_loader, test_loader = split_data(np_fps, labels)
    train_x, train_y = train_loader
    n_train = train_x.shape[0]

    # Training rows go first so that test_model can slice out the
    # evaluation-vs-train block.
    # NOTE(review): pdist also computes all train-vs-train distances, which
    # test_model never reads; a rectangular distance computation would need
    # far less memory but requires a different matrix layout.
    train_val = squareform(
        pdist(np.concatenate((train_x, validation_loader[0])), 'jaccard'))
    train_test = squareform(
        pdist(np.concatenate((train_x, test_loader[0])), 'jaccard'))

    print(nbits)
    print(test_model(train_val, n_train, train_y, validation_loader[1]))
    print(test_model(train_test, n_train, train_y, test_loader[1]))

64
[0.6714120815698346, 0.8410927279722971, 0.8518661023470565, 0.8726433243555214]
[0.6692067890543817, 0.8330446830620021, 0.8514028403186699, 0.8708001385521302]
128
[0.7325894574836476, 0.868410927279723, 0.8811081185071181, 0.8976529434397845]
[0.7235885001731902, 0.8638725320401801, 0.8791132663664704, 0.8957395219951507]
256
[0.7387456714120816, 0.8745671412081569, 0.8895729126587149, 0.9118891881492882]
[0.7346726705923103, 0.8714928992033253, 0.885001731901628, 0.9037062694838933]
1024
[0.7391304347826086, 0.8803385917660639, 0.893035782993459, 0.9157368218545594]
[0.7357118115691029, 0.8756494631104953, 0.8888119154832006, 0.9078628333910634]


In [None]:
# Rebuild both distance matrices (training rows first) from train_loader,
# validation_loader and test_loader. Those names are only assigned inside the
# benchmark loop, so on a top-to-bottom run this cell sees the splits of the
# last fingerprint length processed there.
train_val = squareform(
    pdist(np.concatenate((train_loader[0], validation_loader[0])), 'jaccard'))

train_test = squareform(
    pdist(np.concatenate((train_loader[0], test_loader[0])), 'jaccard'))

In [None]:
# Top-k hit rates on the validation part; train_val must hold the distance
# matrix built from the training rows followed by the validation rows.
test_model(train_val, train_loader[0].shape[0], train_loader[1], validation_loader[1])

In [None]:
# Top-k hit rates on the test part; train_test must hold the distance
# matrix built from the training rows followed by the test rows.
test_model(train_test, train_loader[0].shape[0], train_loader[1], test_loader[1])