In [1]:
import gensim
import os
import collections
import smart_open
import random
import multiprocessing
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import numpy as np
from sklearn import svm
from sklearn.utils import shuffle
from keras.callbacks import ModelCheckpoint
import time
import re


# Imports
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#global variabls

directory_path =  "D:/MLOntology/NCIt/"
data_path = directory_path + "data/"
vector_model_path = directory_path +"vectorModel/"
cnn_model_path = directory_path +"cnnModel/"

In [3]:
def get_trailing_number(s):
    m = re.search(r'\d+$', s)
    return m.group() if m else None

In [4]:
#read class label file
#create mapping from id to labels  
#iso-8859-1 , encoding="iso-8859-1"
conceptLabelDict={}
errors=[]

def read_label(fname):
    with smart_open.smart_open(fname) as f:
        for i, line in enumerate(f):
            #get the id for each concept paragraph
            splitted = line.decode("iso-8859-1").split("\t")
            if len(splitted)==3:
                conceptID = get_trailing_number(splitted[1])
                conceptLabelDict[conceptID] = splitted[2].replace("\r\n", "")
            else:
                errors.append(splitted)

label_file = data_path + "ontClassLabels_owl_ncit.txt"
read_label(label_file)
print(conceptLabelDict["4863"])
print(conceptLabelDict["115117"])
print(errors)

prostate carcinoma
stage ia esophageal cancer ajcc v7
[]


In [5]:
# PV-DM seems better??
vector_model_file = vector_model_path + "model1"

vector_model = gensim.models.Doc2Vec.load(vector_model_file)

inferred_vector = vector_model.infer_vector(['congenital', 'prolong', 'rupture', 'premature', 'membrane', 'lung'])
pprint(vector_model.docvecs.most_similar([inferred_vector], topn=10))

[('3801', 0.576964795589447),
 ('7387', 0.534248948097229),
 ('7158', 0.5248719453811646),
 ('7087', 0.5207959413528442),
 ('66803', 0.516688883304596),
 ('6409', 0.5089408159255981),
 ('8502', 0.5051150321960449),
 ('136619', 0.4978005290031433),
 ('3191', 0.49357178807258606),
 ('3087', 0.4708564281463623)]


In [6]:
def getVectorFromModel(concept_id, conceptLabelDict, model):
    if concept_id in model.docvecs:
        concept_vector= model.docvecs[concept_id]
    else:
        print("%s not found, get inferred vector "%(concept_id))
        concept_label = conceptLabelDict[concept_id]
        concept_vector= model.infer_vector(concept_label.split())
    return concept_vector

def getVector(line, conceptLabelDict, model):        
    a = getVectorFromModel(line[0], conceptLabelDict, model)
    b = getVectorFromModel(line[1], conceptLabelDict, model)
    c = np.array((a, b))
    c = c.T 
#     c = np.expand_dims(c, axis=2)
#     print(c.shape)
    return c

In [7]:
n_channels=2 

def get_batches(x_samples, y_samples, batch_size=64):
    samples = list(zip(x_samples, y_samples))
    num_samples = len(samples)
    
    shuffle(samples)
    for offset in range(0, num_samples, batch_size):
        batch_samples = samples[offset:offset+batch_size]

        X_samples = []
        Y_samples= []
        for batch_sample in batch_samples:
            pair_list = batch_sample[0]
            data_vector = getVector(pair_list, conceptLabelDict, vector_model)
#                 data_vector = stackVector(data_vector)
#                 print(data_vector.shape)
            X_samples.append(data_vector)
            class_label = batch_sample[1] 
            Y_samples.append(class_label)

        X_samples = np.array(X_samples).astype('float32')
        Y_samples = np.eye(n_channels)[Y_samples]
#             print('one batch ready')
        yield shuffle(X_samples, Y_samples)

In [15]:
conceptPairDict={}
errors=[]
conceptPairList=[]

def read_pair(fname):
    with smart_open.smart_open(fname) as f:
        for i, line in enumerate(f):
            #get the id for each concept paragraph
            splitted = line.decode("iso-8859-1").split("\t")
            if len(splitted)==3:
                childID = get_trailing_number(splitted[1])
                parentID = get_trailing_number(splitted[2].replace("\r\n", ""))
                conceptPairList.append([childID, parentID , 1])
#                 conceptPairDict[splitted[1]] = splitted[2].replace("\r\n", "")
            else:
                errors.append(splitted)

pair_file = data_path + "ontHierarchy_owl_ncit.txt"
read_pair(pair_file)

checkpairs = conceptPairList[10:15]
print(checkpairs)
print(len(conceptPairList))

[['4861', '7318', 1], ['87152', '87150', 1], ['87153', '140032', 1], ['87154', '87153', 1], ['87155', '87153', 1]]
16533


In [16]:
conceptNotPairDict={}
conceptNotPairList=[]

def read_not_pair(fname):
    with smart_open.smart_open(fname) as f:
        for i, line in enumerate(f):
            #get the id for each concept paragraph
            splitted = line.decode("iso-8859-1").split("\t")
            if len(splitted)==2:
                childID = get_trailing_number(splitted[0])
                notparentID = get_trailing_number(splitted[1].replace("\r\n", ""))
                conceptNotPairList.append([childID, notparentID, 0])
#                 conceptNotPairDict[splitted[1]] = splitted[2].replace("\r\n", "")
            else:
                errors.append(splitted)

notPair_file = data_path + "taxNotPairs_owl_ncit.txt"
read_not_pair(notPair_file)


first2pairs =conceptNotPairList[10:15]
print(first2pairs)
print(len(conceptNotPairList))

[['7918', '9151', 0], ['7918', '48612', 0], ['7918', '48613', 0], ['7918', '91231', 0], ['7918', '66753', 0]]
37147


In [32]:
testingPairList=[]

def read_test_pair(fname):
    with smart_open.smart_open(fname) as f:
        for i, line in enumerate(f):
            #get the id for each concept paragraph
            splitted = line.decode("iso-8859-1").split("\t")
            if len(splitted)==2:
                childID = get_trailing_number(splitted[0])
                notparentID = get_trailing_number(splitted[1].replace("\r\n", ""))
                assert childID in vector_model.docvecs, "%s not in vector model"%(childID)
                assert notparentID in vector_model.docvecs, "%s not in vector model"%(notparentID)
                testingPairList.append([childID, notparentID, 0])
#                 conceptNotPairDict[splitted[1]] = splitted[2].replace("\r\n", "")
            else:
                errors.append(splitted)

testingPair_file = data_path + "testing_owl_ncit.txt"
read_test_pair(testingPair_file)


first2pairs =testingPairList[10:15]
print(first2pairs)
print(len(testingPairList))

[['8784', '86026', 0], ['8784', '7917', 0], ['8784', '86034', 0], ['8784', '86033', 0], ['8784', '86053', 0]]
737409


In [33]:
def readFromPairList(id_pair_list, id_notPair_list):
    pair_list = id_pair_list + id_notPair_list
    random.shuffle(pair_list)
    idpairs_list =[]
    label_list =[]
    for i, line in enumerate(pair_list):      
        idpairs_list.append([line[0], line[1]])
        label_list.append(line[2])
    return idpairs_list, label_list

idpairs_list, label_list= readFromPairList([], testingPairList)

print(label_list[:20])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [37]:
test_acc = []
batch_size = 1000


import tensorflow as tf


with tf.Session() as sess:
    #First let's load meta graph and restore weights
    saver = tf.train.import_meta_graph('D:/MLOntology/checkpoints-cnn/har.ckpt.meta')
    # Restore
    saver.restore(sess, 'D:/MLOntology/checkpoints-cnn/har.ckpt')
    # Access the graph
    graph = tf.get_default_graph()
    #How to access saved variable/Tensor/placeholders 
    inputs_ = graph.get_tensor_by_name("inputs:0")
    labels_ = graph.get_tensor_by_name("labels:0")    
    keep_prob_ = graph.get_tensor_by_name("keep:0")
    # How to access saved operation
    accuracy = graph.get_tensor_by_name("accuracy:0")

#     saver.restore(sess, tf.train.latest_checkpoint('D:/MLOntology/checkpoints-cnn/'))

    
    for x_t, y_t in get_batches(idpairs_list, label_list, batch_size):
        feed = {inputs_: x_t,
                labels_: y_t,
                keep_prob_: 1}
        
        batch_acc = sess.run(accuracy, feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.6f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from D:/MLOntology/checkpoints-cnn/har.ckpt
Test accuracy: 0.685525


In [35]:
test_acc = []
batch_size = 1000


import tensorflow as tf


with tf.Session() as sess:
    #First let's load meta graph and restore weights
    saver = tf.train.import_meta_graph(cnn_model_path+'area/har.ckpt.meta')
    # Restore
    saver.restore(sess, cnn_model_path+'area/har.ckpt')
    # Access the graph
    graph = tf.get_default_graph()
    #How to access saved variable/Tensor/placeholders 
    inputs_ = graph.get_tensor_by_name("inputs:0")
    labels_ = graph.get_tensor_by_name("labels:0")    
    keep_prob_ = graph.get_tensor_by_name("keep:0")
    # How to access saved operation
    accuracy = graph.get_tensor_by_name("accuracy:0")

#     saver.restore(sess, tf.train.latest_checkpoint('D:/MLOntology/checkpoints-cnn/'))

    
    for x_t, y_t in get_batches(idpairs_list, label_list, batch_size):
        feed = {inputs_: x_t,
                labels_: y_t,
                keep_prob_: 1}
        
        batch_acc = sess.run(accuracy, feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.6f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from D:/MLOntology/NCIt/cnnModel/area/har.ckpt
Test accuracy: 0.126865


In [36]:
test_acc = []
batch_size = 1000


import tensorflow as tf


with tf.Session() as sess:
    #First let's load meta graph and restore weights
    saver = tf.train.import_meta_graph(cnn_model_path+'hier/har.ckpt.meta')
    # Restore
    saver.restore(sess, cnn_model_path+'hier/har.ckpt')
    # Access the graph
    graph = tf.get_default_graph()
    #How to access saved variable/Tensor/placeholders 
    inputs_ = graph.get_tensor_by_name("inputs:0")
    labels_ = graph.get_tensor_by_name("labels:0")    
    keep_prob_ = graph.get_tensor_by_name("keep:0")
    # How to access saved operation
    accuracy = graph.get_tensor_by_name("accuracy:0")

#     saver.restore(sess, tf.train.latest_checkpoint('D:/MLOntology/checkpoints-cnn/'))

    
    for x_t, y_t in get_batches(idpairs_list, label_list, batch_size):
        feed = {inputs_: x_t,
                labels_: y_t,
                keep_prob_: 1}
        
        batch_acc = sess.run(accuracy, feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.6f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from D:/MLOntology/NCIt/cnnModel/hier/har.ckpt
Test accuracy: 0.172135


In [39]:
import numpy as np
a = np.zeros((128,2))
print(a.shape)

(128, 2)


In [40]:
b= np.ones((128,2))
print(b.shape)


(128, 2)


In [42]:
c= np.concatenate((a,b),axis=1)
print(c.shape)

(128, 4)


In [43]:
print(c)

[[0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1. 1.]
 [0. 0. 1.