In [13]:
import json

def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object."""
    # JSON is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# The three dataset splits; later cells index each entry by its 'abstract' key.
data_train = _load_json('data_train.json')
data_test = _load_json('data_test.json')
data_valid = _load_json('data_valid.json')




In [23]:
# Split sentences of each abstract in the training data
# Will use the test data as documents to query among the training data

import nltk
# Pull the abstract text out of every training record ...
training_abstracts = [entry['abstract'] for entry in data_train]
# ... and break each abstract into its list of sentences.
training_split = list(map(nltk.sent_tokenize, training_abstracts))


In [32]:
# Used the NNLM (English, 128-dim, with normalization) text-embedding module from TensorFlow Hub to generate representations of abstracts
# Stopped running this block after 986 entries, hence the error below
# The variable 'all_embeddings' still works

import tensorflow as tf
import tensorflow_hub as hub 
import numpy as np

# Represent every training abstract as the mean of its sentence embeddings.
# all_embeddings[i] is the vector for training_split[i].
all_embeddings = []
module_url = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/1"
embed = hub.Module(module_url)

# Build the embedding op ONCE and feed sentences through a placeholder.
# Calling embed(...) inside the loop adds new ops to the graph on every
# iteration, which makes each step progressively slower.
sentences_ph = tf.placeholder(tf.string, shape=[None])
sentence_embeddings_op = embed(sentences_ph)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    for idx, sentences in enumerate(training_split):
        if not sentences:
            # An abstract with no sentences has no meaningful mean embedding;
            # fail loudly instead of storing NaNs that break the neighbour search.
            raise ValueError("training abstract %d contains no sentences" % idx)
        e_beddings = sess.run(sentence_embeddings_op,
                              feed_dict={sentences_ph: sentences})
        # Average over the sentence axis to get one vector per abstract.
        all_embeddings.append(np.mean(np.asarray(e_beddings), axis=0))

        
        


INFO:tensorflow:Initialize variable module_3/embeddings/part_0:0 from checkpoint b'/var/folders/gd/kmzrmlrj7_j8mj5r4ywnwsxh0000gn/T/tfhub_modules/2cdbcae1a547f4fb93475d4b7d4140f8590995b7/variables/variables' with embeddings


KeyboardInterrupt: 

In [33]:
# Number of training abstracts that were embedded before the run was interrupted
len(all_embeddings)

986

In [185]:
# There may be a pattern in the 'most similar' results for a query document
# This pattern may indicate whether or not a paper is novel
# Cluster the results of each document by Affinity Propagation
# Explore the number of clusters found 
# Loop over the first "n" entries of the test data set

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import AffinityPropagation

# For each of the first n test abstracts: embed it, fetch its 20 nearest
# training abstracts (cosine distance), cluster those neighbours with
# Affinity Propagation and record how many clusters were found.
num_clusters = []
n = 101  # number of test abstracts to examine

# The neighbour index depends only on the training embeddings, so fit it once
# instead of refitting on every iteration.
neighbors = NearestNeighbors(n_neighbors=20, algorithm="auto", metric='cosine').fit(all_embeddings)

# Build the embedding op once and feed sentences through a placeholder;
# calling embed(...) per query would keep adding ops to the graph.
query_sentences_ph = tf.placeholder(tf.string, shape=[None])
query_embedding_op = embed(query_sentences_ph)

# One session (and one initialisation) is enough for all queries.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    for i in range(n):
        test = data_test[i]['abstract']
        test_sentences = nltk.sent_tokenize(test)
        test_embed = sess.run(query_embedding_op,
                              feed_dict={query_sentences_ph: test_sentences})
        # Mean over sentences gives a single vector for the abstract.
        test_embed = np.mean(test_embed, axis=0)

        distances, indices = neighbors.kneighbors([test_embed])
        indices = list(indices[0])
        # Use a distinct index name so the outer loop variable is never shadowed.
        result_embeddings = [all_embeddings[j] for j in indices]

        af = AffinityPropagation().fit(result_embeddings)
        num_clusters.append(len(af.cluster_centers_indices_))
num_clusters

[4,
 5,
 7,
 4,
 5,
 6,
 3,
 5,
 5,
 3,
 3,
 6,
 5,
 4,
 4,
 5,
 4,
 6,
 6,
 3,
 5,
 7,
 6,
 5,
 4,
 5,
 5,
 4,
 5,
 6,
 4,
 5,
 4,
 5,
 7,
 5,
 4,
 5,
 6,
 6,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 3,
 6,
 4,
 6,
 4,
 3,
 5,
 5,
 8,
 4,
 5,
 4,
 3,
 5,
 4,
 4,
 6,
 5,
 3,
 3,
 5,
 5,
 5,
 5,
 5,
 6,
 3,
 6,
 6,
 4,
 5,
 4,
 3,
 4,
 6,
 5,
 7,
 3,
 7,
 5,
 6,
 4,
 6,
 5,
 5,
 5,
 5]

In [206]:
# Set p as the threshold determining whether a paper is novel or not
# Hypothesis: if the number of clusters is greater than or equal to p, the paper is novel

num_clusters = np.asarray(num_clusters)
# Use the 95th percentile of the observed cluster counts as the threshold.
p = np.percentile(num_clusters, 95)

# Fraction of the examined test abstracts whose cluster count reaches p.
# The comparison uses p itself rather than a hard-coded literal, so the
# fraction always matches the threshold computed above.
# For the recorded run p is 7, which flags about 6% (6 of 101) of the
# first 101 test abstracts as potentially novel.
float(np.mean(num_clusters >= p))

0.0594059405940594

In [197]:
# Prediction function
# Takes in the embeddings of a query document, and documents to search from
# Takes in p, calculated above
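# Returns the indices (into the searched embeddings / original dataset) of the nearest neighbours,
# for novel and non-novel documents alike; prints a notice when the document may be novel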

def novel_predict(query_embedding, embeddings, p, n_neighbors=20):
    """Find the nearest neighbours of a query document and flag possible novelty.

    The neighbours of ``query_embedding`` among ``embeddings`` (cosine
    distance) are clustered with Affinity Propagation. If the number of
    clusters found is at least ``p``, a notice is printed.

    Parameters
    ----------
    query_embedding : embedding vector of the query document.
    embeddings : sequence of embedding vectors to search in.
    p : cluster-count threshold at or above which the document is flagged.
    n_neighbors : number of neighbours to retrieve (default 20). Capped at
        ``len(embeddings)`` so that small collections can still be searched.

    Returns
    -------
    list
        Indices into ``embeddings`` of the retrieved neighbours, in the order
        returned by ``kneighbors``. Returned whether or not the document is
        flagged as novel.
    """
    # NearestNeighbors rejects n_neighbors larger than the number of samples.
    k = min(n_neighbors, len(embeddings))
    neighbors = NearestNeighbors(n_neighbors=k, algorithm="auto", metric='cosine').fit(embeddings)
    _, indices = neighbors.kneighbors([query_embedding])
    indices = list(indices[0])

    result_embeddings = [embeddings[i] for i in indices]
    af = AffinityPropagation().fit(result_embeddings)
    n_clusters = len(af.cluster_centers_indices_)

    if n_clusters >= p:
        print("This document may be novel")

    # The neighbours are returned in both cases; only the notice differs.
    return indices
    

In [198]:
#Example run

# Query settings: index of the test abstract and the cluster-count threshold
n, p = 2, 7
test = data_test[n]['abstract']
test_sentences = nltk.sent_tokenize(test)

# Embed every sentence of the query abstract
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    sentence_vectors = sess.run(embed(test_sentences))

# Average the sentence vectors into one abstract-level vector
test_embed = sum(sentence_vectors) / float(len(test_sentences))

# Retrieve the neighbours (prints a notice if the query may be novel)
result_indices = novel_predict(test_embed, all_embeddings, p)
print(result_indices)

# Look up and show the matching training records
results = []
for neighbour_index in result_indices:
    results.append(data_train[neighbour_index])

for result in results:
    print(result)


This document may be novel
[874, 590, 83, 951, 945, 756, 952, 732, 213, 197, 719, 804, 105, 751, 88, 662, 853, 924, 308, 785]
{'abstract': 'Constraint-based (CB) learning is a formalism for learning a causal network with a database D by performing a series of conditional-independence tests to infer structural information. This paper considers a new test of independence that combines ideas from Bayesian learning, Bayesian network inference, and classical hypothesis testing to produce a more reliable and robust test. The new test can be calculated in the same asymptotic time and space required for the standard tests such as the chi-squared test, but it allows the specification of a prior distribution over parameters and can be used when the database is incomplete. We prove that the test is correct, and we demonstrate empirically that, when used with a CB causal discovery algorithm with noninformative priors, it recovers structural features more reliably and it produces networks with smal

In [195]:
# Show the abstract of test entry 2 (the query used in the example run) for comparison with the results
data_test[2]['abstract']

'Cross-validation (CV) is a technique for evaluating the ability of statistical models/learning systems based on a given data set. Despite its wide applicability, the rather heavy computational cost can prevent its use as the system size grows. To resolve this difficulty in the case of Bayesian linear regression, we develop a formula for evaluating the leave-one-out CV error approximately without actually performing CV. The usefulness of the developed formula is tested by statistical mechanical analysis for a synthetic model. This is confirmed by application to a real-world supernova data set as well.'