In [27]:
import csv
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Build up my corpus

In [2]:
corpus = []

for f in os.listdir("datasets/papers/"):
    if (f.endswith(".txt")):
        with open(os.path.join("datasets/papers", f), "r", encoding = "utf8") as paper:
            corpus.append((f, paper.read()))

In [29]:
corpus[1:10]

[('100226.txt', 'Coherent functions and program checkers'),
 ('100228.txt', 'The wakeup problem'),
 ('100231.txt', 'Efficient robust parallel computations'),
 ('100262.txt', 'An optimal algorithm for on-line bipartite matching'),
 ('100269.txt',
  'One-way functions are necessary and sufficient for secure signatures'),
 ('100270.txt', 'Pseudo-random generators under uniform assumptions'),
 ('100272.txt', 'Witness indistinguishable and witness hiding protocols'),
 ('100273.txt',
  'Public-key cryptosystems provably secure against chosen ciphertext attacks'),
 ('100287.txt', 'The round complexity of secure protocols')]

## Build the TF-IDF vectorizer and matrix

In [28]:
tf = TfidfVectorizer(analyzer = "word", 
                         ngram_range = (1, 3), 
                         min_df = 0, 
                         stop_words = "english")

tf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [30]:
tfidf_matrix = tf.fit_transform([content for file, content in corpus])
tfidf_matrix

<5801x41397 sparse matrix of type '<class 'numpy.float64'>'
	with 77828 stored elements in Compressed Sparse Row format>

## Define a function to find similar documents

In [26]:
def find_similar(matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(matrix[index: index + 1], matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

## Let's test it out

In [22]:
k = 69

print(corpus[k])
print("============================================================================")

for index, score in find_similar(tfidf_matrix, k):
    print(score, corpus[index])

('1034226.txt', 'Disk Scrubbing in Large Archival Storage Systems')
0.206014472291 ('276683.txt', 'Archival storage for digital libraries')
0.183158767923 ('651321.txt', 'Venti: A New Approach to Archival Storage')
0.169082360056 ('651308.txt', 'Enabling the Archival Storage of Signed Documents')
0.163059191107 ('6721.txt', 'Modelling storage systems')
0.154747351567 ('1098163.txt', 'Disk Infant Mortality in Large Storage Systems')


In [23]:
k = 1000

print(corpus[k])
print("============================================================================")

for index, score in find_similar(tfidf_matrix, k):
    print(score, corpus[index])

('1592576.txt', 'VL2: a scalable and flexible data center network')
0.368580782349 ('1402967.txt', 'A scalable, commodity data center network architecture')
0.17191681242 ('2465363.txt', 'MDCC: multi-data center consistency')
0.0842481596981 ('365690.txt', 'Determining a computing center environment')
0.0728900387948 ('317906.txt', 'Flexible type analysis')
0.0610049952954 ('383072.txt', 'A scalable content-addressable network')


In [24]:
k = 1

print(corpus[k])
print("============================================================================")

for index, score in find_similar(tfidf_matrix, k):
    print(score, corpus[index])

('100226.txt', 'Coherent functions and program checkers')
0.102871883709 ('802557.txt', 'Program slicing')
0.0831211600727 ('74871.txt', 'Mirage: a coherent distributed shared memory design')
0.0811169433187 ('176577.txt', 'A coherent distributed file cache with directory write-behind')
0.0772474014488 ('324179.txt', 'Linear hash functions')
0.0747132843379 ('6503.txt', 'How to construct random functions')


In [25]:
k = 129

print(corpus[k])
print("============================================================================")

for index, score in find_similar(tfidf_matrix, k):
    print(score, corpus[index])

('1057980.txt', 'Optimistic replication')
0.455880427813 ('675975.txt', 'Optimistic Replication for Internet Data Services')
0.303054286223 ('879308.txt', 'Optimistic Active Replication')
0.197371551772 ('758421.txt', 'Optimistic Atomic Broadcast')
0.185949033747 ('831121.txt', 'Optimistic Byzantine Agreement')
0.174990993836 ('2162347.txt', 'Optimistic generic broadcast')


In [31]:
k = 888

print(corpus[k])
print("============================================================================")

for index, score in find_similar(tfidf_matrix, k):
    print(score, corpus[index])

('141970.txt', 'The Apertos reflective operating system: the concept and its implementation')
0.111595933688 ('361161.txt', 'Monitors: an operating system structuring concept')
0.0998146193129 ('1173748.txt', 'Reflective program generation with patterns')
0.0981128385031 ('2156790.txt', 'A type system for reflective program generators')
0.0873313154276 ('21853.txt', 'Operating systems: design and implementation')
0.0758885399523 ('231070.txt', 'The design and implementation of the 4.4BSD operating system')
