In [1]:
import itertools
import pickle
import string
import sys

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix, find
from scipy.spatial.distance import cosine, euclidean, jaccard
from tqdm import tqdm

In [2]:
def __get_words(sentence):
    """
    Lazily yield the lower-cased alphanumeric tokens of a sentence.

    The sentence is split with NLTK's ``wordpunct_tokenize``. A token is kept
    only when its lower-cased form is entirely alphanumeric, so any token
    containing a non-alphanumeric character (punctuation included) is dropped.
    Stopwords and short words are NOT filtered out.
    """
    lowered_tokens = map(str.lower, nltk.tokenize.wordpunct_tokenize(sentence))
    # An alphanumeric token can never be a punctuation mark, so one filter suffices.
    yield from filter(str.isalnum, lowered_tokens)

            
def get_vocabulary(documents) -> dict:
    """
    Build a word -> integer id mapping over a collection of documents.

    Every document is split into sentences and every sentence into tokens.
    Ids are handed out in order of first appearance, starting at 1.
    """
    vocabulary = {}
    for text in tqdm(documents):
        for sent in nltk.sent_tokenize(text):
            for token in __get_words(sent):
                # Known words are left untouched; a new word receives the next free id.
                vocabulary.setdefault(token, len(vocabulary) + 1)
    return vocabulary

In [3]:
# Load the arXiv dataset; its "summaries" column is the document text used below.
df = pd.read_csv("data/arxiv_data.csv")

In [4]:
# Build the word -> integer id vocabulary from every summary.
words = get_vocabulary(df.summaries)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:19<00:00, 2631.62it/s]


In [5]:
# Inverse vocabulary: integer id -> word.
rev_words = {idx: word for word, idx in words.items()}

In [6]:
# Vocabulary size: number of distinct tokens found in the corpus.
len(words)

58933

In [104]:
def get_doc(idx):
    """Return the "summaries" entry of the global ``df`` at index label ``idx``."""
    column = "summaries"
    return df.loc[idx, column]

In [7]:
def word2int(word):
    """
    Return the integer id assigned to ``word`` in the global ``words`` vocabulary.

    Raises KeyError when the word is not part of the vocabulary.
    """
    # The lookup has to go through the vocabulary dict, not the argument itself.
    return words[word]

def int2word(idx):
    """Return the word whose vocabulary id is ``idx`` (looked up in the global ``rev_words``)."""
    word = rev_words[idx]
    return word

In [8]:
def to_bag_of_words(documents, vocabulary):
    """
    Encode documents as a binary document-term ``dok_matrix``.

    The matrix has one row per document and ``len(vocabulary) + 1`` columns.
    Entry ``[i, vocabulary[word]]`` is set to 1 for every token of document
    ``i``. A token missing from ``vocabulary`` raises KeyError.
    """
    n_rows = len(documents)
    n_cols = len(vocabulary) + 1
    bag = dok_matrix((n_rows, n_cols))

    for row, text in tqdm(enumerate(documents)):
        tokens = (
            token
            for sent in nltk.sent_tokenize(text)
            for token in __get_words(sent)
        )
        for token in tokens:
            bag[row, vocabulary[token]] = 1

    return bag

In [8]:
def sparse_to_set(docs):
    """
    Convert a sparse document-term matrix into ``{row index: set of column indices}``.

    ``docs`` is any scipy sparse matrix exposing ``tocsr()``. For every row the
    result holds the set of column indices (as Python ints) that have a stored
    entry in that row; rows without entries map to an empty set.
    """
    # CSR keeps each row's column indices contiguous in `indices`, delimited by
    # `indptr`, so a row's columns can be read directly instead of going through
    # per-row dict keys (whose first component is the row, not the column).
    rows = docs.tocsr()
    indptr, indices = rows.indptr, rows.indices
    return {
        i: set(indices[indptr[i]:indptr[i + 1]].tolist())
        for i in tqdm(range(rows.shape[0]))
    }

In [9]:
# Binary document-term matrix: one row per summary, one column per vocabulary id.
docs = to_bag_of_words(df.summaries, words)

51774it [02:15, 381.70it/s]


In [21]:
# Sanity check: vocabulary ids start at 1, so column 0 is never written and
# no document should appear here.
set(find(docs[:, 0])[0])

set()

In [38]:
# Switch to compressed-sparse-column storage before LSHT slices the matrix
# one column at a time.
docs = docs.tocsc()

## LSHT for Jaccard Similarity

In [40]:
def get_buckets(documents, permutations, N, B, R, NB):
    """
    Compute banded min-hash signatures and group documents into hash buckets.

    Parameters
    ----------
    documents : sequence of sets
        ``documents[e]`` is the set of document ids that contain element ``e``.
    permutations : sequence of sequences
        ``B * R`` permutations of the element ids; permutation ``band * R + r``
        produces row ``r`` of band ``band``.
    N : int
        Number of documents (ids ``0 .. N-1``).
    B, R : int
        Number of bands and number of signature rows per band.
    NB : int
        Number of hash buckets per band.

    Returns
    -------
    dict
        ``{(band, bucket): set of document ids}``.
    """
    buckets = {}
    all_docs = set(range(N))

    for band in tqdm(range(B)):
        signatures = np.zeros((N, R), dtype=int)
        for r in range(R):
            current_perm = permutations[band * R + r]
            remaining = all_docs.copy()

            # A document's signature is the position, in permuted order, of the
            # first element it contains. The scan is bounded by the number of
            # elements in the permutation, not by the number of documents.
            for position, elem in enumerate(current_perm):
                if not remaining:
                    break
                docs_found = documents[elem] & remaining
                if docs_found:
                    signatures[list(docs_found), r] = position
                    remaining -= docs_found

            # Documents containing none of the elements get a sentinel one past
            # the last valid position.
            if remaining:
                signatures[list(remaining), r] = len(current_perm)

        for doc in range(N):
            bucket = hash(tuple(signatures[doc])) % NB
            buckets.setdefault((band, bucket), set()).add(doc)
    return buckets

In [86]:
def LSHT(documents, B, R, NB=28934501, seed=None):
    """
    Bucket documents with banded min-hashing over random permutations.

    Parameters
    ----------
    documents : scipy sparse matrix, shape (N, M)
        Document-term matrix; a non-zero entry ``[d, e]`` means document ``d``
        contains element ``e``.
    B, R : int
        Number of bands and rows per band; ``B * R`` permutations are drawn.
    NB : int, optional
        Number of hash buckets per band.
    seed : int or None, optional
        When given, the permutations are drawn from a dedicated
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When None, the global ``np.random`` state is used.

    Returns
    -------
    dict
        ``{(band, bucket): set of document ids}`` as produced by ``get_buckets``.
    """
    N, M = documents.shape

    # For every element (column), the set of documents (rows) that contain it.
    d_transpose = [set(find(documents[:, col])[0]) for col in tqdm(range(M))]

    rng = np.random if seed is None else np.random.RandomState(seed)
    permutations = np.array([rng.permutation(M) for _ in range(B * R)])
    return get_buckets(d_transpose, permutations, N, B, R, NB)
    

In [87]:
# 40 bands of 5 rows each, i.e. 200 random permutations in total.
buckets = LSHT(docs, 40, 5)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58934/58934 [00:12<00:00, 4590.81it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [05:40<00:00,  8.50s/it]


In [94]:
def get_most_similar_from_buckets(buckets, n_docs):
    """
    Rank, for every document, the documents it shares LSH buckets with.

    Parameters
    ----------
    buckets : dict
        ``{bucket key: set of document ids}``.
    n_docs : int
        Number of documents; every id found in ``buckets`` must be in
        ``range(n_docs)``, otherwise KeyError is raised.

    Returns
    -------
    dict
        ``{doc id: [other doc ids]}`` ordered from the most to the least
        frequently co-bucketed document. Documents that never share a bucket
        map to an empty list.
    """
    shared_counts = {doc: {} for doc in range(n_docs)}

    for members in tqdm(buckets.values()):
        # Ordered pairs, so each co-occurrence is counted for both documents.
        for doc_a, doc_b in itertools.permutations(members, 2):
            neighbours = shared_counts[doc_a]
            neighbours[doc_b] = neighbours.get(doc_b, 0) + 1

    ranked = {}
    for doc, neighbours in tqdm(shared_counts.items()):
        candidates = list(neighbours.keys())
        counts = list(neighbours.values())

        order = np.argsort(counts)[::-1]
        ranked[doc] = [candidates[i] for i in order]
    return ranked
        

In [96]:
# For every document, the other documents ordered by how many buckets they share with it.
similar = get_most_similar_from_buckets(buckets, docs.shape[0])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1442940/1442940 [00:03<00:00, 448592.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:02<00:00, 25566.13it/s]


In [112]:
# Candidate neighbours of document 0, most frequently co-bucketed first.
similar[0]

[27091, 17704, 30014, 13449, 5267, 26780, 27076]

In [98]:
def JaccardSim(d1, d2):
    """
    Jaccard similarity of two sparse single-row 0/1 vectors.

    Both arguments are densified with ``toarray()``. The inner product counts
    the shared elements, and the sum of both vectors minus that count gives the
    size of the union. The result keeps the shape of the inner product (a 1x1
    array for 1xM inputs).
    """
    dense_a = d1.toarray()
    dense_b = d2.toarray()

    intersection = np.inner(dense_a, dense_b)
    union = (dense_a + dense_b).sum() - intersection
    return intersection / union

In [118]:
# Exact Jaccard similarity between document 0 and one of its LSH candidates.
JaccardSim(docs[0], docs[30014])

array([[0.17177914]])

In [119]:
# Raw text of document 0, for a manual comparison with its candidate.
print(get_doc(0))

Stereo matching is one of the widely used techniques for inferring depth from
stereo images owing to its robustness and speed. It has become one of the major
topics of research since it finds its applications in autonomous driving,
robotic navigation, 3D reconstruction, and many other fields. Finding pixel
correspondences in non-textured, occluded and reflective areas is the major
challenge in stereo matching. Recent developments have shown that semantic cues
from image segmentation can be used to improve the results of stereo matching.
Many deep neural network architectures have been proposed to leverage the
advantages of semantic segmentation in stereo matching. This paper aims to give
a comparison among the state of art networks both in terms of accuracy and in
terms of speed which are of higher importance in real-time applications.


In [120]:
# Raw text of candidate document 30014, for a manual comparison with document 0.
print(get_doc(30014))

Geometric model fitting is a fundamental research topic in computer vision
and it aims to fit and segment multiple-structure data. In this paper, we
propose a novel superpixel-guided two-view geometric model fitting method
(called SDF), which can obtain reliable and consistent results for real images.
Specifically, SDF includes three main parts: a deterministic sampling
algorithm, a model hypothesis updating strategy and a novel model selection
algorithm. The proposed deterministic sampling algorithm generates a set of
initial model hypotheses according to the prior information of superpixels.
Then the proposed updating strategy further improves the quality of model
hypotheses. After that, by analyzing the properties of the updated model
hypotheses, the proposed model selection algorithm extends the conventional
"fit-and-remove" framework to estimate model instances in multiple-structure
data. The three parts are tightly coupled to boost the performance of SDF in
both speed and accurac

## LSHT for Cosine Similarity, using TF-IDF Representation