In [186]:
import pickle
import numpy as np
import pandas as pd
import nltk
import string
import sys
import matplotlib.pyplot as plt

from random import randint
from tqdm import tqdm
from itertools import permutations
from scipy.sparse import dok_matrix, find
from scipy.spatial.distance import cosine, euclidean, jaccard

In [2]:
def __get_words(sentence):
    """
    Yield the lower-cased alphanumeric tokens of a sentence, one at a time.

    The sentence is split with ``nltk.tokenize.wordpunct_tokenize``; every token
    that contains a non-alphanumeric character (punctuation, symbols, underscores)
    is dropped. Stopwords and short words are NOT removed.
    """
    for token in nltk.tokenize.wordpunct_tokenize(sentence):
        token = token.lower()
        # str.isalnum() is already False for every punctuation character, so a
        # separate membership test against string.punctuation is unnecessary.
        if token.isalnum():
            yield token

            
def get_vocabulary(documents) -> dict:
    """
    Build a word -> integer id mapping over a collection of documents.

    Every document is split into sentences and every sentence into words. The
    first time a word is seen it receives the next unused integer id (starting
    at 0), so ids follow first-appearance order.
    """
    vocabulary = {}

    for document in tqdm(documents):
        for sentence in nltk.sent_tokenize(document):
            for token in __get_words(sentence):
                # The current size of the mapping is exactly the next free id;
                # setdefault leaves already-known words untouched.
                vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [3]:
# Load the dataset; its `summaries` column is the document text used throughout the notebook.
df = pd.read_csv("data/arxiv_data.csv")

In [4]:
# Vocabulary (word -> integer id) built from every text in the `summaries` column.
words = get_vocabulary(df.summaries)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:32<00:00, 1575.45it/s]


In [5]:
# Inverse vocabulary: integer id -> word.
rev_words = {index: word for word, index in words.items()}

In [6]:
# Vocabulary size (number of distinct tokens).
len(words)

58933

In [7]:
def get_doc(idx):
    """Return the text stored in the ``summaries`` column of the global ``df`` at label ``idx``."""
    summary = df.loc[idx, "summaries"]
    return summary

In [8]:
def word2int(word):
    """Look up the integer id of ``word`` in the global ``words`` vocabulary (KeyError if unknown)."""
    index = words[word]
    return index

def int2word(idx):
    """Look up the word whose integer id is ``idx`` in the global ``rev_words`` mapping."""
    word = rev_words[idx]
    return word

In [157]:
def get_most_similar_from_buckets(buckets, n_docs):
    """
    Rank, for every document, the other documents by the number of buckets they share.

    Parameters
    ----------
    buckets : dict
        Maps a bucket key to the collection of document ids placed in that bucket.
    n_docs : int
        Number of documents; ids are expected to be ``0 .. n_docs - 1``.

    Returns
    -------
    dict
        Maps each document id to the list of ids it shares at least one bucket
        with, ordered from most to fewest shared buckets. Documents that never
        collide with another one map to an empty list.
    """
    shared_counts = {doc_id: {} for doc_id in range(n_docs)}

    # Every ordered pair inside a bucket adds one collision for that pair.
    for bucket_key in tqdm(buckets):
        members = buckets[bucket_key]
        for doc_a, doc_b in permutations(members, 2):
            neighbours = shared_counts[doc_a]
            neighbours[doc_b] = neighbours.get(doc_b, 0) + 1

    # Replace each per-document counter with its neighbour ids sorted by count.
    for doc_id in tqdm(shared_counts.keys()):
        neighbour_ids = list(shared_counts[doc_id].keys())
        collision_counts = list(shared_counts[doc_id].values())

        ranking = np.argsort(collision_counts)[::-1]
        shared_counts[doc_id] = [neighbour_ids[position] for position in ranking]
    return shared_counts

## LSHT for Jaccard Similarity | Bag of Words Representation

In [13]:
def to_bag_of_words(documents, vocabulary):
    """
    Build a binary document-term matrix.

    Parameters
    ----------
    documents : sized iterable of str
        Raw texts; each is split into sentences and then into words.
    vocabulary : dict
        Maps every word that can occur to its column index.

    Returns
    -------
    scipy.sparse.dok_matrix
        Shape ``(len(documents), len(vocabulary))``; entry ``(i, j)`` is 1 when
        document ``i`` contains the word with index ``j``.
    """
    n_docs = len(documents)
    term_ids_per_doc = {doc_id: set() for doc_id in range(n_docs)}

    # First pass: collect the distinct column ids of every document.
    for doc_id, text in tqdm(enumerate(documents)):
        seen = term_ids_per_doc[doc_id]
        for sentence in nltk.sent_tokenize(text):
            seen.update(vocabulary[token] for token in __get_words(sentence))

    # Second pass: write one row of ones per document into the sparse matrix.
    bag = dok_matrix((n_docs, len(vocabulary)))
    for doc_id, term_ids in tqdm(term_ids_per_doc.items()):
        bag[doc_id, list(term_ids)] = 1
    return bag

In [14]:
# Binary document-term matrix, converted to CSC because LSHT slices it one column at a time.
docs = to_bag_of_words(df.summaries, words).tocsc()

51774it [00:38, 1353.40it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:17<00:00, 3011.95it/s]


In [17]:
# (number of documents, vocabulary size)
docs.shape

(51774, 58933)

In [16]:
def get_buckets(documents, permutations, N, B, R, NB):
    """
    Group documents into LSH buckets using min-hash signatures.

    Parameters
    ----------
    documents : sequence of set
        ``documents[e]`` is the set of document ids (``0 .. N-1``) that contain
        element ``e``.
    permutations : sequence of sequences of int
        At least ``B * R`` orderings of the element ids; ordering
        ``band * R + r`` produces signature row ``r`` of band ``band``.
    N : int
        Number of documents.
    B : int
        Number of bands.
    R : int
        Number of signature rows per band.
    NB : int
        Number of bucket ids per band; the hash of a band signature is reduced
        modulo ``NB``.

    Returns
    -------
    dict
        Maps ``(band, bucket_id)`` to the set of document ids in that bucket.
    """
    buckets = {}
    all_docs = set(range(N))

    for band in tqdm(range(B)):
        signatures = np.zeros((N, R), dtype=int)
        for r in range(R):
            current_perm = permutations[band*R + r]
            # The scan is bounded by the number of elements in the ordering,
            # not by the number of documents.
            perm_len = len(current_perm)
            pending = all_docs.copy()
            i = 0
            while pending and i < perm_len:
                # Documents seeing their first element at position i get i
                # as their min-hash value.
                docs_found = documents[current_perm[i]] & pending
                if docs_found:
                    signatures[list(docs_found), r] = i
                    pending -= docs_found
                i += 1
            if pending:
                # Documents containing no element at all share the sentinel
                # value one past the last valid position.
                signatures[list(pending), r] = perm_len

        for doc in range(N):
            bucket = hash(tuple(signatures[doc])) % NB
            buckets.setdefault((band, bucket), set()).add(doc)
    return buckets

In [18]:
def LSHT(documents, B, R, NB=28934501):
    """
    Bucket the rows of a sparse document-term matrix with banded min-hash LSH.

    Parameters
    ----------
    documents : scipy sparse matrix
        Shape ``(n_documents, n_terms)``; sliced column by column.
    B : int
        Number of bands.
    R : int
        Number of signature rows per band.
    NB : int, optional
        Number of bucket ids per band.

    Returns
    -------
    dict
        The ``(band, bucket_id) -> set of document ids`` mapping produced by
        ``get_buckets``.
    """
    n_docs, n_terms = documents.shape

    # For each term column, the set of document rows holding a non-zero entry.
    docs_per_term = [
        set(find(documents[:, term])[0])
        for term in tqdm(range(n_terms))
    ]

    # One random ordering of the terms per signature row (B bands x R rows).
    n_hashes = B * R
    row_orders = np.array([np.random.permutation(n_terms) for _ in range(n_hashes)])
    return get_buckets(docs_per_term, row_orders, n_docs, B, R, NB)
    

In [19]:
# Min-hash LSH with B=40 bands of R=5 rows; band signatures are hashed into NB=999999 bucket ids.
# NOTE(review): no random seed is set in the visible cells, so the buckets differ between runs.
buckets = LSHT(docs, 40, 5 , NB=999999)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58933/58933 [00:23<00:00, 2546.37it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [10:32<00:00, 15.81s/it]


In [20]:
# Number of non-empty (band, bucket) pairs.
len(buckets)

1432952

In [24]:
# For every document, the candidate neighbours ordered by how many buckets they share with it.
similar = get_most_similar_from_buckets(buckets, docs.shape[0])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1432952/1432952 [00:04<00:00, 330317.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:02<00:00, 21462.43it/s]


In [27]:
# Five top-ranked candidates for document 0.
similar[0][:5]

[17704, 27091, 514, 6225, 9138]

In [26]:
def JaccardSim(d1, d2):
    """
    Jaccard similarity of two sparse binary row vectors.

    Both arguments are densified with ``toarray()``. The result keeps the 2-D
    shape produced by ``np.inner`` (a 1x1 array for single-row inputs).
    """
    dense_a = d1.toarray()
    dense_b = d2.toarray()

    intersection = np.inner(dense_a, dense_b)
    # Summing both vectors counts shared entries twice; removing the
    # intersection once leaves the size of the union.
    union = (dense_a + dense_b).sum() - intersection
    return intersection / union

In [68]:
# Banding formula 1 - (1 - s**R)**B with s=0.8, R=5 rows and B=40 bands (the values passed to LSHT):
# presumably the chance that two documents with similarity 0.8 share at least one bucket.
1 - (1 - 0.8**5)**40

0.9999998732227786

In [76]:
i = 10
# Exact Jaccard similarity between document i and each of its five top-ranked LSH candidates.
[JaccardSim(docs[i], docs[j]) for j in similar[i][:5]]

[array([[1.]]),
 array([[0.19565217]]),
 array([[0.18954248]]),
 array([[0.17699115]]),
 array([[0.12222222]])]

In [61]:
# Five top-ranked LSH candidates for document i.
similar[i][:5]

[4866, 19173, 8492, 37524, 17726]

In [62]:
# Text of the query document i.
print(get_doc(i))

Machine learning has been utilized to perform tasks in many different domains
such as classification, object detection, image segmentation and natural
language analysis. Data labeling has always been one of the most important
tasks in machine learning. However, labeling large amounts of data increases
the monetary cost in machine learning. As a result, researchers started to
focus on reducing data annotation and labeling costs. Transfer learning was
designed and widely used as an efficient approach that can reasonably reduce
the negative impact of limited data, which in turn, reduces the data
preparation cost. Even transferring previous knowledge from a source domain
reduces the amount of data needed in a target domain. However, large amounts of
annotated data are still demanded to build robust models and improve the
prediction accuracy of the model. Therefore, researchers started to pay more
attention on auto annotation and labeling. In this survey paper, we provide a
review of previo

In [63]:
# Text of document 4866, the first candidate in the list shown above, for comparison with document i.
print(get_doc(4866))

Machine learning has been utilized to perform tasks in many different domains
such as classification, object detection, image segmentation and natural
language analysis. Data labeling has always been one of the most important
tasks in machine learning. However, labeling large amounts of data increases
the monetary cost in machine learning. As a result, researchers started to
focus on reducing data annotation and labeling costs. Transfer learning was
designed and widely used as an efficient approach that can reasonably reduce
the negative impact of limited data, which in turn, reduces the data
preparation cost. Even transferring previous knowledge from a source domain
reduces the amount of data needed in a target domain. However, large amounts of
annotated data are still demanded to build robust models and improve the
prediction accuracy of the model. Therefore, researchers started to pay more
attention on auto annotation and labeling. In this survey paper, we provide a
review of previo

## LSHT for Cosine Similarity | TF-IDF Representation

In [77]:
def process_raw_text(documents):
    """
    Tokenise every document into a flat list of words.

    Each document is split into sentences, each sentence into words, and the
    words of all sentences are concatenated in order. Returns one list of
    words per input document.
    """
    tokenised = []
    for _, text in tqdm(enumerate(documents)):
        tokens = [
            word
            for sentence in nltk.sent_tokenize(text)
            for word in __get_words(sentence)
        ]
        tokenised.append(tokens)
    return tokenised

In [78]:
def count_words(doc) -> tuple[np.ndarray, np.ndarray]:
    """
    Count the occurrences of each distinct word in one tokenised document.

    Returns a pair ``(unique_words, counts)`` of aligned arrays as produced by
    ``np.unique(..., return_counts=True)``: the distinct words in sorted order
    and, at the same positions, how many times each occurs.
    """
    return np.unique(doc, return_counts=True)
    
def get_tf_matrix(docs, vocab):
    """
    Build the sparse term-frequency matrix of a tokenised corpus.

    Entry ``(i, vocab[w])`` is the number of times word ``w`` occurs in
    document ``i`` divided by the count of that document's most frequent word.
    Documents without any token keep an all-zero row.

    Parameters
    ----------
    docs : sequence of sequences of str
        One list of words per document.
    vocab : dict
        Maps every word to its column index.

    Returns
    -------
    scipy.sparse.dok_matrix
        Shape ``(len(docs), len(vocab))``.
    """
    n_docs, n_terms = len(docs), len(vocab)
    tf_matrix = dok_matrix((n_docs, n_terms))

    # Wrap the sequence itself (not the enumerate object) so tqdm knows the total.
    for i, doc in enumerate(tqdm(docs)):
        doc_words, counts = count_words(doc)
        if len(doc_words) > 0:
            # Resolve column ids through the vocabulary that was passed in,
            # not through the notebook-global lookup.
            term_ids = [vocab[word] for word in doc_words]
            tf_matrix[i, term_ids] = counts / counts.max()
    return tf_matrix

def get_idf_matrix(docs, vocab):
    """
    Compute a smoothed inverse-document-frequency weight for every vocabulary word.

    The weight of word ``w`` is ``log2((N + 1) / (df_w + 1))`` where ``N`` is the
    number of documents and ``df_w`` the number of documents containing ``w``.

    Parameters
    ----------
    docs : sequence of sequences of str
        One list of words per document.
    vocab : dict
        Maps every word to its position in the returned vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(vocab)``.
    """
    n_docs = len(docs)
    doc_freq = np.zeros(len(vocab))

    for doc in tqdm(docs):
        # Each distinct word counts once per document; ids come from the
        # vocabulary that was passed in, not from the notebook-global lookup.
        for word in set(doc):
            doc_freq[vocab[word]] += 1

    return np.log2((n_docs + 1) / (doc_freq + 1))


def get_tf_idf(docs, vocab):
    """
    Combine term frequencies and inverse document frequencies into TF-IDF weights.

    The term-frequency matrix from ``get_tf_matrix`` is converted to CSR and
    multiplied element-wise by the IDF vector from ``get_idf_matrix``, which
    scales each vocabulary column by its weight. Returns the sparse product.
    """
    term_freq = get_tf_matrix(docs, vocab)
    inverse_doc_freq = get_idf_matrix(docs, vocab)

    weighted = term_freq.tocsr().multiply(inverse_doc_freq)
    return weighted

In [79]:
# Tokenise every summary into a list of words.
# NOTE: this rebinds `docs`, which until here held the sparse bag-of-words matrix.
docs = process_raw_text(df.summaries)

51774it [00:32, 1583.14it/s]


In [80]:
# TF-IDF matrix (documents x vocabulary) in CSR format for row slicing below.
tfidf = get_tf_idf(docs, words).tocsr()

51774it [00:36, 1417.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:16<00:00, 3079.07it/s]


In [87]:
# First ten TF-IDF rows, used below to try the signature computation on a small sample.
subset = tfidf[:10]
subset.shape

(10, 58933)

In [96]:
# NOTE(review): `vectors` is only assigned in the following cell, so this cell fails on a fresh
# top-to-bottom run; the recorded (5, 58933) output also does not match the
# (vocabulary size, 5) array created there.
vectors.shape

(5, 58933)

In [109]:
# Five random sign vectors stored as columns: one -1/+1 entry per vocabulary term, each sign
# drawn with probability of about one half.
vectors = np.where(np.random.random( (tfidf.shape[1], 5) )<=0.5, -1, 1)

In [115]:
# One bit per (document, random vector): 1 when the projection is positive, otherwise 0.
sigs = np.where( (subset @ vectors)<=0, 0, 1)

In [116]:
# (number of sampled documents, number of random vectors)
sigs.shape

(10, 5)

In [117]:
# Powers of two (1, 2, 4, 8, 16) used to pack the five signature bits into one integer.
bins = 2**np.arange(5)

In [121]:
# Integer code of each sampled document's 5-bit signature; equal codes mean equal signatures.
sigs @ bins

array([29, 17,  9, 17,  0, 13, 10, 13, 25,  9])

In [149]:
def get_buckets_cosine(documents, vectors, N, B, R, NB):
    """
    Group documents into LSH buckets using the signs of random projections.

    Parameters
    ----------
    documents : matrix-like
        One row per document; must support ``documents @ vectors``.
    vectors : array-like
        One column per projection; at least ``B * R`` columns are used.
    N : int
        Number of documents (rows) to bucket.
    B : int
        Number of bands.
    R : int
        Number of signature bits per band.
    NB : int
        Number of bucket ids per band; the hash of a band signature is reduced
        modulo ``NB``.

    Returns
    -------
    dict
        Maps ``(band, bucket_id)`` to the set of document ids in that bucket.
    """
    buckets = {}
    # Sign bit of every projection: 1 where the dot product is positive, otherwise 0.
    signatures = np.where((documents @ vectors) <= 0, 0, 1)

    for band in tqdm(range(B)):
        # Columns band*R .. band*R + R - 1 form this band's R-bit signature.
        band_signatures = signatures[:, band*R:band*R+R]

        for doc in range(N):
            bucket = hash(tuple(band_signatures[doc])) % NB
            buckets.setdefault((band, bucket), set()).add(doc)
    return buckets

In [150]:
def LSHT_cosine(documents, B, R, NB=28934501):
    """
    Bucket the rows of a document matrix with banded random-projection LSH.

    Draws ``B * R`` random -1/+1 vectors (one column each, one entry per
    feature) and delegates to ``get_buckets_cosine``.

    Parameters
    ----------
    documents : matrix-like
        Shape ``(n_documents, n_features)``.
    B : int
        Number of bands.
    R : int
        Number of signature bits per band.
    NB : int, optional
        Number of bucket ids per band.

    Returns
    -------
    dict
        The ``(band, bucket_id) -> set of document ids`` mapping.
    """
    n_docs, n_features = documents.shape

    n_projections = B * R
    uniform_draws = np.random.random(size=(n_features, n_projections))
    sign_vectors = np.where(uniform_draws <= 0.5, -1, 1)
    return get_buckets_cosine(documents, sign_vectors, n_docs, B, R, NB)

In [154]:
# Random-projection LSH on the TF-IDF matrix: B=80 bands of R=20 sign bits, NB=999999 bucket ids.
# NOTE(review): no random seed is set in the visible cells, so the buckets differ between runs.
buckets = LSHT_cosine(tfidf, 80, 20, NB=999999)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:21<00:00,  3.68it/s]


In [158]:
# For every document, the candidate neighbours ordered by how many buckets they share with it.
sims = get_most_similar_from_buckets(buckets, tfidf.shape[0])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2981805/2981805 [00:03<00:00, 796048.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:00<00:00, 92685.97it/s]


In [163]:
# Dense 1-D view of the TF-IDF vector of document 0.
tfidf[0].toarray().ravel()

array([3.58497428, 2.16827036, 0.06165452, ..., 0.        , 0.        ,
       0.        ])

In [249]:
# random.randint includes both endpoints, so the upper bound must be the last valid row index.
i = randint(0, tfidf.shape[0] - 1)
# Exact cosine similarity between document i and each of its LSH candidates;
# the query row is densified once instead of once per candidate.
query_vec = tfidf[i].toarray().ravel()
[1 - cosine(query_vec, tfidf[j].toarray().ravel()) for j in sims[i]]

[1,
 1,
 0.026569881752453228,
 0.01870539138215488,
 0.019666222438966097,
 0.011531970680638715,
 0.04602095817530805,
 0.28984236025492627,
 0.28984236025492627]

In [241]:
# Five top-ranked LSH candidates for document i.
sims[i][:5]

[15718, 20921, 39075, 22102, 39971]

In [242]:
# Text of the query document i.
print(get_doc(i))

In this paper, we study a new representation-learning task, which we termed
as disassembling object representations. Given an image featuring multiple
objects, the goal of disassembling is to acquire a latent representation, of
which each part corresponds to one category of objects. Disassembling thus
finds its application in a wide domain such as image editing and few- or
zero-shot learning, as it enables category-specific modularity in the learned
representations. To this end, we propose an unsupervised approach to achieving
disassembling, named Unsupervised Disassembling Object Representation (UDOR).
UDOR follows a double auto-encoder architecture, in which a fuzzy
classification and an object-removing operation are imposed. The fuzzy
classification constrains each part of the latent representation to encode
features of up to one object category, while the object-removing, combined with
a generative adversarial network, enforces the modularity of the
representations and integrity of

In [243]:
# Text of document 20921, one of the candidates in the list shown above, for comparison with document i.
print(get_doc(20921))

The problem of air pollution threatens public health. Air quality forecasting
can provide the air quality index hours or even days later, which can help the
public to prevent air pollution in advance. Previous works focus on citywide
air quality forecasting and cannot solve nationwide city forecasting problem,
whose difficulties lie in capturing the latent dependencies between
geographically distant but highly correlated cities. In this paper, we propose
the group-aware graph neural network (GAGNN), a hierarchical model for
nationwide city air quality forecasting. The model constructs a city graph and
a city group graph to model the spatial and latent dependencies between cities,
respectively. GAGNN introduces differentiable grouping network to discover the
latent dependencies among cities and generate city groups. Based on the
generated city groups, a group correlation encoding module is introduced to
learn the correlations between them, which can effectively capture the
dependencies 

In [250]:
# Show the current query document index.
i

11051

In [251]:
# Brute-force baseline: exact cosine similarity between document i and every document.
# The query row does not change inside the loop, so it is densified only once.
query_vec = tfidf[i].toarray().ravel()
similarity = []
for j in tqdm(range(tfidf.shape[0])):
    similarity.append(
        1 - cosine(query_vec, tfidf[j].toarray().ravel())
    )
similarity = np.array(similarity)
# Indices of the ten highest similarities, best first (document i itself is included).
idx = np.argsort(similarity)[::-1][:10]
similarity[idx]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51774/51774 [00:25<00:00, 1996.94it/s]


array([1.        , 1.        , 1.        , 0.31584664, 0.31584664,
       0.31388063, 0.309387  , 0.30180997, 0.29849764, 0.29408726])