In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk

from scipy.spatial.distance import minkowski, cosine
from IPython.display import display
from typing import Sequence, Callable

In [2]:
df = pd.read_csv("arxiv_data.csv", nrows=500)

In [3]:
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


# Bag of Words

In [4]:
def get(index):
    return df.loc[index, "summaries"]

In [5]:
def get_vocabulary(all_docs)->set:
    vocab = set()
    for doc in all_docs:
        for word in nltk.word_tokenize(doc):
            vocab|={word.lower()}
    return vocab

In [6]:
vocab = list(get_vocabulary(df["summaries"]))
len(vocab)

7684

In [7]:
exclude = lambda i: pd.Index(data = range(mat.shape[0])).difference([i])

In [8]:
def get_stats(i: int):
    not_i = exclude(i)
    
    most_sim = mat[i,not_i].argmin()
    most_sim = not_i[most_sim]
    
    print(f"Min. Dist. Index = {most_sim}")
    print(f"Min. Dist. = {mat[i, most_sim]}\n")
    print(get(i))
    print()
    print(get(most_sim))

In [9]:
def get_rank(i: int, top=10):
    not_i = exclude(i)
    
    candidates = mat[i,not_i].argsort()[1:top+1]
    candidates = not_i[candidates]
    
    cand_scores = list(map(lambda x: float(f"{x:.4f}"), mat[i, candidates]))

    print(f"""Top Candidates: {candidates}
Their Scores: {cand_scores}""")
    
    return candidates

## Term Existence

In [70]:
def bag_of_words(document: str, vocab: list):
    bow = np.zeros(len(vocab))
    loc = dict(map(lambda pair: (pair[1], pair[0]), enumerate(vocab)))
    
    for word in nltk.word_tokenize(document):
        bow[loc[word.lower()]]=1
        
    return bow

In [71]:
def get_similarity_mat(N, dist):
    bows = [ bag_of_words(get(i), vocab) for i in range(N) ]
    
    sim_mat = np.zeros((N,N))
    for i in range(N):
        for j in range(i, N):
            sim_mat[i,j] = dist(bows[i], bows[j])
            sim_mat[j,i] = sim_mat[i,j]
    display(pd.DataFrame(data = sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

### Euclidean Distance

In [96]:
mat = get_similarity_mat(500, minkowski)

Unnamed: 0,0,1,2,3,4,5
0,0.0,13.266499,13.304135,13.0,13.638182,11.958261
1,13.266499,0.0,15.132746,14.73092,15.556349,14.177447
2,13.304135,15.132746,0.0,13.490738,15.264338,13.490738
3,13.0,14.73092,13.490738,0.0,15.0,12.489996
4,13.638182,15.556349,15.264338,15.0,0.0,14.035669
5,11.958261,14.177447,13.490738,12.489996,14.035669,0.0


In [97]:
index=45

In [98]:
get_stats(index)

Min. Dist. Index = 147
Min. Dist. = 10.488088481701515

In this work, we address the challenging task of few-shot segmentation.
Previous few-shot segmentation methods mainly employ the information of support
images as guidance for query image segmentation. Although some works propose to
build cross-reference between support and query images, their extraction of
query information still depends on the support images. We here propose to
extract the information from the query itself independently to benefit the
few-shot segmentation task. To this end, we first propose a prior extractor to
learn the query information from the unlabeled images with our proposed
global-local contrastive learning. Then, we extract a set of predetermined
priors via this prior extractor. With the obtained priors, we generate the
prior region maps for query images, which locate the objects, as guidance to
perform cross interaction with support features. In such a way, the extraction
of query information is detach

In [99]:
rete = get_rank(index)

Top Candidates: Int64Index([354, 19, 282, 472, 459, 491, 382, 35, 293, 173], dtype='int64')
Their Scores: [10.9087, 11.1803, 11.225, 11.2694, 11.2694, 11.3578, 11.4018, 11.4018, 11.4455, 11.4891]


> For document indexed with *0*, altough both the abstracts are revolve around image, their main research topic are not alike. <br><br>
This occurs for more than one document.

### Cosine Distance

In [88]:
mat = get_similarity_mat(500, cosine)

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.780618,0.759312,0.790238,0.725305,0.756036
1,0.780618,0.0,0.820732,0.831268,0.805819,0.846417
2,0.759312,0.820732,0.0,0.678378,0.758392,0.742052
3,0.790238,0.831268,0.678378,0.0,0.776502,0.688504
4,0.725305,0.805819,0.758392,0.776502,0.0,0.735822
5,0.756036,0.846417,0.742052,0.688504,0.735822,0.0


In [89]:
index = 45

In [90]:
get_stats(index)

Min. Dist. Index = 19
Min. Dist. = 0.6399588500884522

In this work, we address the challenging task of few-shot segmentation.
Previous few-shot segmentation methods mainly employ the information of support
images as guidance for query image segmentation. Although some works propose to
build cross-reference between support and query images, their extraction of
query information still depends on the support images. We here propose to
extract the information from the query itself independently to benefit the
few-shot segmentation task. To this end, we first propose a prior extractor to
learn the query information from the unlabeled images with our proposed
global-local contrastive learning. Then, we extract a set of predetermined
priors via this prior extractor. With the obtained priors, we generate the
prior region maps for query images, which locate the objects, as guidance to
perform cross interaction with support features. In such a way, the extraction
of query information is detache

In [91]:
rcte = get_rank(index)

Top Candidates: Int64Index([257, 173, 48, 379, 498, 166, 461, 355, 237, 205], dtype='int64')
Their Scores: [0.6538, 0.6726, 0.6731, 0.6731, 0.6746, 0.6746, 0.6752, 0.6796, 0.6851, 0.6858]


> Similarly to the last scenario, in *0*, both the papers talk about image processing. 

> In index *45* both papers refer image segmentation as the topic of the research, applied to different cases

### Dot Product Distance

In [92]:
mat = get_similarity_mat(500, lambda a,b: 1/np.dot(a,b))

Unnamed: 0,0,1,2,3,4,5
0,0.011364,0.041667,0.037037,0.045455,0.030303,0.043478
1,0.041667,0.007353,0.04,0.045455,0.034483,0.055556
2,0.037037,0.04,0.006993,0.023256,0.027027,0.032258
3,0.045455,0.045455,0.023256,0.008,0.03125,0.028571
4,0.030303,0.034483,0.027027,0.03125,0.006098,0.029412
5,0.043478,0.055556,0.032258,0.028571,0.029412,0.009901


In [93]:
index = 45

In [94]:
get_stats(index)

Min. Dist. Index = 41
Min. Dist. = 0.024390243902439025

In this work, we address the challenging task of few-shot segmentation.
Previous few-shot segmentation methods mainly employ the information of support
images as guidance for query image segmentation. Although some works propose to
build cross-reference between support and query images, their extraction of
query information still depends on the support images. We here propose to
extract the information from the query itself independently to benefit the
few-shot segmentation task. To this end, we first propose a prior extractor to
learn the query information from the unlabeled images with our proposed
global-local contrastive learning. Then, we extract a set of predetermined
priors via this prior extractor. With the obtained priors, we generate the
prior region maps for query images, which locate the objects, as guidance to
perform cross interaction with support features. In such a way, the extraction
of query information is detac

In [95]:
rdte = get_rank(index)

Top Candidates: Int64Index([355, 452, 184, 203, 257, 113, 122, 485, 205, 177], dtype='int64')
Their Scores: [0.0244, 0.0256, 0.0263, 0.0263, 0.0278, 0.0278, 0.0278, 0.0278, 0.0278, 0.0278]


> For index *0*, same result as the *cosine* scenario.

> In index *45* both papers refer image segmentation as the topic of the research, applied to different cases. <br>
\* Not the same document as in *cosine* distance

In [100]:
print(rete, rcte, rdte, sep="\n")

Int64Index([354, 19, 282, 472, 459, 491, 382, 35, 293, 173], dtype='int64')
Int64Index([257, 173, 48, 379, 498, 166, 461, 355, 237, 205], dtype='int64')
Int64Index([355, 452, 184, 203, 257, 113, 122, 485, 205, 177], dtype='int64')


In [101]:
np.intersect1d(rete, rcte)

array([173], dtype=int64)

In [102]:
np.intersect1d(rete, rdte)

array([], dtype=int64)

In [103]:
np.intersect1d(rdte, rcte)

array([205, 257, 355], dtype=int64)

> ## Final comments

We can see that the different distance methods recommend majorly distinct documents. This could mean that this Bagging method is not sufficiently good to represent the data, as even with difference metrics, the intersection set would be expected to be bigger then what it was.

**The removal of stopwords could improve this model results.**

## Term Weighting

In [104]:
def bag_of_words(document: str, vocab: list):
    bow = np.zeros(len(vocab))
    loc = dict(map(lambda pair: (pair[1], pair[0]), enumerate(vocab)))
    
    for word in nltk.word_tokenize(document):
        bow[loc[word.lower()]]+=1
        
    return bow

In [105]:
def get_similarity_mat(N, dist):
    bows = [ bag_of_words(get(i), vocab) for i in range(N) ]
    
    sim_mat = np.zeros((N,N))
    for i in range(N):
        for j in range(i, N):
            sim_mat[i,j] = dist(bows[i], bows[j])
            sim_mat[j,i] = sim_mat[i,j]
    display(pd.DataFrame(data = sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

### Euclidean Distance

In [106]:
mat = get_similarity_mat(500, minkowski)  

Unnamed: 0,0,1,2,3,4,5
0,0.0,31.76476,25.806976,23.811762,33.970576,18.894444
1,31.76476,0.0,33.151169,33.196385,32.32646,29.832868
2,25.806976,33.151169,0.0,20.712315,30.659419,23.811762
3,23.811762,33.196385,20.712315,0.0,28.930952,22.226111
4,33.970576,32.32646,30.659419,28.930952,0.0,31.921779
5,18.894444,29.832868,23.811762,22.226111,31.921779,0.0


In [113]:
index = 327

get_stats(index)

Min. Dist. Index = 35
Min. Dist. = 16.73320053068151

Image segmentation has long been a basic problem in computer vision.
Depth-wise Layering is a kind of segmentation that slices an image in a
depth-wise sequence unlike the conventional image segmentation problems dealing
with surface-wise decomposition. The proposed Depth-wise Layering technique
uses a single depth image of a static scene to slice it into multiple layers.
The technique employs a thresholding approach to segment rows of the dense
depth map into smaller partitions called Line-Segments in this paper. Then, it
uses the line-segment labelling method to identify number of objects and layers
of the scene independently. The final stage is to link objects of the scene to
their respective object-layers. We evaluate the efficiency of the proposed
technique by applying that on many images along with their dense depth maps.
The experiments have shown promising results of layering.

Segmentation of images is a long-standing chall

In [112]:
retw = get_rank(index)

Top Candidates: Int64Index([438, 200, 463, 205, 294, 166, 304, 478, 71, 383], dtype='int64')
Their Scores: [19.6977, 20.9284, 21.2838, 21.587, 21.7486, 21.7945, 22.0454, 22.0907, 22.3159, 22.3159]


> Index *0*: The documents don't really share anything beside the fact that are image related

> Index *327*: Both Related to Image Segmentation

### Cosine Distance

In [114]:
mat = get_similarity_mat(500, cosine)  

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.478418,0.41402,0.407314,0.353476,0.414093
1,0.478418,0.0,0.452115,0.476195,0.31211,0.401459
2,0.41402,0.452115,0.0,0.226834,0.292437,0.337706
3,0.407314,0.476195,0.226834,0.0,0.251733,0.343801
4,0.353476,0.31211,0.292437,0.251733,0.0,0.287083
5,0.414093,0.401459,0.337706,0.343801,0.287083,0.0


In [118]:
index = 327

In [119]:
get_stats(index)

Min. Dist. Index = 447
Min. Dist. = 0.25660158929168186

Image segmentation has long been a basic problem in computer vision.
Depth-wise Layering is a kind of segmentation that slices an image in a
depth-wise sequence unlike the conventional image segmentation problems dealing
with surface-wise decomposition. The proposed Depth-wise Layering technique
uses a single depth image of a static scene to slice it into multiple layers.
The technique employs a thresholding approach to segment rows of the dense
depth map into smaller partitions called Line-Segments in this paper. Then, it
uses the line-segment labelling method to identify number of objects and layers
of the scene independently. The final stage is to link objects of the scene to
their respective object-layers. We evaluate the efficiency of the proposed
technique by applying that on many images along with their dense depth maps.
The experiments have shown promising results of layering.

In machine learning and other fields, sugges

In [120]:
rctw = get_rank(index)

Top Candidates: Int64Index([83, 471, 488, 125, 235, 481, 399, 461, 111, 338], dtype='int64')
Their Scores: [0.2692, 0.2792, 0.2909, 0.2947, 0.2956, 0.2977, 0.2986, 0.3041, 0.3051, 0.3054]


> Index *0*: Both documents reference semantic image segmentation

> Index *327*: Both have a repeated word "segmentation", but share different meaning, due to the context

### Dot Product Distance

In [121]:
mat = get_similarity_mat(500, lambda a,b: 1/np.dot(a,b))

Unnamed: 0,0,1,2,3,4,5
0,0.002427,0.002538,0.002646,0.002833,0.001757,0.003968
1,0.002538,0.000722,0.001543,0.001748,0.000901,0.002119
2,0.002646,0.001543,0.00099,0.001387,0.001026,0.002242
3,0.002833,0.001748,0.001387,0.001161,0.00105,0.002451
4,0.001757,0.000901,0.001026,0.00105,0.000532,0.001527
5,0.003968,0.002119,0.002242,0.002451,0.001527,0.002227


In [129]:
index = 327

In [130]:
get_stats(index)

Min. Dist. Index = 235
Min. Dist. = 0.001218026796589525

Image segmentation has long been a basic problem in computer vision.
Depth-wise Layering is a kind of segmentation that slices an image in a
depth-wise sequence unlike the conventional image segmentation problems dealing
with surface-wise decomposition. The proposed Depth-wise Layering technique
uses a single depth image of a static scene to slice it into multiple layers.
The technique employs a thresholding approach to segment rows of the dense
depth map into smaller partitions called Line-Segments in this paper. Then, it
uses the line-segment labelling method to identify number of objects and layers
of the scene independently. The final stage is to link objects of the scene to
their respective object-layers. We evaluate the efficiency of the proposed
technique by applying that on many images along with their dense depth maps.
The experiments have shown promising results of layering.

The hatching process also influences the su

In [131]:
rdtw = get_rank(index)

Top Candidates: Int64Index([447, 481, 125, 399, 338, 476, 83, 243, 4, 487], dtype='int64')
Their Scores: [0.0013, 0.0013, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0017, 0.0017]


> Index *0*: Segmentation of Images

> Index *327*: Segmentation of Images

> ## Final Comments

In [132]:
print(retw, rctw, rdtw, sep="\n")

Int64Index([438, 200, 463, 205, 294, 166, 304, 478, 71, 383], dtype='int64')
Int64Index([83, 471, 488, 125, 235, 481, 399, 461, 111, 338], dtype='int64')
Int64Index([447, 481, 125, 399, 338, 476, 83, 243, 4, 487], dtype='int64')


In [133]:
np.intersect1d(retw, rctw)

array([], dtype=int64)

In [134]:
np.intersect1d(retw, rdtw)

array([], dtype=int64)

In [135]:
np.intersect1d(rctw, rdtw)

array([ 83, 125, 338, 399, 481], dtype=int64)

**Cosine Distance** and **Dot Product Distance** share a great number of likely candidates

## Term Frequency Transformation

In [136]:
def bag_of_words(document: str, vocab: list, t: Callable) -> Sequence:
    bow = np.zeros(len(vocab))
    loc = dict(map(lambda pair: (pair[1], pair[0]), enumerate(vocab)))
    
    for word in nltk.word_tokenize(document):
        bow[loc[word.lower()]]+=1
    
    return t(bow)

In [137]:
def get_similarity_mat(N: int, dist: Callable, t: Callable):
    bows = [ bag_of_words(get(i), vocab, t) for i in range(N) ]
    
    sim_mat = np.zeros((N,N))
    for i in range(N):
        for j in range(i, N):
            sim_mat[i,j] = dist(bows[i], bows[j])
            sim_mat[j,i] = sim_mat[i,j]
    display(pd.DataFrame(data = sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

In [138]:
t1 = lambda arr: np.log2(1+arr)
t2 = lambda arr: np.log2(1 + t1(arr))
t3 = lambda arr, k: (k+1)*arr/(arr+k)

#### Comparing effect of Different Frequency Transformers

Given the fact that the book referenced vector dot product, that measure will be the one used for this step

In [139]:
dot_dist = lambda x, y: 1/np.dot(x,y)

### First Transform

In [140]:
mat = get_similarity_mat(500, dot_dist, t1)

Unnamed: 0,0,1,2,3,4,5
0,0.00609,0.011734,0.010949,0.012623,0.009014,0.013929
1,0.011734,0.003546,0.008557,0.010351,0.007956,0.01235
2,0.010949,0.008557,0.003336,0.006837,0.006969,0.009425
3,0.012623,0.010351,0.006837,0.004016,0.00751,0.009618
4,0.009014,0.007956,0.006969,0.00751,0.00287,0.008678
5,0.013929,0.01235,0.009425,0.009618,0.008678,0.005565


In [143]:
index = 440

In [144]:
get_stats(index)

Min. Dist. Index = 49
Min. Dist. = 0.0072254638212528545

The ability of neural networks to continuously learn and adapt to new tasks
while retaining prior knowledge is crucial for many applications. However,
current neural networks tend to forget previously learned tasks when trained on
new ones, i.e., they suffer from Catastrophic Forgetting (CF). The objective of
Continual Learning (CL) is to alleviate this problem, which is particularly
relevant for medical applications, where it may not be feasible to store and
access previously used sensitive patient data. In this work, we propose a
Continual Learning approach for brain segmentation, where a single network is
consecutively trained on samples from different domains. We build upon an
importance driven approach and adapt it for medical image segmentation.
Particularly, we introduce learning rate regularization to prevent the loss of
the network's knowledge. Our results demonstrate that directly restricting the
adaptation of importan

In [145]:
rftt1 = get_rank(index)

Top Candidates: Int64Index([191, 14, 434, 222, 4, 425, 477, 78, 41, 432], dtype='int64')
Their Scores: [0.0074, 0.0075, 0.0075, 0.0077, 0.0078, 0.0078, 0.0078, 0.0079, 0.008, 0.008]


> Index *0*: Image Segmentation

> Index *440*: Both documents talk about adaptative models, the general topic seems to be highly related

### Second Transform

In [146]:
mat = get_similarity_mat(500, dot_dist, t2)

Unnamed: 0,0,1,2,3,4,5
0,0.008158,0.020499,0.018565,0.022146,0.015655,0.022797
1,0.020499,0.005135,0.016202,0.01949,0.015514,0.023296
2,0.018565,0.016202,0.004714,0.011577,0.012807,0.015909
3,0.022146,0.01949,0.011577,0.005561,0.014134,0.015484
4,0.015655,0.015514,0.012807,0.014134,0.004192,0.015173
5,0.022797,0.023296,0.015909,0.015484,0.015173,0.007332


In [150]:
index = 440

In [151]:
get_stats(index)

Min. Dist. Index = 191
Min. Dist. = 0.01179637827596876

The ability of neural networks to continuously learn and adapt to new tasks
while retaining prior knowledge is crucial for many applications. However,
current neural networks tend to forget previously learned tasks when trained on
new ones, i.e., they suffer from Catastrophic Forgetting (CF). The objective of
Continual Learning (CL) is to alleviate this problem, which is particularly
relevant for medical applications, where it may not be feasible to store and
access previously used sensitive patient data. In this work, we propose a
Continual Learning approach for brain segmentation, where a single network is
consecutively trained on samples from different domains. We build upon an
importance driven approach and adapt it for medical image segmentation.
Particularly, we introduce learning rate regularization to prevent the loss of
the network's knowledge. Our results demonstrate that directly restricting the
adaptation of important

In [152]:
rftt2 = get_rank(index)

Top Candidates: Int64Index([49, 14, 95, 222, 434, 41, 425, 477, 75, 78], dtype='int64')
Their Scores: [0.0119, 0.0121, 0.0126, 0.0127, 0.0128, 0.0128, 0.0129, 0.013, 0.0131, 0.0132]


### Third Transform

In [161]:
k=2
t3_k = lambda arr: t3(arr, k)

In [162]:
mat = get_similarity_mat(500, dot_dist, t3_k)

Unnamed: 0,0,1,2,3,4,5
0,0.007211,0.016773,0.015164,0.01797,0.012984,0.018714
1,0.016773,0.004526,0.012843,0.015694,0.012707,0.01862
2,0.015164,0.012843,0.0041,0.009474,0.010465,0.01295
3,0.01797,0.015694,0.009474,0.004888,0.011487,0.012884
4,0.012984,0.012707,0.010465,0.011487,0.003691,0.012606
5,0.018714,0.01862,0.01295,0.012884,0.012606,0.006537


In [163]:
index = 440

In [164]:
get_stats(index)

Min. Dist. Index = 191
Min. Dist. = 0.009746947512371186

The ability of neural networks to continuously learn and adapt to new tasks
while retaining prior knowledge is crucial for many applications. However,
current neural networks tend to forget previously learned tasks when trained on
new ones, i.e., they suffer from Catastrophic Forgetting (CF). The objective of
Continual Learning (CL) is to alleviate this problem, which is particularly
relevant for medical applications, where it may not be feasible to store and
access previously used sensitive patient data. In this work, we propose a
Continual Learning approach for brain segmentation, where a single network is
consecutively trained on samples from different domains. We build upon an
importance driven approach and adapt it for medical image segmentation.
Particularly, we introduce learning rate regularization to prevent the loss of
the network's knowledge. Our results demonstrate that directly restricting the
adaptation of importan

In [165]:
rftt3 = get_rank(index)

Top Candidates: Int64Index([49, 14, 434, 95, 222, 425, 78, 477, 41, 4], dtype='int64')
Their Scores: [0.0098, 0.01, 0.0103, 0.0104, 0.0104, 0.0104, 0.0106, 0.0106, 0.0106, 0.0109]


> ## Final Comments

In [166]:
print(rftt1, rftt2, rftt3, sep="\n")

Int64Index([191, 14, 434, 222, 4, 425, 477, 78, 41, 432], dtype='int64')
Int64Index([49, 14, 95, 222, 434, 41, 425, 477, 75, 78], dtype='int64')
Int64Index([49, 14, 434, 95, 222, 425, 78, 477, 41, 4], dtype='int64')


In [167]:
np.intersect1d(rftt1,rftt2)

array([ 14,  41,  78, 222, 425, 434, 477], dtype=int64)

In [168]:
np.intersect1d(rftt1,rftt3)

array([  4,  14,  41,  78, 222, 425, 434, 477], dtype=int64)

In [169]:
np.intersect1d(rftt2, rftt3)

array([ 14,  41,  49,  78,  95, 222, 425, 434, 477], dtype=int64)

> Using the same distance metric with the different frequency transformation gave the same results

## Inverse Term Frequency Transformation

In [194]:
def bag_of_words(document: str, doc_freq: list, vocab: list, t: Callable) -> Sequence:
    bow = np.zeros(len(vocab))
    loc = dict(map(lambda pair: (pair[1], pair[0]), enumerate(vocab)))
    
    for word in nltk.word_tokenize(document):
        bow[loc[word.lower()]]+=1
    
    bow = t(bow)
    
    for i, v in enumerate(bow):
        if v!=0:
            bow[i] = np.log((doc_freq[i] + 1) / bow[i])
    return bow

In [186]:
def get_similarity_mat(N: int, dist: Callable, t: Callable):
    bows = [ bag_of_words(get(i), doc_freq, vocab, t) for i in range(N) ]
    
    sim_mat = np.zeros((N,N))
    for i in range(N):
        for j in range(i, N):
            sim_mat[i,j] = dist(bows[i], bows[j])
            sim_mat[j,i] = sim_mat[i,j]
    display(pd.DataFrame(data = sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

In [172]:
t1 = lambda arr: np.log2(1+arr)
t2 = lambda arr: np.log2(1 + t1(arr))
t3 = lambda arr, k: (k+1)*arr/(arr+k)

In [178]:
def get_vocabulary_and_frequency(all_docs)->set:
    vocab = dict()
    for doc in all_docs:
        current_words = dict()
        for word in nltk.word_tokenize(doc):
            try:
                current_words[word.lower()]
            except:
                vocab[ word.lower() ] = vocab.get(word.lower(),  0) + 1
                current_words[ word.lower() ] = 1
    return vocab.keys(), vocab.values()

In [198]:
vocab, doc_freq = list(map(list, get_vocabulary_and_frequency(df["summaries"])))

### First Transform

In [200]:
mat = get_similarity_mat(500, dot_dist, t1)

Unnamed: 0,0,1,2,3,4,5
0,0.000681,0.001682,0.00141,0.001651,0.00117,0.001588
1,0.001682,0.000724,0.001522,0.001649,0.001351,0.001895
2,0.00141,0.001522,0.000453,0.000982,0.001021,0.001156
3,0.001651,0.001649,0.000982,0.000514,0.001172,0.00108
4,0.00117,0.001351,0.001021,0.001172,0.000437,0.001154
5,0.001588,0.001895,0.001156,0.00108,0.001154,0.000552


In [204]:
index = 225

In [205]:
get_stats(index)

Min. Dist. Index = 41
Min. Dist. = 0.0011564612145292582

We generalize a graph-based multiclass semi-supervised classification
technique based on diffuse interface methods to multilayer graphs. Besides the
treatment of various applications with an inherent multilayer structure, we
present a very flexible approach that interprets high-dimensional data in a
low-dimensional multilayer graph representation. Highly efficient numerical
methods involving the spectral decomposition of the corresponding differential
graph operators as well as fast matrix-vector products based on the
nonequispaced fast Fourier transform (NFFT) enable the rapid treatment of large
and high-dimensional data sets. We perform various numerical tests putting a
special focus on image segmentation. In particular, we test the performance of
our method on data sets with up to 10 million nodes per layer as well as up to
104 dimensions resulting in graphs with up to 52 layers. While all presented
numerical experiments can 

In [206]:
riftt1 = get_rank(index)

Top Candidates: Int64Index([321, 314, 300, 389, 476, 141, 355, 113, 374, 236], dtype='int64')
Their Scores: [0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012]


## Second Transform