In [179]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk

from scipy.spatial.distance import minkowski, cosine
from IPython.display import display

In [125]:
df = pd.read_csv("arxiv_data.csv", nrows=500)

In [126]:
df.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


# Bag of Words

In [None]:
def get(index):
    return df.loc[index, "summaries"]

In [None]:
def get_vocabulary(all_docs)->set:
    vocab = set()
    for doc in all_docs:
        for word in nltk.word_tokenize(doc):
            vocab|={word.lower()}
    return vocab

In [167]:
vocab = list(get_vocabulary(df["summaries"]))
len(vocab)

7684

In [166]:
def bag_of_words(document: str, vocab: list):
    bow = np.zeros(len(vocab))
    loc = dict(map(lambda pair: (pair[1], pair[0]), enumerate(vocab)))
    
    for word in nltk.word_tokenize(document):
        bow[loc[word.lower()]]=1
        
    return bow

In [180]:
def get_similarity_mat(N, dist):
    bows = [ bag_of_words(get(i), vocab) for i in range(N) ]
    
    sim_mat = np.zeros((n,n))
    for i in range(n):
        for j in range(i, n):
            sim_mat[i,j] = dist(bows[i], bows[j])
            sim_mat[j,i] = sim_mat[i,j]
    display(pd.DataFrame(data = sim_mat, columns=range(n), index=range(n)).loc[0:5, 0:5])
    return sim_mat

In [181]:
mat = get_similarity_mat(500, minkowski)

Unnamed: 0,0,1,2,3,4,5
0,0.0,13.266499,13.304135,13.0,13.638182,11.958261
1,13.266499,0.0,15.132746,14.73092,15.556349,14.177447
2,13.304135,15.132746,0.0,13.490738,15.264338,13.490738
3,13.0,14.73092,13.490738,0.0,15.0,12.489996
4,13.638182,15.556349,15.264338,15.0,0.0,14.035669
5,11.958261,14.177447,13.490738,12.489996,14.035669,0.0


In [182]:
get(0)

'Stereo matching is one of the widely used techniques for inferring depth from\nstereo images owing to its robustness and speed. It has become one of the major\ntopics of research since it finds its applications in autonomous driving,\nrobotic navigation, 3D reconstruction, and many other fields. Finding pixel\ncorrespondences in non-textured, occluded and reflective areas is the major\nchallenge in stereo matching. Recent developments have shown that semantic cues\nfrom image segmentation can be used to improve the results of stereo matching.\nMany deep neural network architectures have been proposed to leverage the\nadvantages of semantic segmentation in stereo matching. This paper aims to give\na comparison among the state of art networks both in terms of accuracy and in\nterms of speed which are of higher importance in real-time applications.'

In [184]:
most_sim = mat[0,1:].argmin()
most_sim

146

In [185]:
mat[0, most_sim]

12.68857754044952

In [186]:
get(most_sim)

'We present a Neural Network based Handwritten Text Recognition (HTR) model\narchitecture that can be trained to recognize full pages of handwritten or\nprinted text without image segmentation. Being based on Image to Sequence\narchitecture, it can extract text present in an image and then sequence it\ncorrectly without imposing any constraints regarding orientation, layout and\nsize of text and non-text. Further, it can also be trained to generate\nauxiliary markup related to formatting, layout and content. We use character\nlevel vocabulary, thereby enabling language and terminology of any subject. The\nmodel achieves a new state-of-art in paragraph level recognition on the IAM\ndataset. When evaluated on scans of real world handwritten free form test\nanswers - beset with curved and slanted lines, drawings, tables, math,\nchemistry and other symbols - it performs better than all commercially\navailable HTR cloud APIs. It is deployed in production as part of a commercial\nweb applica

## Term Weighting

In [187]:
def bag_of_words(document: str, vocab: list):
    bow = np.zeros(len(vocab))
    loc = dict(map(lambda pair: (pair[1], pair[0]), enumerate(vocab)))
    
    for word in nltk.word_tokenize(document):
        bow[loc[word.lower()]]+=1
        
    return bow

In [188]:
def get_similarity_mat(N, dist):
    bows = [ bag_of_words(get(i), vocab) for i in range(N) ]
    
    sim_mat = np.zeros((n,n))
    for i in range(n):
        for j in range(i, n):
            sim_mat[i,j] = dist(bows[i], bows[j])
            sim_mat[j,i] = sim_mat[i,j]
    display(pd.DataFrame(data = sim_mat, columns=range(n), index=range(n)).loc[0:5, 0:5])
    return sim_mat

In [189]:
mat = get_similarity_mat(500, minkowski)  

Unnamed: 0,0,1,2,3,4,5
0,0.0,31.76476,25.806976,23.811762,33.970576,18.894444
1,31.76476,0.0,33.151169,33.196385,32.32646,29.832868
2,25.806976,33.151169,0.0,20.712315,30.659419,23.811762
3,23.811762,33.196385,20.712315,0.0,28.930952,22.226111
4,33.970576,32.32646,30.659419,28.930952,0.0,31.921779
5,18.894444,29.832868,23.811762,22.226111,31.921779,0.0


In [190]:
get(0)

'Stereo matching is one of the widely used techniques for inferring depth from\nstereo images owing to its robustness and speed. It has become one of the major\ntopics of research since it finds its applications in autonomous driving,\nrobotic navigation, 3D reconstruction, and many other fields. Finding pixel\ncorrespondences in non-textured, occluded and reflective areas is the major\nchallenge in stereo matching. Recent developments have shown that semantic cues\nfrom image segmentation can be used to improve the results of stereo matching.\nMany deep neural network architectures have been proposed to leverage the\nadvantages of semantic segmentation in stereo matching. This paper aims to give\na comparison among the state of art networks both in terms of accuracy and in\nterms of speed which are of higher importance in real-time applications.'

In [191]:
most_sim = mat[0,1:].argmin()
most_sim

262

In [192]:
mat[0, most_sim]

21.18962010041709

In [193]:
get(most_sim)

'Image segmentation aims at identifying regions of interest within an image,\nby grouping pixels according to their properties. This task resembles the\nstatistical one of clustering, yet many standard clustering methods fail to\nmeet the basic requirements of image segmentation: segment shapes are often\nbiased toward predetermined shapes and their number is rarely determined\nautomatically. Nonparametric clustering is, in principle, free from these\nlimitations and turns out to be particularly suitable for the task of image\nsegmentation. This is also witnessed by several operational analogies, as, for\ninstance, the resort to topological data analysis and spatial tessellation in\nboth the frameworks. We discuss the application of nonparametric clustering to\nimage segmentation and provide an algorithm specific for this task. Pixel\nsimilarity is evaluated in terms of density of the color representation and the\nadjacency structure of the pixels is exploited to introduce a simple, ye

## Term Frequency Transformation