In [1]:
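# Optional one-time setup: uncomment and run once to download the NLTK data
# (nothing in this notebook uses nltk directly; presumably needed by src.utility — confirm).
# import nltk
# nltk.download('all')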

In [2]:
from src.utility import Utility as util
import os
import tqdm.notebook 

In [3]:
def preprocess(texts):
    """
    Merge raw PDF text fragments into passages and return them as a list.

    Per the original notes: an empty-string element stands for a blank line
    in the PDF, the elements leading up to such a separator are treated as
    pieces of a single passage and joined together, and every number is
    replaced with '0'.

    Args:
        texts: iterable of text fragments (e.g. the output of
            util.load_pdf_texts).

    Returns:
        list of merged passages, excluding those rejected by
        util.is_invalid_text(..., ch_lowerlimit=3, word_lowerlimit=3).
    """
    passages = []
    # True while the previous fragment was valid, i.e. the current passage
    # is still open and the next fragment must be merged into it.
    extend_last = False
    for fragment in texts:
        if not extend_last:
            # First fragment, or the previous one was a separator: open a new passage.
            passages.append(util.convert_number_to_word(fragment, '0'))
        else:
            normalized = util.convert_number_to_word(fragment, word='0')
            passages[-1] = util.connect_texts(passages[-1], normalized)
        extend_last = not util.is_invalid_text(fragment)

    # Drop passages that fail the stricter validity check (the two limits are
    # presumably minimum character / word counts — confirm in src.utility).
    return [
        passage
        for passage in passages
        if not util.is_invalid_text(passage, ch_lowerlimit=3, word_lowerlimit=3)
    ]

In [9]:
# Corpus-building settings; both are passed straight through to util.get_pdflist.
# NOTE(review): LIMIT = 0 presumably means "no limit" (the recorded run
# processed 227 files) — confirm in src.utility.
LIMIT = 0
RECURSIVE = True

# corpus[i] holds the preprocessed text of the PDF whose file name is filenames[i].
corpus, filenames = [], []
pdf_paths = util.get_pdflist('data', limit=LIMIT, recursive=RECURSIVE)
for pdf_path in tqdm.notebook.tqdm(pdf_paths):
    passages = preprocess(util.load_pdf_texts(pdf_path))
    # Collapse each document into a single space-separated string.
    text = ' '.join(passages)
    corpus.append(text)
    filenames.append(os.path.basename(pdf_path))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))




In [10]:
# Spot-check: show the preprocessed text of the second document (index 1).
print(corpus[1])

Designing Network Design Spaces Ilija Radosavovic Raj Prateek Kosaraju Ross Girshick Kaiming He Facebook AI Research (FAIR) In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting ﬁndings that do not match the current practice of n

In [11]:
import sentence_transformers
import numpy as np
# Load the pretrained 'paraphrase-distilroberta-base-v1' sentence-embedding model.
model = sentence_transformers.SentenceTransformer('paraphrase-distilroberta-base-v1')
# Embed every document; later cells index the result in parallel with `filenames`.
# NOTE(review): each corpus entry is a whole paper's text, and the model
# presumably truncates inputs beyond its maximum sequence length, so an
# embedding may only reflect the beginning of each document — confirm.
sentence_embeddings = model.encode(corpus)

In [12]:
def cossim(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as dot(v1, v2) / (||v1|| * ||v2||). Intended for 1-D,
    equal-length array-likes (e.g. two rows of an embedding matrix).

    Args:
        v1: first vector (array-like).
        v2: second vector (array-like).

    Returns:
        The cosine similarity as a scalar. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of letting 0/0
        produce NaN and a RuntimeWarning.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # At least one vector is all zeros: no direction, so report "no similarity".
        return 0.0
    return np.dot(v1, v2) / denominator

In [14]:
# Report each document's cosine similarity to the first document (index 0),
# one "<file name> : <similarity>" line per document.
for i, embedding in enumerate(sentence_embeddings):
    similarity = cossim(sentence_embeddings[0], embedding)
    print("{} : {:.3f}".format(filenames[i], similarity))

2003.08237.pdf : 1.000
2003.13678.pdf : 0.326
2101.00004.pdf : 0.291
2101.00051.pdf : 0.303
2101.00065.pdf : 0.210
2101.00304.pdf : 0.328
2101.00405.pdf : 0.250
2101.00500.pdf : 0.330
2101.00523.pdf : 0.199
2101.00570.pdf : 0.258
2101.00613.pdf : 0.317
2101.00650.pdf : 0.323
2101.00688.pdf : 0.204
2101.00758.pdf : 0.308
2101.00819.pdf : 0.235
2101.00823.pdf : 0.185
2101.00950.pdf : 0.280
2101.01234.pdf : 0.216
2101.01267.pdf : 0.305
2101.01287.pdf : 0.215
2101.01389.pdf : 0.210
2101.01462.pdf : 0.050
2101.01538.pdf : 0.139
2101.01548.pdf : 0.183
2101.01557.pdf : 0.201
2101.01564.pdf : 0.354
2101.01607.pdf : 0.291
2104.06648.pdf : 0.400
2104.06652.pdf : 0.323
2104.06655.pdf : 0.333
2104.06666.pdf : 0.343
2104.06667.pdf : 0.203
2104.06670.pdf : 0.383
2104.06677.pdf : 0.289
2104.06683.pdf : 0.397
2104.06685.pdf : 0.361
2104.06687.pdf : 0.344
2104.06700.pdf : 0.356
2104.06703.pdf : 0.380
2104.06718.pdf : 0.391
2104.06719.pdf : 0.503
2104.06722.pdf : 0.313
2104.06735.pdf : 0.335
2104.06744.