In [104]:
import pandas as pd

In [105]:
from PyPDF2 import PdfReader

In [109]:
import glob

In [131]:
# Read every file under data_source/ and keep its full text, keyed by file path.
pdf_dictionary = {}
for path in glob.glob("data_source/*"):
    # Extract the text of each page, then join the pages with a single space.
    page_texts = [page.extract_text() for page in PdfReader(path).pages]
    pdf_dictionary[path] = " ".join(page_texts)

In [132]:
# Two-column table: one row per loaded file, holding its path and its extracted text.
df = pd.DataFrame(pdf_dictionary.items(), columns=["path", "text"])

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [134]:
# Fit a TF-IDF model on the extracted texts; X is the resulting sparse matrix
# with one row per document.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

In [135]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X

<12x21832 sparse matrix of type '<class 'numpy.float64'>'
	with 34526 stored elements in Compressed Sparse Row format>

In [136]:
# Inspect the vocabulary the vectorizer learned from the documents.
vectorizer.get_feature_names_out()

array(['00', '000', '00001', ..., '𝜎𝜇and𝛼𝜇smearing', '𝜒0', '𝜙º'],
      dtype=object)

In [137]:
# Dense 1-D TF-IDF vector of the first document. Only that row is densified,
# instead of converting the whole sparse matrix just to read one row.
v1 = X[0].toarray().ravel()

In [138]:
# Dense 1-D TF-IDF vector of the second document. Only that row is densified,
# instead of converting the whole sparse matrix just to read one row.
v2 = X[1].toarray().ravel()

In [139]:
# Display both document vectors side by side.
v1,v2

(array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]))

In [140]:
import numpy as np

In [141]:
# Dot product of the two document vectors (numerator of the cosine similarity).
np.dot(v1, v2)

0.5707229231562305

In [142]:
# Euclidean length of v1; the recorded output is 1 up to floating-point rounding.
np.linalg.norm(v1)

0.9999999999999997

In [143]:
# Euclidean length of v2; the recorded output is 1 up to floating-point rounding.
np.linalg.norm(v2)

0.9999999999999996

In [144]:
# Cosine similarity by hand: dot product divided by the product of both norms.
np.dot(v1, v2)/np.linalg.norm(v1)/np.linalg.norm(v2)

0.570722923156231

In [145]:
# Show the path/text table to see which row index belongs to which file.
df

Unnamed: 0,path,text
0,data_source/waves_quantum.pdf,Chapter 10\nIntroduction to quantum\nmechanics...
1,data_source/2301.00029.pdf,arXiv:2301.00029v2 [math-ph] 26 Jan 2023Gene...
2,data_source/qmech.pdf,Quantum Mechanics\nRichard Fitzpatrick\nProfes...
3,data_source/2201.00019.pdf,To appear in JHEP\nFACET: A new long-lived par...
4,data_source/2301.00292.pdf,Inference for Large Panel Data with Many Covar...
5,data_source/2301.00085.pdf,arXiv:2301.00085v1 [math.CO] 31 Dec 2022On t...
6,data_source/2301.00091.pdf,\n \n \n \n \nWealth Redistribution and Mu...
7,data_source/qm_papers.pdf,RESEARCH\nON\nTEACHING AND LEARNING\nQUANTUM M...
8,data_source/2301.01362.pdf,Measuring tail risk at high-frequency: An L1-r...
9,data_source/2201.00738.pdf,Single Phonon Detection for Dark Matter via Qu...


In [146]:
# Version 1: cosine similarity of every document pair via one matrix product.

X_dense = X.toarray()
X_norms = np.linalg.norm(X_dense, axis=1)
# An all-zero row (a document whose extracted text produced no terms) has norm 0.
# Divide such rows by 1 instead, so they score 0 against everything rather than
# NaN -- the same result sklearn's cosine_similarity gives for zero vectors.
X_norms[X_norms == 0] = 1.0
# Broadcast the per-row norm across the columns to scale every row to unit length.
X_dense_normalized = X_dense / X_norms[:, np.newaxis]
# Rows are unit vectors, so the Gram matrix holds the pairwise cosine similarities.
pd.DataFrame(X_dense_normalized @ X_dense_normalized.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.570723,0.81667,0.703458,0.794276,0.474494,0.630163,0.765888,0.560353,0.708646,0.130414,0.632214
1,0.570723,1.0,0.548284,0.498114,0.548574,0.348831,0.458839,0.545071,0.390486,0.508908,0.092659,0.465749
2,0.81667,0.548284,1.0,0.6595,0.730661,0.466742,0.595915,0.728439,0.516973,0.67553,0.123459,0.611672
3,0.703458,0.498114,0.6595,1.0,0.673633,0.400403,0.585819,0.673409,0.492786,0.69762,0.114596,0.660313
4,0.794276,0.548574,0.730661,0.673633,1.0,0.468289,0.627758,0.72494,0.582817,0.670213,0.127968,0.631379
5,0.474494,0.348831,0.466742,0.400403,0.468289,1.0,0.368941,0.432368,0.344783,0.409764,0.072138,0.378541
6,0.630163,0.458839,0.595915,0.585819,0.627758,0.368941,1.0,0.61782,0.476364,0.58417,0.106001,0.555573
7,0.765888,0.545071,0.728439,0.673409,0.72494,0.432368,0.61782,1.0,0.521763,0.703929,0.120892,0.648814
8,0.560353,0.390486,0.516973,0.492786,0.582817,0.344783,0.476364,0.521763,1.0,0.489337,0.089798,0.471372
9,0.708646,0.508908,0.67553,0.69762,0.670213,0.409764,0.58417,0.703929,0.489337,1.0,0.111769,0.622699


In [147]:
# Version 2: the same similarity matrix, computed one pair at a time.

# Compute each row's norm once up front rather than inside the double loop.
# A zero norm (all-zero row) is replaced by 1 so that row scores 0 against
# everything instead of producing NaN.
row_norms = [np.linalg.norm(row) for row in X_dense]
row_norms = [norm if norm > 0 else 1.0 for norm in row_norms]

result = []
for i, row_a in enumerate(X_dense):
    for j, row_b in enumerate(X_dense):
        similarity = np.dot(row_a, row_b) / row_norms[i] / row_norms[j]
        result.append([i, j, similarity])
# Reshape the long [i, j, similarity] records into a square table
# (columns = first index, rows = second index).
pd.DataFrame(result).pivot(columns=0, index=1, values=2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1.0,0.570723,0.81667,0.703458,0.794276,0.474494,0.630163,0.765888,0.560353,0.708646,0.130414,0.632214
1,0.570723,1.0,0.548284,0.498114,0.548574,0.348831,0.458839,0.545071,0.390486,0.508908,0.092659,0.465749
2,0.81667,0.548284,1.0,0.6595,0.730661,0.466742,0.595915,0.728439,0.516973,0.67553,0.123459,0.611672
3,0.703458,0.498114,0.6595,1.0,0.673633,0.400403,0.585819,0.673409,0.492786,0.69762,0.114596,0.660313
4,0.794276,0.548574,0.730661,0.673633,1.0,0.468289,0.627758,0.72494,0.582817,0.670213,0.127968,0.631379
5,0.474494,0.348831,0.466742,0.400403,0.468289,1.0,0.368941,0.432368,0.344783,0.409764,0.072138,0.378541
6,0.630163,0.458839,0.595915,0.585819,0.627758,0.368941,1.0,0.61782,0.476364,0.58417,0.106001,0.555573
7,0.765888,0.545071,0.728439,0.673409,0.72494,0.432368,0.61782,1.0,0.521763,0.703929,0.120892,0.648814
8,0.560353,0.390486,0.516973,0.492786,0.582817,0.344783,0.476364,0.521763,1.0,0.489337,0.089798,0.471372
9,0.708646,0.508908,0.67553,0.69762,0.670213,0.409764,0.58417,0.703929,0.489337,1.0,0.111769,0.622699


In [162]:
# Version 3: let scikit-learn compute the pairwise cosine similarities of the rows of X.

from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(X)

In [149]:
# Hand-written sketch of a result structure: document index -> list of
# (other document index, similarity) pairs. The numbers look illustrative only;
# they do not match the similarity matrix computed in this notebook.
{
    0: [
            (3, 0.7837), 
            (5, 0.4293), 
            (6, 0.1293), 
        ],
    1: [
            (2, 0.7837), 
            (4, 0.4293), 
            (6, 0.1293), 
        ],
}

{0: [(3, 0.7837), (5, 0.4293), (6, 0.1293)],
 1: [(2, 0.7837), (4, 0.4293), (6, 0.1293)]}

In [206]:
# For every document, pick the indices of its three most similar *other* documents.
# The diagonal (self-similarity) is masked out explicitly instead of being assumed
# to sort last: rounding on the diagonal, an exact duplicate document, or an
# all-zero vector (self-similarity 0) would otherwise put a document in its own
# neighbour list and drop a real neighbour. Columns stay in ascending similarity.
TOP_K = 3
top_k = max(0, min(TOP_K, similarities.shape[0] - 1))  # with fewer than 4 documents, take what exists
masked = similarities.copy()  # leave `similarities` itself untouched
np.fill_diagonal(masked, -np.inf)
ind = masked.argsort(axis=1)[:, masked.shape[1] - top_k:]

In [207]:
# Show the selected neighbour indices, one row per document.
ind

array([[ 7,  4,  2],
       [ 2,  4,  0],
       [ 7,  4,  0],
       [ 4,  9,  0],
       [ 7,  2,  0],
       [ 2,  4,  0],
       [ 7,  4,  0],
       [ 4,  2,  0],
       [ 7,  0,  4],
       [ 3,  7,  0],
       [ 4,  0, 11],
       [ 0,  7,  3]])

In [209]:
# Row-wise gather: for each document, read the similarity score stored at each
# of its selected neighbour indices.
values = np.take_along_axis(similarities, ind, axis=1)
values

array([[0.76588805, 0.79427585, 0.8166697 ],
       [0.54828418, 0.54857389, 0.57072292],
       [0.72843884, 0.73066103, 0.8166697 ],
       [0.67363333, 0.69762035, 0.70345838],
       [0.7249397 , 0.73066103, 0.79427585],
       [0.46674159, 0.46828925, 0.47449387],
       [0.61781998, 0.6277577 , 0.63016345],
       [0.7249397 , 0.72843884, 0.76588805],
       [0.52176269, 0.56035325, 0.58281677],
       [0.69762035, 0.7039291 , 0.70864648],
       [0.12796832, 0.13041356, 0.30914935],
       [0.63221411, 0.64881354, 0.66031337]])

In [213]:
# Pair every neighbour index with its score along a new last axis.
# Note: the integer indices come out as floats in the stacked array (see output).
np.dstack([ind, values])

array([[[ 7.        ,  0.76588805],
        [ 4.        ,  0.79427585],
        [ 2.        ,  0.8166697 ]],

       [[ 2.        ,  0.54828418],
        [ 4.        ,  0.54857389],
        [ 0.        ,  0.57072292]],

       [[ 7.        ,  0.72843884],
        [ 4.        ,  0.73066103],
        [ 0.        ,  0.8166697 ]],

       [[ 4.        ,  0.67363333],
        [ 9.        ,  0.69762035],
        [ 0.        ,  0.70345838]],

       [[ 7.        ,  0.7249397 ],
        [ 2.        ,  0.73066103],
        [ 0.        ,  0.79427585]],

       [[ 2.        ,  0.46674159],
        [ 4.        ,  0.46828925],
        [ 0.        ,  0.47449387]],

       [[ 7.        ,  0.61781998],
        [ 4.        ,  0.6277577 ],
        [ 0.        ,  0.63016345]],

       [[ 4.        ,  0.7249397 ],
        [ 2.        ,  0.72843884],
        [ 0.        ,  0.76588805]],

       [[ 7.        ,  0.52176269],
        [ 0.        ,  0.56035325],
        [ 4.        ,  0.58281677]],

       [[ 

In [None]:
# vstack needs a sequence of arrays (calling it with no argument raises TypeError).
# Stacks the inputs as rows -> shape (2, 3).
np.vstack([[1, 2, 3], [3, 4, 5]])

In [None]:
# hstack needs a sequence of arrays (calling it with no argument raises TypeError).
# Concatenates 1-D inputs end to end -> shape (6,).
np.hstack([[1, 2, 3], [3, 4, 5]])

In [None]:
# dstack needs a sequence of arrays (calling it with no argument raises TypeError).
# Pairs the inputs element by element along a new last axis -> shape (1, 3, 2).
np.dstack([[1, 2, 3], [3, 4, 5]])

In [None]:
# Walk two sequences in lockstep; zip yields one (a, b) pair per position.
# The loop previously had no body, which is a SyntaxError.
for a, b in zip([1, 2, 3], [3, 4, 5]):
    print(a, b)

In [203]:
# Render the full similarity matrix as a table for visual inspection.
pd.DataFrame(similarities)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.570723,0.81667,0.703458,0.794276,0.474494,0.630163,0.765888,0.560353,0.708646,0.130414,0.632214
1,0.570723,1.0,0.548284,0.498114,0.548574,0.348831,0.458839,0.545071,0.390486,0.508908,0.092659,0.465749
2,0.81667,0.548284,1.0,0.6595,0.730661,0.466742,0.595915,0.728439,0.516973,0.67553,0.123459,0.611672
3,0.703458,0.498114,0.6595,1.0,0.673633,0.400403,0.585819,0.673409,0.492786,0.69762,0.114596,0.660313
4,0.794276,0.548574,0.730661,0.673633,1.0,0.468289,0.627758,0.72494,0.582817,0.670213,0.127968,0.631379
5,0.474494,0.348831,0.466742,0.400403,0.468289,1.0,0.368941,0.432368,0.344783,0.409764,0.072138,0.378541
6,0.630163,0.458839,0.595915,0.585819,0.627758,0.368941,1.0,0.61782,0.476364,0.58417,0.106001,0.555573
7,0.765888,0.545071,0.728439,0.673409,0.72494,0.432368,0.61782,1.0,0.521763,0.703929,0.120892,0.648814
8,0.560353,0.390486,0.516973,0.492786,0.582817,0.344783,0.476364,0.521763,1.0,0.489337,0.089798,0.471372
9,0.708646,0.508908,0.67553,0.69762,0.670213,0.409764,0.58417,0.703929,0.489337,1.0,0.111769,0.622699


In [170]:
# Row-wise gather: similarity score at each selected neighbour index.
values = np.take_along_axis(similarities, ind, axis=1)

In [172]:
# Pair every neighbour index with its score along a new last axis
# (the integer indices come out as floats in the stacked array).
np.dstack([ind, values])

array([[[ 7.        ,  0.76588805],
        [ 4.        ,  0.79427585],
        [ 2.        ,  0.8166697 ]],

       [[ 2.        ,  0.54828418],
        [ 4.        ,  0.54857389],
        [ 0.        ,  0.57072292]],

       [[ 7.        ,  0.72843884],
        [ 4.        ,  0.73066103],
        [ 0.        ,  0.8166697 ]],

       [[ 4.        ,  0.67363333],
        [ 9.        ,  0.69762035],
        [ 0.        ,  0.70345838]],

       [[ 7.        ,  0.7249397 ],
        [ 2.        ,  0.73066103],
        [ 0.        ,  0.79427585]],

       [[ 2.        ,  0.46674159],
        [ 4.        ,  0.46828925],
        [ 0.        ,  0.47449387]],

       [[ 7.        ,  0.61781998],
        [ 4.        ,  0.6277577 ],
        [ 0.        ,  0.63016345]],

       [[ 4.        ,  0.7249397 ],
        [ 2.        ,  0.72843884],
        [ 0.        ,  0.76588805]],

       [[ 7.        ,  0.52176269],
        [ 0.        ,  0.56035325],
        [ 4.        ,  0.58281677]],

       [[ 

In [152]:
# Show the path/text table again to map row indices back to file names.
df

Unnamed: 0,path,text
0,data_source/waves_quantum.pdf,Chapter 10\nIntroduction to quantum\nmechanics...
1,data_source/2301.00029.pdf,arXiv:2301.00029v2 [math-ph] 26 Jan 2023Gene...
2,data_source/qmech.pdf,Quantum Mechanics\nRichard Fitzpatrick\nProfes...
3,data_source/2201.00019.pdf,To appear in JHEP\nFACET: A new long-lived par...
4,data_source/2301.00292.pdf,Inference for Large Panel Data with Many Covar...
5,data_source/2301.00085.pdf,arXiv:2301.00085v1 [math.CO] 31 Dec 2022On t...
6,data_source/2301.00091.pdf,\n \n \n \n \nWealth Redistribution and Mu...
7,data_source/qm_papers.pdf,RESEARCH\nON\nTEACHING AND LEARNING\nQUANTUM M...
8,data_source/2301.01362.pdf,Measuring tail risk at high-frequency: An L1-r...
9,data_source/2201.00738.pdf,Single Phonon Detection for Dark Matter via Qu...
