In [1]:
import faiss
import sys
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist as scipy_cdist

sys.path.append('file_processing')

from file_processing.directory import Directory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data pre-processing
# Build one row per file found under ./tests/resources from the attributes of
# each file's processor object, keep the columns used for the similarity
# search, and retain only PDF/DOCX files with a usable amount of text.

# NOTE(review): `dir` shadows the built-in dir(); the name is kept here so any
# other cell that refers to it keeps working.
dir = Directory('./tests/resources')
data = [file.processor.__dict__ for file in dir.get_files()]

# Flatten nested dictionaries one level deep, joining key names with '_'
# (presumably how the 'metadata_text' column arises - confirm against the
# processor's attributes).
data = pd.json_normalize(data, max_level=1, sep='_')

# Select columns directly: a missing column raises a KeyError naming it,
# whereas DataFrame.get() would silently return None. The copy avoids
# writing into a view of `data` in the assignments below.
df = data[['size', 'extension', 'file_name', 'metadata_text', 'absolute_path']].copy()

# Trim surrounding whitespace, then remove newlines (literal match, not a regex).
df['metadata_text'] = df['metadata_text'].str.strip().str.replace('\n', '', regex=False)

# Each condition is parenthesised / built separately: `&` binds tighter than
# `>`, so an unparenthesised `a & s.str.len() > 10 & b` is evaluated as
# `(a & s.str.len()) > (10 & b)` rather than as three independent filters.
is_document = df['extension'].isin(['.pdf', '.docx'])
has_text = df['metadata_text'].notna() & (df['metadata_text'].str.len() > 10)
df = df[is_document & has_text]

# Renumber rows 0..n-1; the row labels are used as vector ids when indexing.
df = df.reset_index(drop=True)

Processing files:  62%|██████▏   | 71/115 [00:45<00:02, 14.87file/s]invalid pdf header: b'U\x85[\x9c\xc0'
EOF marker not found
Processing files: 100%|██████████| 115/115 [00:46<00:00,  2.49file/s]


In [3]:
# Encoding and indexing
# Embed every document's text and store the unit-length embeddings in an
# inner-product FAISS index keyed by the DataFrame's row labels.
encoder = SentenceTransformer("paraphrase-MiniLM-L3-v2")

# Pass a plain list of strings (the documented input type) instead of a
# pandas Series, so encoding does not depend on the Series' index labels.
vectors = encoder.encode(df['metadata_text'].tolist())

# The FAISS calls below operate on the array in place and need contiguous
# float32 data; make that explicit rather than relying on the encoder's
# default output type.
vectors = np.ascontiguousarray(vectors, dtype=np.float32)
vector_dimension = vectors.shape[1]

# Flat inner-product index wrapped in an ID map so that custom ids can be used.
index = faiss.IndexFlatIP(vector_dimension)
index = faiss.IndexIDMap(index)

# Normalise in place: on unit vectors the inner product is the cosine similarity.
faiss.normalize_L2(vectors)
index.add_with_ids(vectors, df.index.values.astype(np.int64))

  return self.fget.__get__(instance, owner)()


In [50]:
# Manual search by string
# Rank every indexed document against a free-text query.
search_text = 'test'
search_vector = encoder.encode(search_text)

# The index is searched with a 2-D batch of queries. Work on a copy so the
# in-place normalisation leaves `search_vector` itself unchanged.
_vector = search_vector[np.newaxis, :].copy()
faiss.normalize_L2(_vector)

# Request as many hits as there are indexed vectors, i.e. rank the whole index.
k = index.ntotal
distances, ann = index.search(_vector, k=k)

# One row per hit. 'distances' holds the scores returned by the index and
# 'proximity' the matching ids, which are joined to df's row labels.
results = pd.DataFrame({'distances': distances[0], 'proximity': ann[0]})
merge = results.merge(df, left_on='proximity', right_index=True)
merge

Unnamed: 0,distances,proximity,size,extension,file_name,metadata_text,absolute_path
0,0.133713,4,13999,.docx,SampleReport.docx,Sample Report23 March 2023Person NameAcme IncL...,C:\Users\BLUO\Downloads\report\tests\resources...
1,0.124697,3,73894,.pdf,HealthCanadaOverviewFromWikipedia.pdf,Health Canada Over view from Wikipedia 13 Apr...,C:\Users\BLUO\Downloads\report\tests\resources...
2,0.097444,1,221266,.pdf,ArtificialNeuralNetworksForBeginners.pdf,Artificial Neural Networks for Beginner sCar...,C:\Users\BLUO\Downloads\report\tests\resources...
3,0.097444,0,221266,.pdf,ArtificialNeuralNetworksForBeginners.pdf,Artificial Neural Networks for Beginner sCar...,C:\Users\BLUO\Downloads\report\tests\resources...
4,0.092941,5,105687,.pdf,SampleReport.pdf,Sampl e Report 23 March 2023 Person Name Ac...,C:\Users\BLUO\Downloads\report\tests\resources...
5,0.089802,2,11001,.docx,HealthCanadaOverviewFromWikipedia.docx,Health Canada Overview from Wikipedia13 April ...,C:\Users\BLUO\Downloads\report\tests\resources...


In [5]:
# Brute force search
# All-pairs cosine similarity between the document vectors, computed as
# 1 - cosine distance and rounded to two decimals.
pairwise_distance = scipy_cdist(vectors, vectors, 'cosine')
similarities = np.around(1 - pairwise_distance, decimals=2)

# Label rows and columns with file names, then order both axes alphabetically.
file_names = df.file_name.tolist()
sim_df = (
    pd.DataFrame(data=similarities, index=file_names, columns=file_names)
    .sort_index(axis=1)
    .sort_index(axis=0)
)

sim_df

Unnamed: 0,ArtificialNeuralNetworksForBeginners.pdf,ArtificialNeuralNetworksForBeginners.pdf.1,HealthCanadaOverviewFromWikipedia.docx,HealthCanadaOverviewFromWikipedia.pdf,SampleReport.docx,SampleReport.pdf
ArtificialNeuralNetworksForBeginners.pdf,1.0,1.0,-0.05,-0.01,0.27,0.34
ArtificialNeuralNetworksForBeginners.pdf,1.0,1.0,-0.05,-0.01,0.27,0.34
HealthCanadaOverviewFromWikipedia.docx,-0.05,-0.05,1.0,0.95,0.1,0.05
HealthCanadaOverviewFromWikipedia.pdf,-0.01,-0.01,0.95,1.0,0.08,0.06
SampleReport.docx,0.27,0.27,0.1,0.08,1.0,0.91
SampleReport.pdf,0.34,0.34,0.05,0.06,0.91,1.0


In [48]:
# Using FAISS indexes
# For every document, list its most similar *other* documents together with
# their similarity, hiding matches below MIN_SIMILARITY.
k_nearest = 4          # hits requested per document, including the document itself
MIN_SIMILARITY = 0.3   # weaker matches are blanked out of the report

# Query the index with a float32 copy of every document vector.
search_vector = np.array(vectors, dtype=np.float32, order='C')
faiss.normalize_L2(search_vector)

cpu_similarities, cpu_similarities_ids = index.search(search_vector, k=k_nearest)
# Negative similarities are clamped to 0 before rounding to two decimals.
cpu_similarities = np.around(np.clip(cpu_similarities, 0, 3), decimals=2)

# Remove each document's own entry from its hit list by id instead of assuming
# it is always the first hit: identical documents tie at similarity 1.0, so the
# self-match is not guaranteed to come first. Ids are df's row labels, which is
# what the index was populated with.
own_ids = df.index.to_numpy()[:, np.newaxis]
is_self = cpu_similarities_ids == own_ids
# If a document is missing from its own hit list, drop its last hit instead so
# that every row keeps exactly k_nearest - 1 neighbours.
is_self[~is_self.any(axis=1), -1] = True
keep = ~is_self
neighbour_shape = (len(df), k_nearest - 1)
neighbour_sims = cpu_similarities[keep].reshape(neighbour_shape)
neighbour_ids = cpu_similarities_ids[keep].reshape(neighbour_shape)

# Columns are labelled by neighbour rank: 1 = closest other document.
ranks = range(1, k_nearest)
is_match = neighbour_sims >= MIN_SIMILARITY
sim = pd.DataFrame(neighbour_sims, index=df.index, columns=ranks).where(is_match, '')
sim_ids = pd.DataFrame(neighbour_ids, index=df.index, columns=ranks).where(is_match, '')

# The number of reported neighbours follows k_nearest instead of a fixed count.
df_out = df[['file_name']].copy()
for rank in ranks:
    # Blanked ids (and any id absent from df) map to NaN and are emptied below.
    df_out[f'{rank}_id'] = sim_ids[rank].map(df.file_name)
    df_out[str(rank)] = sim[rank]

df_out = df_out.fillna('')
df_out

Unnamed: 0,file_name,1_id,1,2_id,2,3_id,3
0,ArtificialNeuralNetworksForBeginners.pdf,ArtificialNeuralNetworksForBeginners.pdf,1.0,SampleReport.pdf,0.34,,
1,ArtificialNeuralNetworksForBeginners.pdf,ArtificialNeuralNetworksForBeginners.pdf,1.0,SampleReport.pdf,0.34,,
2,HealthCanadaOverviewFromWikipedia.docx,HealthCanadaOverviewFromWikipedia.pdf,0.95,,,,
3,HealthCanadaOverviewFromWikipedia.pdf,HealthCanadaOverviewFromWikipedia.docx,0.95,,,,
4,SampleReport.docx,SampleReport.pdf,0.91,,,,
5,SampleReport.pdf,SampleReport.docx,0.91,ArtificialNeuralNetworksForBeginners.pdf,0.34,ArtificialNeuralNetworksForBeginners.pdf,0.34
