In [1]:
import os
import json
import logging
from typing import Optional
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm
from file_processing import File
from file_processing.tools.errors import EmptySelection

import faiss
from sentence_transformers import SentenceTransformer

# Preprocess the files to embeddings

In [2]:
from file_processing import Directory

directory = Directory('./tests/resources/similarity_test_files/')

Preprocess the files to extract the text

In [3]:
data = [file.processor.__dict__ for file in directory._file_generator()]
data = pd.json_normalize(data, max_level=1, sep='_')
df = pd.DataFrame(data)

if df.empty:
    raise EmptySelection(f'Filtered selection of files is empty')
elif not df.empty:
    df = df.get(['size', 'extension', 'file_name',
                'metadata_text', 'absolute_path'])
    
df['metadata_text'] = df['metadata_text'].str.strip()
df['metadata_text'] = df['metadata_text'].str.replace('\n', '')

# Only keeping pdf/docx/txt files with sufficiently long 'text' metadata
df = df[(df['extension'].isin(['.pdf', '.docx', '.txt'])) &
        (df['metadata_text'].str.len() > 10) &
        (df['metadata_text'].notnull())]
df = df.reset_index(drop=True)
file_names = df.file_name

# Encoding
encoder = SentenceTransformer("paraphrase-MiniLM-L3-v2")
vectors = encoder.encode(df['metadata_text'])

df['metadata_text']

Processing files: 20 files completed [00:00, 49.58 files completed/s]


.gitattributes:   0%|          | 0.00/744 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

0     Aviation safety in CanadaFrom: Transport Canad...
1     The Canadian ConstitutionA constitution provid...
2     Causes of climate changeWhat is the most impor...
3     COVID-19: Symptoms, treatment, what to do if y...
4     Canada Pension Plan disability benefitsOvervie...
5     CPP Retirement pensionOverviewThe Canada Pensi...
6     Documents for Express EntryYou need certain do...
7     EI regular benefitsHow much you could receiveF...
8     How Express Entry worksExpress Entry is an onl...
9     Funding - Culture, history and sportCOVID-19: ...
10    Canada's health care systemLearn about Canada'...
11    History of CanadaCanadian history does not beg...
12    How the Courts are OrganizedPrevious Page Tabl...
13    Our Security, Our RightsOn June 21, 2019, an A...
14    Net-zero emissions by 2050The transition to a ...
15    Origin of the name "Canada"Today, it seems imp...
16    Personal income taxGet ready to do your taxesC...
17    Starting a businessTable of contentsBefore

In [5]:
vectors.shape

(20, 384)

Do the encoding