In [1]:
import faiss
import sys
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist as scipy_cdist
from file_processing.directory import Directory

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dir = Directory('./tests/resources/similarity_test_files/')
dir.identify_duplicates('./tests/resources/sample_reports/similarity_faiss.csv', threshold=0.3, top_n=5)

Processing files: 100%|██████████| 20/20 [00:00<00:00, 66.99file/s]


In [4]:
# Data pre-processing
dir = Directory('./tests/resources/similarity_test_files/')
data = [file.processor.__dict__ for file in dir.get_files()]
data = pd.json_normalize(data, max_level=1, sep='_')
df = pd.DataFrame(data).get(['size', 'extension', 'file_name', 'metadata_text', 'absolute_path'])
df['metadata_text'] = df['metadata_text'].str.strip()
df['metadata_text'] = df['metadata_text'].str.replace('\n', '')
df = df[(df['extension'].isin(['.pdf', '.docx', '.txt'])) & (df['metadata_text'].str.len() > 10) & (df['metadata_text'].notnull())]
df = df.reset_index(drop=True)

Processing files: 100%|██████████| 20/20 [00:00<00:00, 66.66file/s]


In [5]:
# Encoding and indexing
encoder = SentenceTransformer("paraphrase-MiniLM-L3-v2")
vectors = encoder.encode(df['metadata_text'])
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatIP(vector_dimension)
index = faiss.IndexIDMap(index)
faiss.normalize_L2(vectors)
index.add_with_ids(vectors, df.index.values.astype(np.int64))

In [6]:
# Manual search by string
search_text = 'test'
search_vector = encoder.encode(search_text)
_vector = np.array([search_vector])
faiss.normalize_L2(_vector)
k = index.ntotal
distances, ann = index.search(_vector, k=k)
results = pd.DataFrame({'distances': distances[0], 'proximity': ann[0]})
merge = pd.merge(results, df, left_on='proximity', right_index=True)
merge

Unnamed: 0,distances,proximity,size,extension,file_name,metadata_text,absolute_path
0,0.157976,6,2640,.txt,documents_for_express_entry.txt,Documents for Express EntryYou need certain do...,C:\Users\BLUO\Downloads\report\tests\resources...
1,0.146722,12,6919,.txt,how_courts_are_organized.txt,How the Courts are OrganizedPrevious Page Tabl...,C:\Users\BLUO\Downloads\report\tests\resources...
2,0.098273,8,2471,.txt,express_entry.txt,How Express Entry worksExpress Entry is an onl...,C:\Users\BLUO\Downloads\report\tests\resources...
3,0.097482,1,6541,.txt,canadian_constitution.txt,The Canadian ConstitutionA constitution provid...,C:\Users\BLUO\Downloads\report\tests\resources...
4,0.087014,9,7667,.txt,funding_culture_history_sport.txt,"Funding - Culture, history and sportCOVID-19: ...",C:\Users\BLUO\Downloads\report\tests\resources...
5,0.081411,19,16741,.txt,visitors_to_canada.txt,TravellersVisitors to CanadaHave proper identi...,C:\Users\BLUO\Downloads\report\tests\resources...
6,0.062574,0,3888,.txt,aviation_safety.txt,Aviation safety in CanadaFrom: Transport Canad...,C:\Users\BLUO\Downloads\report\tests\resources...
7,0.049971,18,18111,.txt,travel_advisories.txt,Travel advice and advisories by destinationCOV...,C:\Users\BLUO\Downloads\report\tests\resources...
8,0.047868,3,5703,.txt,coronavirus_symptoms.txt,"COVID-19: Symptoms, treatment, what to do if y...",C:\Users\BLUO\Downloads\report\tests\resources...
9,0.02616,14,4307,.txt,net_zero_emissions_by_2050.txt,Net-zero emissions by 2050The transition to a ...,C:\Users\BLUO\Downloads\report\tests\resources...


In [7]:
# Brute force search
similarities = 1 - scipy_cdist(vectors, vectors, 'cosine')
similarities = np.around(similarities, decimals=2)

sim_df = pd.DataFrame(
    data=similarities,
    columns=df.file_name.tolist(),
    index=df.file_name.tolist()
)

sim_df.sort_index(axis=1, inplace=True)
sim_df.sort_index(axis=0, inplace=True)

sim_df

Unnamed: 0,CPP_disability_benefits.txt,CPP_retirement_pension.txt,EI_regular_benefits.txt,aviation_safety.txt,canadian_constitution.txt,climate_change_causes.txt,coronavirus_symptoms.txt,documents_for_express_entry.txt,express_entry.txt,funding_culture_history_sport.txt,healthcare_system.txt,history_of_canada.txt,how_courts_are_organized.txt,national_security_act.txt,net_zero_emissions_by_2050.txt,origin_of_name_canada.txt,personal_income_tax.txt,start_a_business.txt,travel_advisories.txt,visitors_to_canada.txt
CPP_disability_benefits.txt,1.0,0.73,0.37,0.34,0.31,0.04,0.21,0.35,0.19,0.32,0.51,0.37,0.18,0.31,0.37,0.37,0.21,-0.11,0.3,0.35
CPP_retirement_pension.txt,0.73,1.0,0.23,0.17,0.17,0.02,0.16,0.26,0.17,0.17,0.3,0.25,0.1,0.24,0.27,0.19,0.31,-0.11,0.21,0.22
EI_regular_benefits.txt,0.37,0.23,1.0,0.15,0.01,-0.01,0.03,0.26,0.31,0.21,0.13,0.12,-0.01,0.1,0.22,0.08,0.15,0.01,0.17,0.18
aviation_safety.txt,0.34,0.17,0.15,1.0,0.31,0.09,0.1,0.35,0.27,0.26,0.35,0.3,0.11,0.33,0.42,0.21,0.03,0.05,0.47,0.47
canadian_constitution.txt,0.31,0.17,0.01,0.31,1.0,-0.04,0.05,0.28,0.04,0.19,0.42,0.43,0.5,0.57,0.34,0.47,-0.03,-0.07,0.23,0.35
climate_change_causes.txt,0.04,0.02,-0.01,0.09,-0.04,1.0,0.02,-0.09,0.01,0.15,0.02,0.03,-0.1,0.04,0.26,0.04,0.13,0.24,-0.05,-0.05
coronavirus_symptoms.txt,0.21,0.16,0.03,0.1,0.05,0.02,1.0,0.13,0.08,0.19,0.13,0.24,0.05,0.09,0.02,0.05,0.13,0.05,0.33,0.13
documents_for_express_entry.txt,0.35,0.26,0.26,0.35,0.28,-0.09,0.13,1.0,0.58,0.3,0.24,0.39,0.14,0.33,0.2,0.36,0.22,0.09,0.39,0.55
express_entry.txt,0.19,0.17,0.31,0.27,0.04,0.01,0.08,0.58,1.0,0.23,0.15,0.24,-0.0,0.14,0.16,0.14,0.16,0.05,0.14,0.28
funding_culture_history_sport.txt,0.32,0.17,0.21,0.26,0.19,0.15,0.19,0.3,0.23,1.0,0.2,0.26,0.17,0.19,0.26,0.26,0.08,0.2,0.2,0.18


In [8]:
# Using FAISS indexes
k_nearest = 4
search_vector = vectors.copy().astype(np.float32)
faiss.normalize_L2(search_vector)

cpu_similarities, cpu_similarities_ids = index.search(search_vector, k=k_nearest)
cpu_similarities = np.around(np.clip(cpu_similarities, 0, 3), decimals=2)

sim = pd.DataFrame(cpu_similarities)
sim = sim.where(sim >= 0.3).fillna('')
sim_ids = pd.DataFrame(cpu_similarities_ids)
sim_ids = sim_ids.where(sim != '', '')

df_out = pd.DataFrame(df.absolute_path)
for i in range(3):
    df_out[f'{i+1}_id'] = sim_ids[i+1].map(df.absolute_path)
    df_out[str(i+1)] = sim[i+1]

df_out = df_out.fillna('')
df_out

Unnamed: 0,absolute_path,1_id,1,2_id,2,3_id,3
0,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.47,C:\Users\BLUO\Downloads\report\tests\resources...,0.47,C:\Users\BLUO\Downloads\report\tests\resources...,0.42
1,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.57,C:\Users\BLUO\Downloads\report\tests\resources...,0.5,C:\Users\BLUO\Downloads\report\tests\resources...,0.47
2,C:\Users\BLUO\Downloads\report\tests\resources...,,,,,,
3,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.33,,,,
4,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.73,C:\Users\BLUO\Downloads\report\tests\resources...,0.51,C:\Users\BLUO\Downloads\report\tests\resources...,0.37
5,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.73,C:\Users\BLUO\Downloads\report\tests\resources...,0.31,C:\Users\BLUO\Downloads\report\tests\resources...,0.3
6,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.58,C:\Users\BLUO\Downloads\report\tests\resources...,0.55,C:\Users\BLUO\Downloads\report\tests\resources...,0.39
7,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.37,C:\Users\BLUO\Downloads\report\tests\resources...,0.31,,
8,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.58,C:\Users\BLUO\Downloads\report\tests\resources...,0.31,,
9,C:\Users\BLUO\Downloads\report\tests\resources...,C:\Users\BLUO\Downloads\report\tests\resources...,0.32,C:\Users\BLUO\Downloads\report\tests\resources...,0.3,,
