In [None]:
!pip install faiss-gpu
!pip install sentence_transformers

In [None]:
import pandas as pd 
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import os
import pickle

In [None]:
# --- Paths -------------------------------------------------------------------
# NOTE: despite its name, `parquet_file` is a *directory*; it is passed to
# os.listdir() further down to enumerate the parquet shards inside it.
parquet_file = "/kaggle/input/wikipedia-20230701"
# Destination of the serialized FAISS index.
faiss_index_path = "/kaggle/working/wikipedia_embeddings.index"

# --- Embedding model ---------------------------------------------------------
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

Each file will contain several documents formatted as JSON objects, one per line, with the following structure:

	{"id": "", "revid": "", "url": "", "title": "", "text": "..."}

In [None]:
# Embed the lead section of every article, one parquet shard at a time.
#
# Shards are visited in sorted filename order and rows keep their in-file order,
# so row i of `document_embedding` is the i-th article across the sorted shards.
# Anything that later maps FAISS ids back to articles must use the same order.
skip_files = {'number.parquet', 'other.parquet', 'wiki_2023_index.parquet'}

# One 2-D array per shard. Keeping whole arrays (instead of extending a list
# with individual rows) avoids creating one Python object per article.
embedding_chunks = []
for idx, file_name in enumerate(sorted(os.listdir(parquet_file))):
    if not file_name.endswith(".parquet") or file_name in skip_files:
        continue
    print(f"processing {file_name} {idx}")
    path = os.path.join(parquet_file, file_name)

    # Only the `text` column is used, so do not load the others into memory.
    text = pd.read_parquet(path, columns=["text"])["text"]

    # Section headings are written as "==heading==", so everything before the
    # first "==" is the lead/abstract. Missing text becomes an empty string so
    # the row is still embedded and row alignment with the shard is preserved.
    sentences = text.fillna("").str.split("==", n=1).str[0].tolist()
    del text  # free some memory before encoding

    if not sentences:
        # Empty shard: nothing to embed.
        continue

    embedding_chunks.append(model.encode(sentences, normalize_embeddings=True))
    del sentences

if not embedding_chunks:
    raise ValueError(f"No parquet shards with rows to embed were found in {parquet_file!r}")

# Single (n_articles, dim) matrix holding the embeddings of all shards.
document_embedding = np.concatenate(embedding_chunks)
del embedding_chunks

Before the start of each section in a Wikipedia article, the section heading is written as `==heading==`. We therefore split the text on `==` and keep only the part before the first split, which gives us the article's title and abstract.

In [None]:
# Build an exact L2 index over all document embeddings and save it to disk.

# FAISS expects a C-contiguous float32 matrix with one vector per row. Enforce
# that explicitly instead of relying on whatever dtype/layout was produced
# upstream; this is a no-copy operation when the input already conforms.
document_embedding = np.ascontiguousarray(document_embedding, dtype=np.float32)
if document_embedding.ndim != 2 or document_embedding.shape[0] == 0:
    raise ValueError(
        "document_embedding must be a non-empty 2-D (n_documents, dim) matrix, "
        f"got shape {document_embedding.shape}"
    )

# The index dimensionality is the embedding width (number of columns).
index = faiss.IndexFlatL2(document_embedding.shape[1])
index.add(document_embedding)
faiss.write_index(index, faiss_index_path)
print(f"Faiss Index Successfully Saved to '{faiss_index_path}'")

`faiss.IndexFlatL2` is the simplest FAISS index: it stores the vectors as they are and answers every query by brute force, comparing the query against all stored vectors using L2 (Euclidean) distance.
Because the search is exhaustive, the results are exact rather than approximate, which makes `faiss.IndexFlatL2` a valuable asset for applications requiring precise similarity searches; the trade-off is that query time grows linearly with the number of indexed vectors.
