In [1]:
!pip install langchain langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Downloading langchain_core-0.3.47-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none

In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [5]:
!pip install einops tiktoken



In [6]:
from langchain.vectorstores import FAISS
import faiss
import torch
from langchain.text_splitter import CharacterTextSplitter
import pandas as pd
import numpy as np
import concurrent.futures
import pickle
from transformers import pipeline
from langchain_text_splitters import TokenTextSplitter
from sentence_transformers import SentenceTransformer

# Let rendered DataFrames show up to 100 characters per column.
pd.set_option('display.max_colwidth', 100)
# Disabled spaCy pipeline load (spacy is not imported in the visible cells).
# nlp = spacy.load("es_core_news_sm")
# Print the name of the current CUDA device when one is available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

Tesla T4


In [55]:
import logging
import os

class DocumentsStore:
    def __init__(self, pickle_df_path: str, chunk_size: int = 512, chunk_overlap: int = 50, embeddings_pickle_path: str = None):
        self.__df = pd.read_pickle(pickle_df_path)
        self.__df = self.__df.reset_index(drop=True)
        self.__embeddings = None
        self.__chunks: list[str] = []
        self.__chunks_ids: list[int] = []
        self.__chunk_size = chunk_size
        self.__chunk_overlap = chunk_overlap
        self.__pickle_df_path = pickle_df_path
        self.__embeddings_pickle_path = embeddings_pickle_path
        self.__generate_embeddings()

    @property
    def df(self):
        return self.__df

    @property
    def embeddings(self):
        return self.__embeddings

    @property
    def chunks(self):
        return self.__chunks

    @property
    def chunks_ids(self):
        return self.__chunks_ids
    
    @property
    def chunk_size(self):
        return self.__chunk_size
    
    @property
    def chunk_overlap(self):
        return self.__chunk_overlap

    def __create_chunks(self) -> None:
        text_splitter = TokenTextSplitter(
            chunk_size=self.__chunk_size,
            chunk_overlap=self.__chunk_overlap
        )

        try:
            for i, r in self.__df.iterrows():
                new_chunks = text_splitter.split_text(r["Text"])
                self.__chunks.extend(new_chunks)
                self.__chunks_ids.extend([i] * len(new_chunks))
            
            logging.info('Chunking finished')
        except Exception as e:
            logging.error(f"Error during chunking: {e}")
            raise e

    def __generate_embeddings(self):
        self.__create_chunks()

        if self.__embeddings_pickle_path and os.path.exists(self.__embeddings_pickle_path):
            try:
                with open(self.__embeddings_pickle_path, 'rb') as f:
                    self.__embeddings = pickle.load(f)
                logger.info('Embeddings loaded from pickle')
                return
            except Exception as e:
                logger.error(f"Error loading embeddings from pickle: {e}")
        
        if len(self.__chunks) == 0:
            logging.warning('No chunks to process')
            return

        model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
        
        pool = model.start_multi_process_pool()
        self.__embeddings = model.encode_multi_process(
            self.__chunks,
            pool,
            prompt_name="passage"
        )
        model.stop_multi_process_pool(pool)

        # Save embeddings to disk
        try:
            with open('store-embeddings-nomic.pkl', 'wb') as f:
                pickle.dump(self.__embeddings, f)
        except Exception as e:
            logging.error(f"Error saving embeddings: {e}")

        logging.info('Embeddings generated')
    
    def chunk_text_from_id(self, id: int) -> str | None:
        if id < 0 or id >= len(self.__chunks):
            return None
        return self.__chunks[id]

    def document_text_from_chunk_id(self, chunk_id: int) -> str | None:
        if chunk_id < 0 or chunk_id >= len(self.__chunks_ids):
            return None
        return self.__df.iloc[self.__chunks_ids[chunk_id]]["Text"]

    def save_to_pickle(self, file_name: str) -> None:
        try:
            with open(file_name, 'wb') as f:
                pickle.dump(self, f)
        except Exception as e:
            logging.error(f"Error saving to pickle: {e}")
            raise e
    
    @staticmethod
    def load_from_pickle(file_name: str) -> 'DocumentsStore':
        try:
            with open(file_name, 'rb') as f:
                return pickle.load(f)
        except Exception as e:
            logging.error(f"Error loading from pickle: {e}")
            raise e

In [56]:
# Build the store from the preprocessed corpus. Embeddings are read from the
# pickle at embeddings_pickle_path when that file exists, otherwise computed.
store = DocumentsStore(
    "/kaggle/input/preprocessed-texts/preprocessed-with-marking.pkl",
    embeddings_pickle_path="/kaggle/working/store-embeddings-nomic.pkl"
)
# Sanity check on the embeddings' shape.
store.embeddings.shape

(72996, 768)

In [57]:
# Persist the whole store object (pickle.dump of the instance) for later reuse.
store.save_to_pickle("store-better-preprocess.pkl")

In [58]:
# Reload the store from disk. Unpickling needs the DocumentsStore class to be
# defined in the kernel first, and the file must come from a trusted source.
store = DocumentsStore.load_from_pickle("store-better-preprocess.pkl")

In [61]:
# Check that two stores map chunks to documents identically.
# NOTE(review): `store2` is not defined in any visible cell, so this fails on a
# fresh kernel (Restart & Run All) - define it explicitly or drop this cell.
(store2.chunks_ids == store.chunks_ids)

True

In [40]:
# Save another snapshot of the store under a different file name.
store.save_to_pickle("store-best-preprocess.pkl")

In [4]:
# Load the preprocessed corpus directly (same pickle the DocumentsStore reads).
df = pd.read_pickle("/kaggle/input/preprocessed-texts/preprocessed-with-marking.pkl")
# Display the frame; note in the output that every row carries index label 0.
df

Unnamed: 0,Path,Text
0,pack\AIBR\2006-Aymaras-peruanos-y-chilenos-en-los-Andes-ariqueños-Resistencia-y-conflicto-frente...,"introducción el presidente recibió el siguiente telegrama del gobierno de nueva imperial, señor ..."
0,pack\AIBR\2006-Cambios-de-género-y-discriminación-laboral-en-el-sector-financiero-colombiano-El-...,introducción e l presente artículo es producto de un estudio de caso realizado en una de las emp...
0,pack\AIBR\2006-Consecuencias-personales-en-la-ruptura-de-la-vida-laboral-El-caso-de-Telefónica.pdf,"introducción e l propósito al iniciar este trabajo era estudiar, desde una perspectiva antropoló..."
0,pack\AIBR\2006-Construcción-de-modelos-de-género-a-partir-de-textos-de-la-tradición-oral-en-Extr...,"introducción todos somos conscientes de que pertenecemos, formamos parte o nos consideran adscri..."
0,pack\AIBR\2006-Contribuciones-feministas-a-problemas-epistemológicos-de-la-disciplina-antropológ...,omo señala dileonardo a comienzos de la década de los 1990 la investigación antropológica de ori...
...,...,...
0,pack\Revista-Española-de-Antropología-Americana\2024-Paridad-y-violencia-política-contra-las-muj...,1. introducción. 2. paridad de género en presidencias municipales en chiapas. 3. registro de muj...
0,pack\Revista-Española-de-Antropología-Americana\2024-Reorientación-de-la-educación-superior-inte...,1. introducción. 2. baja california: escenario de pluridiversidad. 3. la pluridiversidad más all...
0,pack\Revista-Española-de-Antropología-Americana\2024-Una-aproximación-metodológica-a-la-lectura-...,1. introducción. 2. los signos de quipus. 3. los términos españoles en quechua y aimara. 4. los ...
0,pack\Revista-Española-de-Antropología-Americana\2024-Valdivia-fase-3-transformación-de-las-figur...,1. introducción. 2. el inicio de la desigualdad. 3. utilización de las figuritas. 4. mujeres div...


In [6]:
# Embedding models tried earlier, kept for reference:
# model = SentenceTransformer("sentence-transformers/static-similarity-mrl-multilingual-v1", trust_remote_code=True)
# model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", trust_remote_code=True)
# model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
# Model currently in use. trust_remote_code=True runs model code downloaded from
# the Hub; the download log below recommends pinning a revision to avoid
# silently picking up new versions of that code.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [10]:
# Same splitter settings as the DocumentsStore defaults (512 / 50).
# NOTE(review): the artifact file names written later say "500-50" although
# chunk_size is 512 - confirm which one is intended.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)

In [11]:
# Split every document into chunks and remember which row each chunk came from.
# Ids are row *positions* (usable with df.iloc), not index labels: the frame
# displayed earlier carries index label 0 on every row, so labels from
# iterrows() would map every chunk to document 0.
chunks, chunks_ids = [], []
for row_pos, text in enumerate(df["Text"]):
    # Rows without string text (e.g. NaN) contribute no chunks.
    if not isinstance(text, str):
        continue

    new_chunks = text_splitter.split_text(text)
    chunks.extend(new_chunks)
    chunks_ids.extend([row_pos] * len(new_chunks))

len(chunks)

72996

In [10]:
# Persist the chunks together with their document ids as one (chunks, chunks_ids) tuple.
with open("cohere-chunks-500-50-nomic.pkl", "wb") as f:
    pickle.dump((chunks, chunks_ids), f)

In [11]:
# Encode all chunks with a multi-process worker pool. The pool is always shut
# down, even if encoding fails, so worker processes are not left running.
pool = model.start_multi_process_pool()
try:
    embeddings_np = model.encode_multi_process(chunks, pool, prompt_name="passage")
finally:
    model.stop_multi_process_pool(pool)
embeddings_np.shape

Chunks:   0%|          | 0/20 [00:00<?, ?it/s]

(74928, 768)

In [12]:
# Save the embedding matrix in NumPy's .npy format.
# Gotcha: np.save appends ".npy" when the name does not already end with it, so
# the file on disk is "cohere-embeddings-500-50-nomic.bin.npy".
np.save("cohere-embeddings-500-50-nomic.bin", embeddings_np)

In [13]:
# Build a flat L2-distance FAISS index sized to the embedding width and add all
# chunk embeddings to it (row order matches `chunks`).
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)

# Write the index to disk so it can be reloaded without re-adding the vectors.
faiss.write_index(index, "cohere-index-500-50-nomic.faiss")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c7612319-5d00-4a6f-9b1b-c6d20fb016a0' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>