## Пресет: строит эмбеддинги (мешок слов, TF-IDF, LLM) и FAISS-индекс для последующего поиска

In [None]:
import torch
import faiss
from transformers import AutoTokenizer, AutoModel
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [None]:
class Embedder:
    """Sentence embedder built on a Hugging Face transformer.

    Each text is tokenized, run through the model, mean-pooled over its
    non-padding tokens and L2-normalised, so the inner product of two
    embeddings equals their cosine similarity.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", device=None):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            model_name: Hugging Face model identifier or local path.
            device: Torch device string; when falsy, CUDA is used if
                available, otherwise the CPU.
        """
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def embed(self, texts, batch_size=None):
        """Return L2-normalised embeddings for ``texts``.

        Args:
            texts: A list of strings (a single string is treated as a
                one-element list).
            batch_size: Optional maximum number of texts per forward pass.
                ``None`` (the default) encodes everything in one pass;
                pass a positive integer to bound memory on large corpora.

        Returns:
            A NumPy array with one row per text. For empty input the result
            has shape ``(0, hidden_size)``.

        Raises:
            ValueError: If ``batch_size`` is given but is not positive.
        """
        texts = [texts] if isinstance(texts, str) else list(texts)

        if batch_size is not None and batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None")

        if not texts:
            # The tokenizer cannot pad an empty batch; return a well-shaped
            # empty matrix so callers can still concatenate or index it.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        step = len(texts) if batch_size is None else batch_size
        chunks = [
            self._embed_batch(texts[start:start + step])
            for start in range(0, len(texts), step)
        ]
        return chunks[0] if len(chunks) == 1 else np.concatenate(chunks, axis=0)

    def _embed_batch(self, texts):
        """Encode one non-empty batch of strings into normalised embeddings."""
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        with torch.no_grad():
            model_output = self.model(**encoded_input)

        token_embeddings = model_output.last_hidden_state
        # Cast the 0/1 mask to the embedding dtype so the pooling arithmetic
        # does not depend on implicit integer/float promotion.
        attention_mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)

        # Mean pooling: padding positions are zeroed out and excluded from
        # the token count.
        summed = torch.sum(token_embeddings * attention_mask, dim=1)
        counts = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
        mean_pooled = summed / counts

        embeddings = mean_pooled.cpu().numpy()
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        # Guard against an all-zero row producing NaNs; non-zero rows are
        # divided by their exact norm.
        embeddings /= np.maximum(norms, 1e-12)
        return embeddings

In [None]:
class BagOfWordsEmbedder:
    """Bag-of-words embedder producing L2-normalised term-count vectors.

    Wraps a ``CountVectorizer``; ``max_features`` is passed straight through
    to it.
    """

    def __init__(self, max_features=10000):
        self.fitted = False
        self.vectorizer = CountVectorizer(max_features=max_features)

    def fit(self, texts):
        """Fit the vectorizer on ``texts`` and mark the embedder as fitted."""
        self.vectorizer.fit(texts)
        self.fitted = True

    def embed(self, texts):
        """Return a dense float32 matrix with one normalised row per text.

        If the embedder has not been fitted yet, it is fitted on ``texts``
        first, so the first batch ever embedded defines the vocabulary.
        """
        if not self.fitted:
            self.fit(texts)

        counts = self.vectorizer.transform(texts)
        dense = counts.toarray().astype(np.float32)
        row_norms = np.linalg.norm(dense, axis=1, keepdims=True)
        # The epsilon keeps all-zero rows (no known terms) at zero instead
        # of dividing by zero.
        return dense / (row_norms + 1e-10)

In [None]:
class TfidfEmbedder:
    """TF-IDF embedder producing L2-normalised dense vectors.

    Wraps a ``TfidfVectorizer``; ``max_features`` is passed straight through
    to it.
    """

    def __init__(self, max_features=10000):
        self.fitted = False
        self.vectorizer = TfidfVectorizer(max_features=max_features)

    def fit(self, texts):
        """Fit the vectorizer on ``texts`` and mark the embedder as fitted."""
        self.vectorizer.fit(texts)
        self.fitted = True

    def embed(self, texts):
        """Return a dense float32 matrix with one normalised row per text.

        If the embedder has not been fitted yet, it is fitted on ``texts``
        first, so the first batch ever embedded defines the vocabulary.
        """
        if not self.fitted:
            self.fit(texts)

        weights = self.vectorizer.transform(texts)
        dense = weights.toarray().astype(np.float32)
        row_norms = np.linalg.norm(dense, axis=1, keepdims=True)
        # The epsilon keeps all-zero rows (no known terms) at zero instead
        # of dividing by zero.
        return dense / (row_norms + 1e-10)

In [None]:
class FaissIndex:
    """Thin wrapper around a flat inner-product FAISS index.

    The index scores by inner product, which equals cosine similarity only
    when the stored and query vectors are L2-normalised.
    """

    def __init__(self, dimension):
        """Create an empty ``IndexFlatIP`` for vectors of size ``dimension``."""
        self.dimension = dimension
        self.index = faiss.IndexFlatIP(dimension)

    @staticmethod
    def _as_matrix(vectors):
        """Return ``vectors`` as a C-contiguous float32 2-D array.

        A single 1-D vector becomes a one-row matrix. FAISS reads the raw
        buffer, so both the dtype and the memory layout matter.
        """
        return np.ascontiguousarray(np.atleast_2d(vectors), dtype=np.float32)

    def add(self, embeddings):
        """Append ``embeddings`` (one vector per row) to the index."""
        self.index.add(self._as_matrix(embeddings))

    def save(self, path):
        """Write the index to ``path`` (a string or path-like object)."""
        faiss.write_index(self.index, str(path))

    def load(self, path):
        """Replace the current index with the one stored at ``path``."""
        self.index = faiss.read_index(str(path))
        # Keep the advertised dimension in sync with the loaded index, which
        # may differ from the one passed to the constructor.
        self.dimension = self.index.d

    def search(self, query_embeddings, top_k=5):
        """Return ``(scores, ids)`` of the ``top_k`` best matches per query.

        Args:
            query_embeddings: One query vector per row; a single 1-D vector
                is treated as one query.
            top_k: Number of neighbours to return for each query.

        Returns:
            Two arrays of shape ``(n_queries, top_k)``: inner-product scores
            and the positions of the matching vectors in insertion order.
        """
        D, I = self.index.search(self._as_matrix(query_embeddings), top_k)
        return D, I

In [None]:
# Small demo corpus; a text's position in this list is its id in the index.
texts = [
    "Hello world",
    "Hi there",
    "Goodbye world",
    "Hello from the other side",
    "I love machine learning",
    "Transformers are amazing",
]

In [None]:
# Build the transformer embedder with its default model and embed the corpus.
embedder = Embedder()
embeddings = embedder.embed(texts)

In [None]:
# Size the index from the embedding width, add the corpus vectors and
# persist the index to disk.
index = FaissIndex(dimension=embeddings.shape[1])
index.add(embeddings)
index.save("faiss_index.bin")

In [None]:
# Round-trip check: create a second wrapper and replace its freshly built
# empty index with the one saved to disk.
index2 = FaissIndex(dimension=embeddings.shape[1])
index2.load("faiss_index.bin")

In [None]:
# Embed the query with the same embedder that produced the corpus vectors.
query = ["I enjoy machine learning and AI"]
query_emb = embedder.embed(query)

In [None]:
# Look up the three best matches for the query in the reloaded index.
# Despite the name, `distances` holds inner-product scores (higher is better).
distances, indices = index2.search(query_emb, top_k=3)
print("Top matches:")
for dist, idx in zip(distances[0], indices[0]):
    # FAISS pads the result with id -1 when the index holds fewer than top_k
    # vectors; texts[-1] would silently print the last text instead.
    if idx < 0:
        continue
    print(f"Text: {texts[idx]} | Score (cosine sim): {dist:.4f}")