In [1]:
import pandas as pd
answers = pd.read_csv('Answers.csv',encoding='latin1')
questions = pd.read_csv('Questions.csv', encoding='latin1')
tags = pd.read_csv('Tags.csv', encoding='latin1')

In [2]:
print(answers.head())

    Id  OwnerUserId          CreationDate  ParentId  Score  \
0   92         61.0  2008-08-01T14:45:37Z        90     13   
1  124         26.0  2008-08-01T16:09:47Z        80     12   
2  199         50.0  2008-08-01T19:36:46Z       180      1   
3  269         91.0  2008-08-01T23:49:57Z       260      4   
4  307         49.0  2008-08-02T01:49:46Z       260     28   

                                                Body  
0  <p><a href="http://svnbook.red-bean.com/">Vers...  
1  <p>I wound up using this. It is a kind of a ha...  
2  <p>I've read somewhere the human eye can't dis...  
3  <p>Yes, I thought about that, but I soon figur...  
4  <p><a href="http://www.codeproject.com/Article...  


In [3]:
print(questions.head())

    Id  OwnerUserId          CreationDate            ClosedDate  Score  \
0   80         26.0  2008-08-01T13:57:07Z                   NaN     26   
1   90         58.0  2008-08-01T14:41:24Z  2012-12-26T03:45:49Z    144   
2  120         83.0  2008-08-01T15:50:08Z                   NaN     21   
3  180    2089740.0  2008-08-01T18:42:19Z                   NaN     53   
4  260         91.0  2008-08-01T23:22:08Z                   NaN     49   

                                               Title  \
0  SQLStatement.execute() - multiple queries in o...   
1  Good branching and merging tutorials for Torto...   
2                                  ASP.NET Site Maps   
3                 Function for creating color wheels   
4  Adding scripting functionality to .NET applica...   

                                                Body  
0  <p>I've written a database generation script i...  
1  <p>Are there any really good tutorials explain...  
2  <p>Has anyone got experience creating <strong>... 

In [4]:
print(tags.head())

   Id             Tag
0  80            flex
1  80  actionscript-3
2  80             air
3  90             svn
4  90     tortoisesvn


In [5]:
qa_df = answers.merge(questions, left_on='ParentId', right_on='Id', suffixes=('_answer', '_question'))
print(qa_df.head())

   Id_answer  OwnerUserId_answer   CreationDate_answer  ParentId  \
0         92                61.0  2008-08-01T14:45:37Z        90   
1        124                26.0  2008-08-01T16:09:47Z        80   
2        199                50.0  2008-08-01T19:36:46Z       180   
3        269                91.0  2008-08-01T23:49:57Z       260   
4        307                49.0  2008-08-02T01:49:46Z       260   

   Score_answer                                        Body_answer  \
0            13  <p><a href="http://svnbook.red-bean.com/">Vers...   
1            12  <p>I wound up using this. It is a kind of a ha...   
2             1  <p>I've read somewhere the human eye can't dis...   
3             4  <p>Yes, I thought about that, but I soon figur...   
4            28  <p><a href="http://www.codeproject.com/Article...   

   Id_question  OwnerUserId_question CreationDate_question  \
0           90                  58.0  2008-08-01T14:41:24Z   
1           80                  26.0  2008-08-

In [6]:
sample = qa_df.head(3000).copy()
import re
def clean_text(text: str)-> str:
    text = re.sub(r'<[^>]+>','', text)
    text = text.lower()
    text = re.sub(r'[^a-z\s]','', text)
    return text


sample['clean_questions'] = sample['Body_question'].apply(clean_text)
sample['clean_answers'] = sample['Body_answer'].apply(clean_text)
print(sample.head())

   Id_answer  OwnerUserId_answer   CreationDate_answer  ParentId  \
0         92                61.0  2008-08-01T14:45:37Z        90   
1        124                26.0  2008-08-01T16:09:47Z        80   
2        199                50.0  2008-08-01T19:36:46Z       180   
3        269                91.0  2008-08-01T23:49:57Z       260   
4        307                49.0  2008-08-02T01:49:46Z       260   

   Score_answer                                        Body_answer  \
0            13  <p><a href="http://svnbook.red-bean.com/">Vers...   
1            12  <p>I wound up using this. It is a kind of a ha...   
2             1  <p>I've read somewhere the human eye can't dis...   
3             4  <p>Yes, I thought about that, but I soon figur...   
4            28  <p><a href="http://www.codeproject.com/Article...   

   Id_question  OwnerUserId_question CreationDate_question  \
0           90                  58.0  2008-08-01T14:41:24Z   
1           80                  26.0  2008-08-

In [13]:
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import chromadb

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Предобработка текстов
sample['qa_text'] = sample['clean_questions'] + " " + sample['clean_answers']
texts = sample['qa_text'].tolist()
ids = [str(i) for i in range(len(sample))]
questions = sample['clean_questions'].tolist()
answers = sample['clean_answers'].tolist()
tags = sample['Tags'].tolist() if 'Tags' in sample.columns else [None] * len(sample)
urls = sample['Link'].tolist() if 'Link' in sample.columns else [None] * len(sample)

# Настройка клиента ChromaDB
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="StackOverflowQnA")

# Параметры загрузки
embedding_batch_size = 64
chroma_batch_size = 1000

# Буферы для накопления перед загрузкой в Chroma
buffer_ids = []
buffer_embeddings = []
buffer_documents = []
buffer_metadatas = []

# Процессинг и загрузка
with torch.no_grad():
    for start_idx in tqdm(range(0, len(texts), embedding_batch_size), desc="Обработка батчей"):
        end_idx = start_idx + embedding_batch_size

        batch_texts = texts[start_idx:end_idx]
        batch_ids = ids[start_idx:end_idx]
        batch_questions = questions[start_idx:end_idx]
        batch_answers = answers[start_idx:end_idx]
        batch_tags = tags[start_idx:end_idx]
        batch_urls = urls[start_idx:end_idx]

        # Эмбеддинг
        batch_embeddings = model.encode(
            batch_texts,
            batch_size=embedding_batch_size,
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False
        ).cpu()

        # Добавление в буферы
        for i in range(len(batch_texts)):
            buffer_ids.append(batch_ids[i])
            buffer_documents.append(batch_texts[i])
            buffer_embeddings.append(batch_embeddings[i].tolist())  # <-- исправлено здесь
            buffer_metadatas.append({
    "question": batch_questions[i],
    "answer": batch_answers[i],
    "tags": batch_tags[i] if batch_tags[i] is not None else "",
    "url": batch_urls[i] if batch_urls[i] is not None else ""
})


        # Загрузка, если буфер достиг лимита
        if len(buffer_ids) >= chroma_batch_size:
            collection.add(
                ids=buffer_ids,
                documents=buffer_documents,
                embeddings=buffer_embeddings,
                metadatas=buffer_metadatas
            )
            buffer_ids, buffer_embeddings, buffer_documents, buffer_metadatas = [], [], [], []

# Финальная загрузка оставшихся данных
if buffer_ids:
    collection.add(
        ids=buffer_ids,
        documents=buffer_documents,
        embeddings=buffer_embeddings,
        metadatas=buffer_metadatas
    )

# Проверка готовности
print(f"Итого в коллекции: {collection.count()} документов.")


Обработка батчей: 100%|██████████| 47/47 [02:36<00:00,  3.34s/it]


Итого в коллекции: 3000 документов.


In [19]:
def semantic_search(query: str, collection, model, top_k=5, device='cpu'):
    """
    Выполняет семантический поиск по коллекции Chroma.
    
    Args:
        query (str): текст запроса.
        collection: объект коллекции Chroma.
        model: модель SentenceTransformer для вычисления эмбеддингов.
        top_k (int): число возвращаемых результатов.
        device (str): устройство для вычислений ('cpu' или 'cuda').
        
    Returns:
        list of dict: список топ-N результатов с полями:
            - 'question' (str)
            - 'answer' (str)
            - 'tags' (str)
            - 'url' (str)
            - 'score' (float) — косинусное сходство
    """
    # вычисляем эмбеддинг запроса
    query_embedding = model.encode([query], convert_to_tensor=True, device=device).cpu().tolist()[0]
    
    # ищем ближайших соседей в Chroma
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['metadatas', 'documents', 'distances']
    )
    
    # формируем вывод
    hits = []
    for i in range(len(results['ids'][0])):
        hits.append({
            'question': results['metadatas'][0][i].get('question', ''),
            'answer': results['metadatas'][0][i].get('answer', ''),
            'tags': results['metadatas'][0][i].get('tags', ''),
            'score': 1 - results['distances'][0][i]  # расстояние, обратим в сходство
        })
    return hits


In [23]:
#проверка на тех же вопросах, что в лабе4
query1 = "what i need to do if webhook crashed my telegram bot"
top_results = semantic_search(query1, collection, model)
for i, res in enumerate(top_results, 1):
    print(f"Результат #{i} (score={res['score']:.3f}):")
    print(f"Вопрос: {res['question']}")
    print(f"Ответ: {res['answer']}")
    print(f"Теги: {res['tags']}")
    print(f"Ссылка: {res['url']}\n")

Результат #1 (score=-0.442):
Вопрос: of all the forms of captcha available which one is the least crackable while remaining fairly human readable

Ответ: i agree with thomas captcha is on its way out but if you must use it recaptcha  is a pretty good provider with a simple api

Теги: 
Ссылка: 

Результат #2 (score=-0.459):
Вопрос: of all the forms of captcha available which one is the least crackable while remaining fairly human readable

Ответ: 
  i believe that captcha is dying if someone really wants to break it it will be broken i read somewhere dont remember where about a site that gave you free porn in exchange for answering captchas to they can be rendered obsolete by bots so why bother


anyone who really wants to break this padlock can use a pair of bolt cutters so why bother with the lock
anyone who really wants to steal this car can drive up with a tow truck so why bother locking my car
anyone who really wants to open this safe can cut it open with an oxyacetylene torch so w

In [24]:
query2 = "how to download a file"
top_results2 = semantic_search(query2, collection, model)
for i, res in enumerate(top_results2, 1):
    print(f"Результат #{i} (score={res['score']:.3f}):")
    print(f"Вопрос: {res['question']}")
    print(f"Ответ: {res['answer']}")
    print(f"Теги: {res['tags']}")
    print(f"Ссылка: {res['url']}\n")

Результат #1 (score=-0.070):
Вопрос: what is the best way to implement from a web page a download action using aspnet 

log files for a action are created in a directory called application rootlogs  i have the full path and want to provide a button that when clicked will download the log file from the iis server to the users local pc

Ответ: does this help

httpwwwwestwindcomweblogpostsaspx

responsecontenttype  applicationoctetstream
responseappendheadercontentdispositionattachment filenamelogfiletxt
responsetransmitfile servermappathlogfiletxt 
responseend


responsetransmitfile is the accepted way of sending large files instead of responsewritefile

Теги: 
Ссылка: 

Результат #2 (score=-0.072):
Вопрос: on a phpbased web site i want to send users a download package after they have filled out a short form the siteinitiated download should be similar to sites like downloadcom which say your download will begin in a moment

a couple of possible approaches i know about and browser compat

In [25]:
query3 = "how to segment a picture"
top_results3 = semantic_search(query3, collection, model)
for i, res in enumerate(top_results2, 1):
    print(f"Результат #{i} (score={res['score']:.3f}):")
    print(f"Вопрос: {res['question']}")
    print(f"Ответ: {res['answer']}")
    print(f"Теги: {res['tags']}")
    print(f"Ссылка: {res['url']}\n")

Результат #1 (score=-0.070):
Вопрос: what is the best way to implement from a web page a download action using aspnet 

log files for a action are created in a directory called application rootlogs  i have the full path and want to provide a button that when clicked will download the log file from the iis server to the users local pc

Ответ: does this help

httpwwwwestwindcomweblogpostsaspx

responsecontenttype  applicationoctetstream
responseappendheadercontentdispositionattachment filenamelogfiletxt
responsetransmitfile servermappathlogfiletxt 
responseend


responsetransmitfile is the accepted way of sending large files instead of responsewritefile

Теги: 
Ссылка: 

Результат #2 (score=-0.072):
Вопрос: on a phpbased web site i want to send users a download package after they have filled out a short form the siteinitiated download should be similar to sites like downloadcom which say your download will begin in a moment

a couple of possible approaches i know about and browser compat

Чувствительность к словам:
Классический поиск работает только при точных совпадениях слов. Семантический — ищет по смыслу, даже если слова различаются.

Работа с синонимами:
Классический поиск игнорирует синонимы. Семантический — способен находить релевантные ответы с другими словами той же сути.

Переформулировки:
Классический не справляется с перефразированием запроса. Семантический распознаёт разные формулировки одного вопроса.

Ошибки поиска:
Классический делает меньше ошибок за счёт точности, но может пропустить полезные результаты. Семантический находит больше, но иногда включает смежные, не совсем релевантные темы.

In [26]:
#проверка на вопросе, который точно есть в датасете
query4 = "what is the best way to implement from a web page a download action using aspnet"
top_results4 = semantic_search(query4, collection, model)
for i, res in enumerate(top_results2, 1):
    print(f"Результат #{i} (score={res['score']:.3f}):")
    print(f"Вопрос: {res['question']}")
    print(f"Ответ: {res['answer']}")
    print(f"Теги: {res['tags']}")
    print(f"Ссылка: {res['url']}\n")

Результат #1 (score=-0.070):
Вопрос: what is the best way to implement from a web page a download action using aspnet 

log files for a action are created in a directory called application rootlogs  i have the full path and want to provide a button that when clicked will download the log file from the iis server to the users local pc

Ответ: does this help

httpwwwwestwindcomweblogpostsaspx

responsecontenttype  applicationoctetstream
responseappendheadercontentdispositionattachment filenamelogfiletxt
responsetransmitfile servermappathlogfiletxt 
responseend


responsetransmitfile is the accepted way of sending large files instead of responsewritefile

Теги: 
Ссылка: 

Результат #2 (score=-0.072):
Вопрос: on a phpbased web site i want to send users a download package after they have filled out a short form the siteinitiated download should be similar to sites like downloadcom which say your download will begin in a moment

a couple of possible approaches i know about and browser compat

# Отчет по лабораторной работе: Семантический поиск с Chroma

## 1. Хранение данных в векторном виде

Для реализации семантического поиска использовались эмбеддинги — векторные представления текстов, полученные при помощи модели (например, SentenceTransformers или аналогичной). Каждый текст (вопрос/ответ) преобразуется в вектор фиксированной длины, например, 384 или 768 float-значений.

**Объём хранения эмбеддингов:**

- Один float32 = 4 байта.
- Эмбеддинг размерности 384 → 384 × 4 = **1.5 КБ на запись**.
- Для 10 000 записей: **~15 МБ памяти**.

## 2. Использование Chroma

Chroma — это векторное хранилище, использующее Approximate Nearest Neighbors (ANN) для быстрого поиска по эмбеддингам.

- Индексация производится через библиотеки вроде FAISS или HNSW.
- Поиск происходит не по полному перебору, а по ближайшим к искомому вектору (approximate search).
- Обновление: записи можно **добавлять и удалять**, обновление происходит через API `add_documents()`, `delete()`.

```python
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.Client()
collection = client.create_collection("stackoverflow")

# Генерация эмбеддингов
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = qa_df['Title'].tolist()
embeddings = model.encode(texts)

# Добавление в Chroma
collection.add(
    documents=texts,
    embeddings=embeddings.tolist(),
    ids=[str(i) for i in range(len(texts))]
)
```
## 3. Запросы к Chroma
```python
query = "what is the best way to implement from a web page a download action using aspnet"
query_embedding = model.encode([query])

results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=5
)

for res in results['documents'][0]:
    print(res)

```
## 4. Сравнение поиска
- **Тип поиска: TF-IDF**
Принцип: Совпадение по словам
Пример результата: Точное совпадение формулировки
Недостатки: Не распознает синонимы и перефразировки

- **Тип поиска: Семантический (Chroma)**
Принцип: Поиск по смыслу (векторы)
Пример результата: Синонимичные или близкие по смыслу результаты
Недостатки: Требует больше памяти, эмбеддинги