In [1]:
!pip install transformers langchain



In [2]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.13 (from langchain-community)
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-community)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

Загружаем эмбеддинги и тексты

In [4]:
import numpy as np

# Путь к сохранённым данным
embeddings_path = "embeddings.npy"  # Путь к файлу с эмбеддингами
texts_path = "texts.txt"  # Путь к файлу с текстами

# Загрузка эмбеддингов
embeddings = np.load(embeddings_path)
print(f"Эмбеддинги загружены: {embeddings.shape}")

# Загрузка текстов
with open(texts_path, "r", encoding="utf-8") as f:
    texts = [line.strip() for line in f]
print(f"Тексты загружены: {len(texts)}")


Эмбеддинги загружены: (53542, 384)
Тексты загружены: 53542


In [7]:
import faiss
import numpy as np

# Убедимся, что количество эмбеддингов совпадает с количеством текстов
assert len(embeddings) == len(texts), "Количество эмбеддингов и текстов должно совпадать!"

# Создание FAISS индекса
embedding_dim = embeddings.shape[1]  # Размерность эмбеддингов
index = faiss.IndexFlatL2(embedding_dim)  # Индекс для поиска по L2-норме (евклидово расстояние)

# Добавление эмбеддингов в индекс
index.add(embeddings)
print(f"Добавлено {index.ntotal} эмбеддингов в индекс.")


Добавлено 53542 эмбеддингов в индекс.


In [8]:
# Сохранение индекса на диск
faiss.write_index(index, "faiss_index")
print("Индекс сохранён.")


Индекс сохранён.


In [None]:
# # Загрузка индекса с диска
# index = faiss.read_index("faiss_index")
# print("Индекс загружен.")


In [62]:
from transformers import AutoTokenizer, AutoModel
import torch

# Загрузка предобученной модели и токенизатора
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Компактная модель для векторизации текстов
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Перенос модели на устройство (GPU, если доступен)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Функция для получения эмбеддингов с использованием GPU
def get_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    tokens = {key: value.to(device) for key, value in tokens.items()}  # Перенос данных на устройство
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Среднее по токенам

In [20]:
def retrieve_texts(query_embedding, index, texts, k=5):
    """
    Выполняет поиск ближайших соседей в индексе FAISS.
    query_embedding: np.array, эмбеддинг запроса (размерность должна совпадать с индексом)
    index: faiss.Index, индекс FAISS
    texts: list[str], список текстов
    k: int, количество ближайших соседей
    """
    distances, indices = index.search(query_embedding.reshape(1, -1), k)
    results = [(texts[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
    return results

# Пример использования
query_embedding = embeddings[0]  # Пример: первый эмбеддинг как запрос
retrieved_texts = retrieve_texts(query_embedding, index, texts, k=5)

# Вывод результатов
for text, distance in retrieved_texts:
    print(f"Текст: {text} (Расстояние: {distance})")


Текст: (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with (Расстояние: 0.0)
Текст: by specific pepsin inactivators. The pH activity curve of the purified enzyme showed two optima near pH 3 and 4. The relative activities at these optimal pH values were affected by salt concentration. Experimental evidence indicated that the two-optima phenomenon is a property of a single enzyme species. (Расстояние: 0.7951406836509705)
Текст: of cystine, aspartic acid, and serine. It inhibited trypsin in a molar ratio of 1 : 1 and alpha-chymotrypsin in a molar ratio of 2 : 1. It, however, inhibited neither pepsin nor pronase. It was relatively stable to heat treatment in the acidic medium, but not in the alkal

In [23]:
# Пример использования
query = "What are the common treatments for cancer?"
query_embedding = get_embedding(query)  # Получаем эмбеддинг запроса
results = retrieve_texts(query_embedding, index, texts, k=5)

# Вывод результатов
for text, distance in results:
    print(f"Текст: {text} (Расстояние: {distance})")

Текст: appears to have a future as an adjunct to existing therapy in order to control as much as to cure residual tumour. (Расстояние: 41.236751556396484)
Текст: of the treatment are looked at. (Расстояние: 41.60153579711914)
Текст: demonstrates the urgent need to complete randomized controlled trials of treatment in this group. (Расстояние: 41.81917953491211)
Текст: medical or surgical therapeutic measures applied may be fully effective. (Расстояние: 41.88026428222656)
Текст: and made recommendations for surgical management. (Расстояние: 41.95117950439453)


In [61]:
import requests

# Настройка API
api_key = "4Cs8hBRAzeJjgfAYR6ilpTFOtsTvCqrr"
model_LLM = "mistral-large-latest"
base_url = "https://api.mistral.ai/v1/chat/completions"

headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

def generate_answer_with_mistral(prompt):
    payload = {
        "model": model_LLM,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post(base_url, json=payload, headers=headers)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

# Пример генерации ответа
prompt = "What are the latest advancements in cancer treatments?"
response = generate_answer_with_mistral(prompt)
print(f"Ответ: {response}")


Ответ: Cancer treatment is a rapidly evolving field, with numerous advancements being made in recent years. Here are some of the latest developments:

1. **Immunotherapy**: This approach uses the body's own immune system to fight cancer. Recent advancements include:
   - **CAR T-cell therapy**: This involves engineering the patient's own T cells to recognize and attack cancer cells. It has shown promising results in certain types of leukemia and lymphoma.
   - **Checkpoint inhibitors**: Drugs like PD-1/PD-L1 and CTLA-4 inhibitors help the immune system recognize and attack cancer cells. These have been approved for various types of cancer, including melanoma, lung cancer, and renal cell carcinoma.
   - **Cancer vaccines**: These are designed to treat existing cancers by stimulating the body's immune response. Some recent vaccines are showing promise in clinical trials.

2. **Targeted Therapies**: These drugs target specific molecules involved in cancer growth and progression. Recent ad

In [52]:
def generate_answer_with_mistral(prompt):
    if not isinstance(prompt, str):
        raise ValueError(f"Prompt должен быть строкой, получен {type(prompt)}")

    payload = {
        "model": model,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post(base_url, json=payload, headers=headers)
    response.raise_for_status()  # Вызывает исключение при HTTP-ошибках
    return response.json()["choices"][0]["message"]["content"]


In [24]:
def rag_pipeline(query, index, texts, mistral_api_key, k=5):
    """
    Полная цепочка RAG: поиск + генерация.
    query: текстовый запрос
    query_embedding: np.array, эмбеддинг запроса.
    index: faiss.Index, индекс FAISS.
    texts: list[str], список текстов.
    mistral_api_key: str, ключ API для Mistral.
    k: int, количество ближайших соседей.
    """
    # 1. Поиск релевантных текстов
    query_embedding = get_embedding(query)
    retrieved_texts = retrieve_texts(query_embedding, index, texts, k)
    context = "\n\n".join([text for text, _ in retrieved_texts])

    # 2. Формирование запроса к модели
    prompt = f"Based on the following context, answer the query:\n\n{context}\n\nQuery:\n\n {query}"

    # 3. Генерация ответа
    answer = generate_answer_with_mistral(prompt)

    return {"answer": answer, "context": context}


In [64]:
# Пример использования
query = "What are the latest advancements in cancer treatments?"
result = rag_pipeline(query, index, texts, api_key, k=5)

# Вывод результата
print(f"Ответ: {result['answer']}")
print("\nКонтекст:")
print(result["context"])

Ответ: The provided context does not specifically mention the latest advancements in cancer treatments, but it does discuss some aspects of cancer management. Here are a few key points from the text:

1. **Combination Therapy**: The text mentions that a certain approach "appears to have a future as an adjunct to existing therapy" to control or cure residual tumors. This suggests that combination therapies, using multiple treatment methods, are being considered or implemented.

2. **Surgical Management**: The text highlights recommendations for surgical management, indicating that surgical techniques continue to be an important part of cancer treatment.

3. **Randomized Controlled Trials**: The text emphasizes the need for randomized controlled trials, which are essential for evaluating the effectiveness and safety of new treatments.

For a more comprehensive and up-to-date list of the latest advancements in cancer treatments, you might want to look into recent developments in fields su