In [3]:
!wget ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0001.xml.gz

--2024-12-21 18:07:21--  ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0001.xml.gz
           => ‘pubmed24n0001.xml.gz’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 130.14.250.10, 130.14.250.11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pubmed/baseline ... done.
==> SIZE pubmed24n0001.xml.gz ... 19648513
==> PASV ... done.    ==> RETR pubmed24n0001.xml.gz ... done.
Length: 19648513 (19M) (unauthoritative)


2024-12-21 18:07:23 (27.4 MB/s) - ‘pubmed24n0001.xml.gz’ saved [19648513]



In [2]:
!gunzip pubmed24n0001.xml.gz

In [4]:
!pip install lxml



In [2]:
from lxml import etree

# Укажите путь к файлу
file_path = "/content/pubmed24n0001.xml"

# Чтение и парсинг XML-файла
def parse_pubmed_file(file_path):
    records = []

    # Открываем и парсим файл
    with open(file_path, "rb") as f:
        tree = etree.parse(f)

    # Ищем все статьи
    for article in tree.xpath("//PubmedArticle"):
        # Извлекаем PMIDs, заголовки и аннотации
        pmid = article.findtext(".//PMID")
        title = article.findtext(".//ArticleTitle")
        abstract = article.findtext(".//AbstractText")

        # Сохраняем только записи с аннотациями
        if abstract:
            records.append({
                "pmid": pmid,
                "title": title,
                "abstract": abstract
            })

    return records

# Пример использования
parsed_records = parse_pubmed_file(file_path)

# Вывод первых 5 записей
for record in parsed_records[:5]:
    print(f"PMID: {record['pmid']}\nTitle: {record['title']}\nAbstract: {record['abstract']}\n")


PMID: 21
Title: [Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)].
Abstract: (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.

PMID: 24
Title: Influence of a new virostatic compound on the induction of enzymes in rat liver.
Abstract: The virostatic compound N,N-diethyl-4-[2-(2-oxo-3-tetradecyl-1-imidazolidinyl)-ethyl]-1-piperazinecarboxamide-hydrochloride (5531) was analyzed as to its effect on the induction of tryptophan-pyrrolase and tyrosineaminotransferase in rat liver. 1. The basic activity of the enzymes was not influenced by the substance either in no

In [3]:
len(parsed_records)

15401

In [4]:
len(parsed_records[0].get('abstract'))

395

In [5]:
# parsed_records[0].get('abstract')
max([len(x.get('abstract')) for x in parsed_records])

4104

In [6]:
!pip install transformers langchain



In [9]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Параметры разбиения текста
CHUNK_SIZE = 350  # Максимальная длина одного фрагмента (в символах)
CHUNK_OVERLAP_SCALE = 0.1  # Процент перекрытия между фрагментами
CHUNK_OVERLAP = int(CHUNK_SIZE * CHUNK_OVERLAP_SCALE)

# Создание текстового разделителя
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],  # Разделители: параграфы, строки, пробелы
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Преобразование данных в формат Document
documents = [
    Document(page_content=record["abstract"], metadata={"id": record["pmid"]})
    for record in parsed_records if record.get("abstract")
]

# Разделение текстов на фрагменты
splits = splitter.split_documents(documents)

# Вывод информации о результатах
print(f"Количество исходных документов: {len(documents)}")
print(f"Количество фрагментов после разбиения: {len(splits)}")


Количество исходных документов: 15401
Количество фрагментов после разбиения: 53542


In [8]:
import torch
torch.cuda.is_available()

False

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Проверка доступности GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

# Загрузка модели
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device=device)

# Пример данных (замените `texts` на ваши фрагменты)
texts = [split.page_content for split in splits]

# Получение эмбеддингов
embeddings = model.encode(
    texts,
    batch_size=32,  # Размер батча, регулируйте в зависимости от памяти GPU
    convert_to_tensor=True,  # Эмбеддинги сразу возвращаются в формате PyTorch тензоров
    show_progress_bar=True  # Показывать прогресс выполнения
)

print(f"Создано {len(embeddings)} эмбеддингов.")


Используемое устройство: cpu


Batches:   0%|          | 0/1674 [00:00<?, ?it/s]

In [None]:
import numpy as np

# Сохранение эмбеддингов и соответствующих текстов
np.save("embeddings.npy", np.array(embeddings))
with open("texts.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text + "\n")

print("Эмбеддинги и тексты успешно сохранены!")


Эмбеддинги и тексты успешно сохранены!


In [None]:
# Перенос индекса с GPU на CPU
cpu_index = faiss.index_gpu_to_cpu(gpu_index)

# Сохранение индекса на диск
faiss.write_index(cpu_index, "faiss_index.index")
print("Индекс успешно сохранён на диск!")

Индекс успешно сохранён на диск!


In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Пусть у нас есть следующие данные
# embeddings: тензор или numpy-массив эмбеддингов (например, 384-мерные векторы)
# texts: список текстов, соответствующих этим эмбеддингам
# metadata: (опционально) список метаданных, например, {"id": id текста, "source": "PubMed"}

# Пример для метаданных
metadata = [{"id": i, "source": "abstract"} for i in range(len(texts))]

# Создание векторного хранилища Chroma
vectorstore = Chroma.from_embeddings(
    embeddings=embeddings,
    texts=texts,
    metadatas=metadata,
    persist_directory="chroma_index",  # Путь для сохранения индекса
)

# Сохранение индекса
vectorstore.persist()
print("Индекс успешно создан и сохранён.")


Проверка Chroma

In [None]:
# Загрузка индекса
vectorstore = Chroma(persist_directory="chroma_index", embedding_function=None)

# Проверка данных
retriever = vectorstore.as_retriever()
results = retriever.get_relevant_documents("What are common cancer treatments?")
for result in results:
    print(f"Текст: {result.page_content}")
    print(f"Метаданные: {result.metadata}")


In [None]:
# Создание retriever для поиска
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
print("Поисковый механизм настроен.")

In [None]:
# Тестирование поиска
query = "What are common cancer treatments?"
results = retriever.get_relevant_documents(query)

# Вывод релевантных документов
print("Релевантные документы:")
for result in results:
    print(f"Текст: {result.page_content}")
    print(f"Метаданные: {result.metadata}")

In [None]:
from langchain_community.llms import HuggingFaceHub
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [None]:
# Создаём LLM на основе Hugging Face
llm_pipeline = pipeline("text-generation", model="bigscience/bloomz-560m", device=0, max_length=150,  # Общая длина (включая входные данные)
    max_new_tokens=100)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

In [None]:

# Форматирование документов
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Определяем цепочку Retrieval + Generation
rag_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True
)


# Пример запроса
query = "What are the common cancer treatments?"
result = rag_chain({"query": query})

# Вывод результата
print("Ответ:")
print(result["result"])

print("\nИспользованные документы:")
for doc in result["source_documents"]:
    print(f"- {doc.page_content}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=100) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Ответ:
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

about the extension of the disease.

protein in blood of patients with cancer.

remain the drug treatment of choice.

of patients who have had malignant disease are discussed.

Question: What are the common cancer treatments?
Helpful Answer: chemotherapy

Использованные документы:
- about the extension of the disease.
- protein in blood of patients with cancer.
- remain the drug treatment of choice.
- of patients who have had malignant disease are discussed.


In [None]:
import requests

# Настройки API Mistral
api_key = "MY_MISTRAL_API"  # Ваш ключ API
model = "mistral-large-latest"  # Выбранная модель
base_url = "https://api.mistral.ai/v1/chat/completions"

headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# Функция для вызова Mistral API
def call_mistral_api(prompt, max_tokens=500, temperature=0.7, top_p=0.95):
    payload = {
        "model": model,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post(base_url, json=payload, headers=headers)
    response.raise_for_status()  # Вызывает исключение при ошибке
    return response.json()["choices"][0]["message"]["content"]

# Пример запроса к Mistral
test_prompt = "What are the latest advancements in cancer treatments?"
print("Тестовый ответ Mistral:")
print(call_mistral_api(test_prompt))


In [None]:
from langchain.chains import RetrievalQA

# Настройка RAG с использованием Mistral API
class MistralLLM:
    def __init__(self, api_key, model, base_url, max_tokens=500, temperature=0.7, top_p=0.95):
        self.api_key = api_key
        self.model = model
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.top_p = top_p

    def generate(self, prompt):
        payload = {
            "model": self.model,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "max_tokens": self.max_tokens,
            "messages": [{"role": "user", "content": prompt}]
        }

        response = requests.post(self.base_url, json=payload, headers=self.headers)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]

# Инициализация генеративной модели Mistral
mistral_llm = MistralLLM(
    api_key="my_mistral_api",
    model="mistral-large-latest",
    base_url="https://api.mistral.ai/v1/chat/completions"
)

# Создание RAG-цепочки
def rag_chain(query, retriever, mistral_llm):
    # Шаг 1: Поиск релевантных документов
    results = retriever.get_relevant_documents(query)
    context = "\n\n".join([doc.page_content for doc in results])

    # Шаг 2: Формирование запроса к модели Mistral
    prompt = f"Используя следующие данные, ответь на запрос: {query}\n\nКонтекст:\n{context}"

    # Шаг 3: Генерация ответа
    response = mistral_llm.generate(prompt)

    return {"result": response, "source_documents": results}

# Пример использования
query = "What are the latest advancements in cancer treatments?"
result = rag_chain(query, retriever, mistral_llm)

# Вывод результата
print("Ответ:")
print(result["result"])

print("\nИспользованные документы:")
for doc in result["source_documents"]:
    print(f"- {doc.page_content}")
