In [None]:
!pip install chromadb
!pip install feedparser
!pip install pdfplumber

In [None]:
from PyPDF2 import PdfReader
import io
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import re
import feedparser
from urllib.parse import quote_plus
import json
import requests
import tempfile
import os
from pdfplumber import open as pdf_open
from typing import List, Dict

In [None]:
def fetch_arxiv_papers(query="transformers", max_results=50):
    """Search arXiv and return metadata for results that expose a PDF link.

    Args:
        query: arXiv search expression (plain keywords or a fielded query
            such as ``all:electron+AND+all:proton``).
        max_results: maximum number of entries requested from the API.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``summary``,
        ``link``, ``pdf`` and ``published``. Entries that have no
        ``application/pdf`` link are omitted.
    """
    # Percent-encode the query so spaces, quotes, '&' etc. cannot break the URL.
    # ':' and '+' stay untouched so fielded / already '+'-joined queries keep working.
    encoded_query = quote_plus(str(query), safe=":+")
    url = (
        "http://export.arxiv.org/api/query"
        f"?search_query={encoded_query}&start=0&max_results={max_results}"
        "&sortBy=relevance&sortOrder=descending"
    )
    feed = feedparser.parse(url)

    papers = []
    for entry in feed.entries:
        # First link advertised as a PDF, or None when the entry has none.
        pdf_link = next(
            (link['href'] for link in entry.links if link.get('type') == 'application/pdf'),
            None,
        )
        if pdf_link is None:
            continue
        papers.append({
            "id": entry.id.split('/abs/')[-1],
            "title": entry.title,
            "summary": entry.summary,
            "link": entry.link,
            "pdf": pdf_link,
            "published": entry.published,
        })
    return papers


In [None]:
def fetch_arxiv_by_ids(paper_ids):
    """Fetch arXiv metadata for each identifier in ``paper_ids``.

    One API request is issued per identifier. Identifiers for which the API
    returns no entry are reported and skipped instead of aborting the batch.

    Args:
        paper_ids: iterable of arXiv identifiers, e.g. ``"1706.03762"``.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``summary``,
        ``link``, ``pdf`` and ``published``. ``pdf`` is ``None`` when the
        entry has no ``application/pdf`` link.
    """
    papers = []
    for pid in paper_ids:
        url = f"http://export.arxiv.org/api/query?search_query=id:{pid}&start=0&max_results=1"
        feed = feedparser.parse(url)

        # An unknown id or a failed request yields an empty feed; indexing
        # entries[0] unconditionally would raise IndexError and lose the batch.
        if not feed.entries:
            print(f"No arXiv entry found for id {pid!r}; skipping")
            continue
        entry = feed.entries[0]

        pdf_link = next(
            (link['href'] for link in entry.links if link.get('type') == 'application/pdf'),
            None,
        )

        papers.append({
            "id": entry.id.split('/abs/')[-1],
            "title": entry.title,
            "summary": entry.summary,
            "link": entry.link,
            "pdf": pdf_link,
            "published": entry.published
        })

    return papers

In [None]:
# Hand-picked arXiv identifiers that are always added to the corpus.
paper_ids = [
    "1706.03762",  # Attention Is All You Need
    "2601.09732",  # Benchmarking Cross-Lingual Semantic Alignment in Multilingual Embeddings
    "1810.04805",  # BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
    "2005.14165",  # Language Models are Few-Shot Learners
    "1907.11692",  # RoBERTa: A Robustly Optimized BERT Pretraining Approach
    "2106.04554",  # A Survey of Transformers
    "2508.09834",  # Speed Always Wins: A Survey on Efficient Architectures for Large Language Models
]

In [None]:
# Look up the hand-picked IDs; the returned list of metadata dicts is the cell's output.
fetch_arxiv_by_ids(paper_ids)

In [None]:
def extract_text_from_pdf(pdf_url):
    """Download a PDF and return its cleaned text, or "" on any failure.

    The PDF is written to a temporary file, read page by page with
    pdfplumber, each page is passed through ``clean_page_text`` and the
    pages are joined with newlines.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        The extracted text, stripped. An empty string when the download or
        the parsing fails (the error is printed, not raised).
    """
    tmp_path = None
    try:
        # 1. Download; the timeout keeps a stalled server from hanging the notebook.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Remember the path first so the cleanup below also covers a failed write.
            tmp_path = tmp_file.name
            tmp_file.write(response.content)

        # 2. Extract the text page by page.
        text_parts = []
        with pdf_open(tmp_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                words = page.extract_words(
                    x_tolerance=3,          # horizontal distance between words
                    y_tolerance=3,          # vertical distance between words
                    keep_blank_chars=False,
                    use_text_flow=True,     # follow the text flow of the page
                    extra_attrs=["fontname", "size"]  # extra per-word attributes
                )

                # Rebuild the page text from the individual words.
                page_text = " ".join([word["text"] for word in words])

                if page_text:
                    # 3. Remove page-level artefacts.
                    cleaned = clean_page_text(page_text, page_num)
                    text_parts.append(cleaned)

        # 4. Join the pages and collapse blank lines.
        full_text = "\n".join(text_parts)
        full_text = re.sub(r'\n\s*\n', '\n\n', full_text)
        return full_text.strip()

    except Exception as e:
        print(f"Ошибка при обработке {pdf_url}: {e}")
        return ""

    finally:
        # 5. Always remove the temporary file, including when parsing raised;
        #    otherwise every broken PDF leaves a file behind.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

def clean_page_text(text, page_num, max_header_len=200):
    """Remove typical PDF-extraction artefacts from the text of one page.

    Args:
        text: raw text of a single page.
        page_num: 1-based page number; a standalone occurrence of it on its
            own line or at the very end of the text is treated as a printed
            page number and removed.
        max_header_len: only lines of at most this many characters are
            considered running headers/footers. The cap keeps a page that was
            extracted as one long line from being deleted entirely just
            because it mentions "page", "volume", "arXiv", etc.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    page_pattern = re.escape(str(page_num))

    # 1. Drop page numbers. The trailing variant requires whitespace (or the
    #    start of the text) before the number, so "2017" on page 7 is not
    #    truncated to "201".
    text = re.sub(rf'^\s*{page_pattern}\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(rf'(?:^|\s+){page_pattern}\s*$', '', text)

    # 2. Drop running headers/footers: short lines containing a typical marker.
    header_re = re.compile(r'arXiv|Submitted|Published|Volume|Issue|Page', re.IGNORECASE)
    text = '\n'.join(
        '' if len(line) <= max_header_len and header_re.search(line) else line
        for line in text.split('\n')
    )

    # 3. Split "glued" tokens (a frequent PDF problem): insert a space between
    #    letter+digit, digit+letter, letter+symbol and symbol+letter.
    text = re.sub(r'([a-zA-Z])([0-9])', r'\1 \2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)
    text = re.sub(r'([a-zA-Z])([^\w\s])', r'\1 \2', text)
    text = re.sub(r'([^\w\s])([a-zA-Z])', r'\1 \2', text)

    # 4. Normalise whitespace. Only horizontal whitespace is collapsed here;
    #    collapsing '\n' as well would turn the newline handling below into dead code.
    text = re.sub(r'[^\S\n]+', ' ', text)  # runs of spaces/tabs -> one space
    text = re.sub(r' *\n *', '\n', text)   # spaces around line breaks
    text = re.sub(r'\n+', '\n', text)      # runs of line breaks -> one

    # 5. Drop lines consisting of a single character (usually artefacts). The
    #    lookahead leaves the closing '\n' available for the next match, so
    #    consecutive one-character lines are all removed.
    text = re.sub(r'\n.(?=\n)', '', text)

    return text.strip()

In [None]:
def split_into_chunks(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split ``text`` into chunks of roughly ``chunk_size`` characters.

    Lines of ``text`` are treated as paragraphs and packed greedily into
    chunks. A paragraph longer than ``chunk_size`` is split on word
    boundaries; consecutive pieces of such a paragraph share the last
    ``overlap`` characters of the previous piece.

    Args:
        text: input text; paragraphs are separated by ``'\\n'``.
        chunk_size: target maximum chunk length in characters (> 0).
        overlap: number of trailing characters carried over between pieces of
            an oversized paragraph (0 <= overlap < chunk_size).

    Returns:
        A list of non-empty, stripped chunks.

    Raises:
        ValueError: if ``chunk_size``/``overlap`` are out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    chunks = []

    current_chunk = ""
    for paragraph in paragraphs:
        if len(current_chunk) + len(paragraph) <= chunk_size:
            current_chunk += " " + paragraph
            continue

        if current_chunk:
            chunks.append(current_chunk.strip())

        if len(paragraph) <= chunk_size:
            current_chunk = paragraph
            continue

        # The paragraph alone is too large: split it on word boundaries.
        piece_limit = chunk_size - overlap
        temp_chunk = ""
        for word in paragraph.split():
            if len(temp_chunk) + len(word) < piece_limit:
                temp_chunk += " " + word
                continue
            flushed = temp_chunk.strip()
            if flushed:
                # Skip the empty piece produced when the very first word
                # already exceeds the limit.
                chunks.append(flushed)
            # temp_chunk[-0:] is the WHOLE string, so overlap == 0 needs an
            # explicit empty tail; otherwise pieces would keep growing.
            tail = temp_chunk[-overlap:] if overlap > 0 else ""
            temp_chunk = tail + " " + word
        leftover = temp_chunk.strip()
        if leftover:
            chunks.append(leftover)
        current_chunk = ""

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


In [None]:
def process_papers_to_chunks(query, max_results=50, chunk_size=500, overlap=100, exact_ids=False):
    """Fetch papers, extract their PDF text and return one record per chunk.

    Args:
        query: search expression, or a list of arXiv ids when ``exact_ids``
            is true.
        max_results: number of search results to request (search mode only).
        chunk_size: forwarded to ``split_into_chunks``.
        overlap: forwarded to ``split_into_chunks``.
        exact_ids: when true, ``query`` is passed to ``fetch_arxiv_by_ids``
            instead of ``fetch_arxiv_papers``.

    Returns:
        A list of dicts with the keys ``paper_id``, ``title``,
        ``chunk_index``, ``content`` and ``url``. Papers without a PDF link
        or without extractable text contribute nothing.
    """
    papers = (
        fetch_arxiv_by_ids(query)
        if exact_ids
        else fetch_arxiv_papers(query=query, max_results=max_results)
    )

    processed_docs = []
    for paper in papers:
        pdf_url = paper["pdf"]
        if not pdf_url:
            continue

        full_text = extract_text_from_pdf(pdf_url)
        if not full_text:
            continue

        pieces = split_into_chunks(full_text, chunk_size=chunk_size, overlap=overlap)
        processed_docs.extend(
            {
                "paper_id": paper["id"],
                "title": paper["title"],
                "chunk_index": position,
                "content": piece,
                "url": paper["link"],
            }
            for position, piece in enumerate(pieces)
        )

    return processed_docs

# Build the corpus from a keyword search plus the hand-picked list of paper IDs.
search_docs = process_papers_to_chunks("transformers", max_results=10, chunk_size=800, overlap=100)
pinned_docs = process_papers_to_chunks(paper_ids, exact_ids=True)

# A hand-picked paper may also be returned by the search. Keeping both copies
# would yield repeated "<paper_id>_chunk_<n>" ids when the chunks are added to
# the collection, so papers already covered by the search are dropped here.
_seen_paper_ids = {doc["paper_id"] for doc in search_docs}
docs = search_docs + [doc for doc in pinned_docs if doc["paper_id"] not in _seen_paper_ids]

#with open("processed_chunks.json", "w", encoding="utf-8") as f:
#    json.dump(docs, f, ensure_ascii=False, indent=2)

In [None]:
# Persistent Chroma store on disk; the collection is recreated from scratch on every run.
chroma_client = chromadb.PersistentClient(path="./data")
try:
    chroma_client.delete_collection("arxiv_collection")
except Exception:
    # Expected on the first run, when the collection does not exist yet (the
    # exact exception type differs between chromadb releases). `Exception`
    # instead of a bare `except:` keeps KeyboardInterrupt/SystemExit working.
    pass

embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = chroma_client.create_collection(name="arxiv_collection", embedding_function=embedding_function)

In [None]:
# Three parallel lists, one element per chunk: the text, its metadata, and a
# unique id made of the paper id and the chunk index.
documents = [record['content'] for record in docs]
metadatas = [
    {"title": record['title'], "paper_id": record['paper_id'], "url": record['url']}
    for record in docs
]
ids = [
    f"{record['paper_id']}_chunk_{record['chunk_index']}"
    for record in docs
]

collection.add(ids=ids, documents=documents, metadatas=metadatas)

In [None]:
def search(query, n=5):
    """Return the texts of the ``n`` chunks closest to ``query``.

    The query is embedded with the module-level ``embedding_function`` and
    looked up in the module-level ``collection``. Only the chunk texts are
    requested (no metadata such as title or URL).

    Returns:
        The retrieved chunk texts joined with ``" \\n"``.
    """
    query_vectors = embedding_function([query])
    hits = collection.query(
        query_embeddings=query_vectors,
        n_results=n,
        include=["documents"],
    )
    top_documents = hits['documents'][0]
    return " \n".join(map(str, top_documents))

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-4B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# `torch_dtype` is deprecated in the installed transformers release (see the
# warning in this cell's recorded output); `dtype` is its replacement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Evaluation questions (in Russian) asked against the indexed papers.
questions = [
    "Какой порог SA рекомендует Semanscope для быстрой фильтрации моделей с достаточным кросс-языковым выравниванием?",
    "Почему mBERT и LaBSE, имея одинаковую BERT-base архитектуру, показывают различие в SA на 40%?",
    "Что такое механизм внимания, для какой цели он применяется?",
    "В чём заключаются вычислительные выгоды для вычисления attention-представлений по сравнению с последовательными RNN-кодировщиками?",
    "Какое преимущество few-shot промптов, по сравнению с one-shot промптами?",
    "Что такое Winograd-Style Tasks в контексте задач для трансформеров?",
    "Какие ключевые недостатки были у GPT-3?",
    "Что такое иерархические нейросети-трансформеры?",
    "Что предложили в трансформерах взамен батч-нормализации? И это стало работать лучше",
    "Какая вычислительная сложность у трансформерной нейросети?",
]

In [None]:
# --- Step 1: translate the Russian question to English for retrieval ---------
russian_prompt = questions[5]
translate_prompt = """
Translate question in Russian about computer science to English
"""
translate_messages = [
    {"role": "system", "content": translate_prompt},
    {"role": "user", "content": russian_prompt}
]
# add_generation_prompt=True appends the assistant-turn header, so the model
# starts writing its reply right after the prompt. With False the template ends
# after the user turn and the model has to open the assistant turn itself.
text = tokenizer.apply_chat_template(
    translate_messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=100
)
# Keep only the newly generated tokens. No extra "+1" offset: the prompt already
# ends with the assistant header, so nothing has to be skipped.
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

prompt = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
print(prompt)

# --- Step 2: retrieve context chunks for the translated question -------------
chunks = search(prompt, n=20)

# --- Step 3: answer in Russian, grounded in the retrieved chunks -------------
initial_prompt = """
You are an assistant, you give an answer on user's question using earlier retrieved fragments from Arxiv papers.
The request is in this form
user: <user's request>
===
assistant:
<chunk_1>
<chunk_2>
...
user: Answer (in Russian):
You give an answer in Russian language based on retrieved chunks with links
If there is no information in text chunks, and you are not confident - print only "Я не знаю, мне недостаточно информации".
Don't create something yourself!
In chunks there maybe words with no space delimeter
"""
# The retrieved chunks are passed as a preceding assistant turn, matching the
# layout described in the system prompt above.
messages = [
    {"role": "system", "content": initial_prompt},
    {"role": "user", "content": prompt},
    {"role": "assistant", "content": chunks},
    {"role": "user", "content": "Answer (in Russian):"}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# Drop everything up to and including the last token with id 151668, if present
# (presumably Qwen3's end-of-thinking marker "</think>", as in the model card
# example); when it is absent the whole output is the answer.
try:
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("Answer:", content)

What are Winograd-style tasks in the context of tasks for transformers?
Answer: Я не знаю, мне недостаточно информации.
