### Import Libraries

In [1]:
import os
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Loading PDF

In [2]:
# Location of the PDF to query
file_path = "./paper.pdf"

# Read the PDF and keep the text of its first document as the context
loader = PyMuPDFLoader(file_path)
docs = loader.load()
context = docs[0].page_content

### Qwen Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE(review): this checkpoint name is reassigned to "Qwen/Qwen3-1.7B" in a later
# cell of this section before any model is loaded, so this value is never used here.
model_name = "Qwen/Qwen3-4B"

In [2]:
import torch

In [3]:
# Display a device object for CUDA device index 0; the result is not assigned to any name.
torch.device("cuda:0")

device(type='cuda', index=0)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint used for the plain chat demo in this section
model_name = "Qwen/Qwen3-1.7B"

# Tokenizer and causal-LM weights both come from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.07s/it]


In [6]:
# Build a single-turn conversation: one user message carrying the prompt text
prompt = "Give me a short introduction to large language model."
messages = [{"role": "user", "content": prompt}]

In [7]:
# Render `messages` through the tokenizer's chat template without tokenizing
# (tokenize=False); the rendered text is tokenized separately in the next step.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)

In [8]:
# Tokenize the rendered prompt as a batch of one, return PyTorch tensors,
# and move them to the same device as the model.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [9]:
# Run generation on the tokenized prompt
generated_ids = model.generate(**model_inputs, max_new_tokens=32768)

# Skip the first sequence's leading prompt-length tokens and keep the remainder as a plain list
output_ids = generated_ids[0][model_inputs.input_ids.shape[1]:].tolist()

In [10]:
# Locate the split point between reasoning and answer:
# one past the LAST occurrence of token id 151668 (</think>), or 0 when it never appears.
if 151668 in output_ids:
    index = len(output_ids) - output_ids[::-1].index(151668)
else:
    index = 0


In [11]:
# Tokens before the split index form the reasoning trace; tokens after it form the reply.
thinking_content = tokenizer.decode(
    output_ids[:index],
    skip_special_tokens=True,
).strip("\n")
content = tokenizer.decode(
    output_ids[index:],
    skip_special_tokens=True,
).strip("\n")

print(f"thinking content: {thinking_content}")
print(f"content: {content}")

thinking content: <think>
Okay, the user is asking for a short introduction to large language models. Let me start by recalling what I know about them. Large language models are AI systems that process and generate human-like text. They're based on deep learning, specifically transformer architecture.

I should mention their key components, like neural networks and training on vast amounts of text data. Maybe explain how they understand context and generate coherent responses. Also, highlight their applications in various fields such as chatbots, content creation, coding, etc.

Wait, the user wants it to be short, so I need to be concise. Avoid technical jargon but still be accurate. Maybe start with a definition, then key features, and then applications. Also, note that they're trained on massive datasets, which allows them to handle complex tasks. Don't forget to mention that they can be fine-tuned for specific uses. Let me check if I'm missing anything important. Oh, right, they use

### Prompt + Qwen

In [1]:
import os
from langchain.document_loaders import PyMuPDFLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Read the PDF from disk
file_path = "./paper.pdf"
loader = PyMuPDFLoader(file_path)
docs = loader.load()

# Only the first page's text is used as context (kept simple on purpose)
context = docs[0].page_content

In [3]:
# Step 2: Load tokenizer and model from the same checkpoint (Hugging Face recommended method)
model_name = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  4.09it/s]


In [4]:
# Step 3: Prepare the question and the prompt
question = "Who are the authors of this paper?"
# The prompt embeds `context` (the first-page text taken from docs[0] in Step 1)
# followed by the quoted question and an instruction to answer without extra text.
prompt = f"""Given the following context from the pdf:
{context}

Answer the following question: "{question}"

Please answer the following question based on the context. Do not add any explanations or extra information.
"""

In [5]:
# Step 4: Wrap the prompt as a single user turn and turn it into model inputs
messages = [{"role": "user", "content": prompt}]

# Render the conversation with the tokenizer's chat template (text only, no tokenization yet)
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
    enable_thinking=False,  # Optional: controls thinking mode, default is True
)

# Tokenize the rendered text as a batch of one and move it to the model's device
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [6]:
# Step 5: Generate the response
# max_new_tokens caps the number of generated tokens; adjust based on your needs
generated_ids = model.generate(**model_inputs, max_new_tokens=100)

In [None]:
# Step 6: Extract the generated content
# Keep only the tokens produced after the prompt by slicing off the prompt length.
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# Token id of the closing </think> marker used to separate reasoning from the answer.
THINK_END_TOKEN_ID = 151668

# Split one past the LAST occurrence of </think>.
try:
    index = len(output_ids) - output_ids[::-1].index(THINK_END_TOKEN_ID)
except ValueError:
    # No </think> marker in the output: the whole output is treated as the answer.
    index = 0

# Decode both halves. `thinking_content` must be assigned here because the Step 7
# cell prints it; without this assignment a fresh-kernel run fails with NameError.
thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

In [None]:
# Step 7: Show the reasoning trace (if any) followed by the final answer
print(f"Thinking content: {thinking_content}")
print(f"Answer: {content}")

Thinking content: 
Answer: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, and Illia Polosukhin.
