In [1]:
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.display import display, Markdown, Latex
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.notebook import tqdm

# Switch autograd off globally: the visible cells only tokenize and call generate(),
# so no gradients are needed.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x733c4210fcb0>

In [2]:
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Prefer the GPU, but fall back to CPU so the notebook still runs on hosts without CUDA
# (a hard-coded "cuda" device makes the .to(device) call below fail there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [3]:
# The question posed to the model, wrapped as a single-turn chat.
prompt = "Differentiate f(x) = \\sqrt{\\cos(x^2)}"
messages = [dict(role="user", content=prompt)]

# Render the chat to a plain string (tokenize=False); add_generation_prompt=True
# asks the template to append the opener for the assistant's turn.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# add_special_tokens=False: the tokenizer is told not to add special tokens of its own
# on top of whatever the rendered template already contains.
model_inputs = tokenizer(
    [text],
    add_special_tokens=False,
    return_tensors="pt",
).to(model.device)

In [5]:
# Generation appears to sample (the reasoning text differs between runs of this
# notebook), so pin the RNG to make the transcript reproducible on re-run.
torch.manual_seed(0)

output = model.generate(
    **model_inputs,
    max_new_tokens=6000,
    # Use the EOS id as the padding id.
    pad_token_id=tokenizer.eos_token_id
)

# Each row of `output` is sliced past the prompt length so only the newly
# generated tokens remain; the length is computed once instead of per row.
prompt_len = model_inputs.input_ids.shape[1]
generated_ids = [output_ids[prompt_len:] for output_ids in output]

# Keep special tokens so the raw markers stay visible in the text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
# NOTE: despite its name, `all_ids` holds decoded *text* for the full sequences
# (prompt + continuation), not token ids. The name is kept for later cells.
all_ids = tokenizer.batch_decode(output, skip_special_tokens=False)

print(response[0])

Okay, so I need to differentiate the function f(x) = sqrt(cos(x²)). Hmm, let me think about how to approach this. I remember from calculus that when you have a function inside another function, you need to use the chain rule. So, the outer function here is the square root, and the inner function is cos(x²). 

First, let me rewrite the function to make it clearer. The square root of cos(x²) can be written as [cos(x²)]^(1/2). That might make it easier to see the layers. So, outer function is something raised to the 1/2 power, and the inner function is cos(x²). 

Now, the chain rule says that the derivative of a composite function is the derivative of the outer function evaluated at the inner function multiplied by the derivative of the inner function. So, if I let u = cos(x²), then f(x) = sqrt(u) = u^(1/2). 

Let me write that down step by step:

1. Let u = cos(x²). Then f(x) = sqrt(u) = u^(1/2).
2. First, find the derivative of f with respect to u. The derivative of u^(1/2) is (1/2)u^(-

In [6]:
def multiline_literal(s):
    """Render ``s`` as a triple-double-quoted Python literal with real line breaks.

    Newlines in ``s`` are written as actual line breaks; every other character
    that ``repr`` escapes (backslashes, tabs, ...) stays escaped, so the result
    can be pasted into source code.

    The text is framed by one line break after the opening quotes and one
    before the closing quotes. Evaluating the returned literal therefore yields
    ``s`` with one extra newline at the start and one at the end.

    Args:
        s: The string to render.

    Returns:
        The source text of a triple-quoted string literal.
    """
    # repr() does the escaping; strip the single delimiter character it adds
    # on each side.
    escaped = repr(s)[1:-1]
    # Split on escaped backslashes first, so that the "n" following a literal
    # backslash (e.g. LaTeX \nabla, which repr shows as \\nabla) is never
    # mistaken for a newline escape. Only genuine \n escapes get expanded.
    chunks = escaped.split('\\\\')
    expanded = '\\\\'.join(chunk.replace('\\n', '\n') for chunk in chunks)
    # Break up runs of three double quotes so the text cannot close the
    # literal early.
    safe = expanded.replace('"""', '""\\"')
    # Format as a triple-quoted string with the actual newlines.
    return f'"""\n{safe}\n"""'

# Print the first decoded full sequence (prompt + generation) as a paste-able string literal.
print(multiline_literal(all_ids[0]))


"""
<｜begin▁of▁sentence｜><｜User｜>Differentiate f(x) = \\sqrt{\\cos(x^2)}<｜Assistant｜><think>
Okay, so I need to differentiate the function f(x) = sqrt(cos(x²)). Hmm, let me think about how to approach this. I remember from calculus that when you have a function inside another function, you need to use the chain rule. So, the outer function here is the square root, and the inner function is cos(x²). 

First, let me rewrite the function to make it clearer. The square root of cos(x²) can be written as [cos(x²)]^(1/2). That might make it easier to see the layers. So, outer function is something raised to the 1/2 power, and the inner function is cos(x²). 

Now, the chain rule says that the derivative of a composite function is the derivative of the outer function evaluated at the inner function multiplied by the derivative of the inner function. So, if I let u = cos(x²), then f(x) = sqrt(u) = u^(1/2). 

Let me write that down step by step:

1. Let u = cos(x²). Then f(x) = sqrt(u) = u^(1/2).

In [7]:
# Prefix for the model to continue from: the chat header followed by the start of a
# reasoning trace. The final sentence states the chain rule incorrectly
# (g'(x) + h'(x) instead of g'(h(x)) * h'(x)), presumably on purpose to see whether
# the model corrects itself - confirm intent.
# NOTE(review): the literal opens with a line break, so a newline precedes the
# begin-of-sentence marker in `s`; it also ends with a newline.
s = """
<｜begin▁of▁sentence｜><｜User｜>Differentiate f(x) = \\sqrt{\\cos(x^2)}<｜Assistant｜><think>
Okay, so I need to differentiate the function f(x) = sqrt(cos(x²)). Hmm, let me think about how to approach this. I remember that when you have a composite function, you need to use the chain rule. Yeah, the chain rule is for differentiating functions that are composed of multiple functions. 

First, let me rewrite the function to make it clearer. The square root can be written as something raised to the power of 1/2, so f(x) = (cos(x²))^(1/2). That might make it easier to see the layers. So, the outside function is something squared, and the inside function is cos(x²). 

Alright, so to apply the chain rule, I need to take the derivative of the outside function evaluated at the inside function, and then multiply it by the derivative of the inside function. So, in mathematical terms, if f(x) = g(h(x)), then f'(x) = g'(x) + h'(x).
"""

# `s` starts with a line break (an artifact of the triple-quoted literal), which
# would put a newline in front of the begin-of-sentence marker. Strip it so the
# marker opens the prompt. add_special_tokens=False because `s` already spells
# out its own special tokens.
model_inputs = tokenizer(
    s.lstrip("\n"),
    add_special_tokens=False,
    return_tensors="pt",
).to(model.device)

# Generation appears to sample, so pin the RNG to make the continuation reproducible.
torch.manual_seed(0)

regenerated_output = model.generate(
    **model_inputs,
    max_new_tokens=1000,
    # Use the EOS id as the padding id.
    pad_token_id=tokenizer.eos_token_id
)

# Each row is sliced past the prefix length so only the newly generated tokens
# remain; the length is computed once instead of per row.
prompt_len = model_inputs.input_ids.shape[1]
generated_ids = [output_ids[prompt_len:] for output_ids in regenerated_output]

# Keep special tokens so the raw markers stay visible in the text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
print(response[0])


Wait, no, that's not quite right. The chain rule says f'(x) = g'(h(x)) * h'(x). Right, I was getting confused earlier. So, I need to evaluate the derivative of the outside function at the inside function, and then multiply it by the derivative of the inside function.

So, let's break it down. Let me define g(u) = u^(1/2), where u = cos(x²). Then f(x) = g(u). So, the derivative f'(x) would be g'(u) * u', where g'(u) is the derivative of u^(1/2) with respect to u, and u' is the derivative of cos(x²) with respect to x.

Okay, let's find g'(u). The derivative of u^(1/2) with respect to u is (1/2)u^(-1/2). So, g'(u) = (1/2)u^(-1/2). Now, u is cos(x²), so I need to find u' = d/dx [cos(x²)].

To find u', I need to use the chain rule again because cos(x²) is a composite function. So, let's let v = x², then u = cos(v). So, the derivative u' is the derivative of cos(v) with respect to v times the derivative of v with respect to x.

The derivative of cos(v) with respect to v is -sin(v), and the d