# Installing Important Dependencies

In [1]:
# Install or upgrade the Hugging Face libraries used by the cells below.
# The recorded output of this cell shows transformers 4.55.2 and
# bitsandbytes 0.47.0 being downloaded; pin those versions here if you need a
# reproducible environment.
%pip install -U transformers accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metada

# Changing Default Download Location

In [2]:
import os

# Redirect the Hugging Face and PyTorch caches to a drive with more free space.
# Edit both paths so they point at your own storage location before running.
os.environ.update(
    {
        "HF_HOME": "E:\\Resoning Based Qwen3-4B LLM Model for Test\\HuggingFace",
        # Optional: PyTorch's own cache directory
        "TORCH_HOME": "E:\\Resoning Based Qwen3-4B LLM Model for Test\\Pytorch",
    }
)

## Local Inference on GPU
Model page: https://huggingface.co/Qwen/Qwen3-4B


# Using a Pipeline for Download and Demo

In [None]:
# High-level helper: one pipeline object wraps tokenization and generation.
from transformers import pipeline

# Single-turn conversation in chat-message format.
messages = [{"role": "user", "content": "Who are you?"}]

pipe = pipeline(task="text-generation", model="Qwen/Qwen3-4B")
pipe(messages)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Device set to use cuda:0


[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': '<think>\nOkay, the user is asking "Who are you?" and I need to respond as a helpful and informative AI assistant. First, I should clearly state my identity as Qwen, developed by Alibaba Cloud. I should mention my capabilities in handling various tasks like answering questions, creating content, and programming. It\'s important to highlight that I\'m designed to be helpful and follow instructions.\n\nI should also explain that I can assist with different types of questions, such as answering factual questions, creative writing, problem-solving, and more. Maybe include examples of the kinds of tasks I can perform to make it concrete. Additionally, I should mention that I can understand and respond in multiple languages, which is a key feature.\n\nI need to keep the response friendly and approachable, encouraging the user to ask questions or request help. Also, make sure to avoid any

# Using Detailed Steps to Load the Model and Generate a Response

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization settings; `load_model` passes this object to
# `from_pretrained` as `quantization_config`.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
    # Enable CPU offload for layers that don't fit on the GPU
    llm_int8_enable_fp32_cpu_offload=True,
)

# Partial manual placement map that pins two modules to the GPU; extend it to
# offload large transformer blocks to the CPU, or use "auto" with offload enabled.
# NOTE(review): `load_model` in this notebook passes device_map="auto", so this
# dict is not consumed there. The key names also look GPT-2-style; confirm they
# match the Qwen3 module names before using it.
device_map = {"transformer.wte": "cuda", "transformer.wpe": "cuda"}

def load_model(model_id="Qwen/Qwen3-4B"):
    """Load the tokenizer and the quantized causal language model.

    The model is created with the module-level ``bnb_config`` quantization
    settings and ``device_map="auto"``.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.
            Defaults to ``"Qwen/Qwen3-4B"``, the checkpoint this notebook uses,
            so existing ``load_model()`` calls behave as before.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
    return tokenizer, model

def generate_response(
    tokenizer,
    model,
    messages,
    max_new_tokens=10000,
    *,
    enable_thinking=False,
    temperature=0.7,
    top_p=0.9,
):
    """Generate one sampled reply for a chat-style message list.

    Args:
        tokenizer: Object providing ``apply_chat_template`` and ``decode``.
        model: Object providing ``generate`` and a ``device`` attribute.
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens: Upper bound on the number of generated tokens.
        enable_thinking: Forwarded to the chat template. Defaults to ``False``,
            the value that was previously hard-coded.
        temperature: Sampling temperature; lower values are more deterministic.
        top_p: Nucleus-sampling threshold.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped), with special tokens removed.
    """
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        enable_thinking=enable_thinking,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # sampling is always on; temperature/top_p shape it
        temperature=temperature,
        top_p=top_p,
    )

    # The returned sequence starts with the prompt tokens; keep only what follows.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [5]:
# One-time setup: build the tokenizer and model so later cells can reuse them.
tokenizer, model = load_model()

# Any later prompt is answered by passing a message list like this one.
messages = [
    {"role": "user", "content": "Hello, how are you?"},
]
print(generate_response(tokenizer, model, messages))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Hello! I'm an AI assistant developed by Alibaba Group. I'm here to help you with any questions or tasks you have. How can I assist you today?


## Remote Inference via Inference Providers
Ensure you have a valid **HF_TOKEN** set in your environment. You can get your token from [your settings page](https://huggingface.co/settings/tokens). Note: running this may incur charges above the free tier.
The following Python example shows how to run the model remotely on HF Inference Providers, automatically selecting an available inference provider for you.
For more information on how to use the Inference Providers, please refer to our [documentation and guides](https://huggingface.co/docs/inference-providers/en/index).

In [None]:
import os

# Placeholder only: substitute your own token locally and never commit a real
# token with the notebook.
os.environ.update(HF_TOKEN="YOUR_TOKEN_HERE")

In [None]:
import os
from openai import OpenAI

# OpenAI-compatible client pointed at the Hugging Face router; the key is read
# from the HF_TOKEN environment variable.
client = OpenAI(
    api_key=os.environ["HF_TOKEN"],
    base_url="https://router.huggingface.co/v1",
)

# Single-turn chat completion request for the Qwen/Qwen3-4B model id.
completion = client.chat.completions.create(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    model="Qwen/Qwen3-4B",
)

# Print the message object of the first returned choice.
print(completion.choices[0].message)

# Generate Response

In [6]:
# A system message asks for short, direct answers; the user message is the question.
messages = [
    dict(
        role="system",
        content="You are a helpful assistant. Please answer questions briefly and directly without reasoning steps.",
    ),
    dict(role="user", content="How to teach a bird to speak?"),
]

print(generate_response(tokenizer, model, messages))

Birds cannot be taught to speak like humans. However, some parrots can learn to mimic human speech through repetition and positive reinforcement. Speak clearly and consistently, and the bird may imitate sounds over time.


In [None]:
# Colab only: mount Google Drive under /content/drive. `save_output` writes
# below that mount point by default, so run this cell first.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [7]:
from datetime import datetime

def save_output(session_name, user_input, ai_output, file_path='/content/drive/MyDrive/Colab Notebooks/Reasoning Outputs/results.txt'):
    """Append one prompt/response exchange to a plain-text log file.

    Args:
        session_name: Label written into the entry header.
        user_input: The text the user sent.
        ai_output: The text the model returned.
        file_path: Log file to append to. Its parent directory is created
            if it does not exist yet.

    The file is always written as UTF-8 so that non-ASCII characters in the
    prompt or the model output cannot fail on platforms whose default
    encoding is narrower.
    """
    import os  # local import keeps this cell runnable on its own

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(file_path, 'a', encoding='utf-8') as f:  # append mode
        f.write(f"\n\n==== Session: {session_name} ====\n")
        f.write(f"Timestamp: {timestamp}\n\n")
        f.write(f"User Input:\n{user_input}\n\n")
        f.write(f"AI Output:\n{ai_output}\n")
        f.write("\n" + "="*30 + "\n")  # separator for clarity


In [None]:
session_count = 1  # incremented after every saved exchange

# Interactive loop: each turn is sent to the model on its own (no history is
# carried over) and the exchange is appended to the log via `save_output`.
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing stdin ends the chat cleanly
        # instead of leaving a traceback in the notebook.
        print("\nExiting chat...")
        break

    # Trim only for command detection; the raw text is what gets sent and saved.
    command = user_input.strip().lower()
    if command in {"exit", "quit"}:
        print("Exiting chat...")
        break
    if not command:
        # Nothing typed: don't spend a generation or a log entry on a blank prompt.
        continue

    messages = [{"role": "user", "content": user_input}]
    ai_output = generate_response(tokenizer, model, messages, max_new_tokens=1024)

    print("AI:", ai_output)

    session_name = f"Session_{session_count}"
    save_output(session_name, user_input, ai_output)

    session_count += 1
