## Local Inference on GPU 
Model page: https://huggingface.co/large-traversaal/Alif-1.0-8B-Instruct


In [1]:
%%capture
!pip install transformers
!pip install -q gradio
!pip install bitsandbytes

In [2]:
import torch
import gradio as gr
from threading import Thread
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer, TextIteratorStreamer


model_id = "large-traversaal/Alif-1.0-8B-Instruct"

# bitsandbytes settings: NF4 4-bit weights with double quantization enabled,
# and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Both objects are module-level on purpose: the chat cell further down reads
# `tokenizer` and `model` directly.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weight placement is delegated to the library through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

2025-07-24 09:23:51.569436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753349031.768553      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753349031.830065      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

In [6]:
import gradio as gr
import torch
from threading import Thread
from transformers import TextIteratorStreamer


# --- Prompt construction: fixed instruction header, prior turns, then the new user message ---
def format_prompt_with_history(message, history):
    """Build the text prompt sent to the model for one chat turn.

    The prompt is a fixed instruction header, followed by every previous turn
    rendered as ``### Input:`` / ``### Response:`` sections, followed by the
    new ``message`` with an open ``### Response:`` section for the model to
    complete.

    Args:
        message: The current user message.
        history: Previous turns. Accepted shapes:
            * ``None`` or an empty sequence (no previous turns);
            * pairs ``(user_msg, bot_msg)`` / ``[user_msg, bot_msg]``;
            * dicts with ``"role"`` and ``"content"`` keys, where role
              ``"user"`` becomes an Input section and ``"assistant"`` a
              Response section (other roles are skipped).
            A ``None`` message inside a turn is rendered as an empty string
            instead of the literal text ``None``.

    Returns:
        str: The complete prompt, ending with ``### Response:\\n``.
    """
    # The base instruction for the model
    chat_prompt = """You are Urdu Chatbot.
### Instruction:
Below is an instruction that describes a task. Write a response in urdu that appropriately completes the request. Don't say you don't know unless you really don't.
Please do not give long answers unless specifically asked for it. Always try to answer in maximum of 2-3 lines.
"""

    def _text(value):
        # A turn without a reply must not leak the word "None" into the prompt.
        return "" if value is None else value

    # Add past conversations to the prompt
    for turn in history or []:
        if isinstance(turn, dict):
            # Role/content style history: one dict per message.
            role = turn.get("role")
            content = _text(turn.get("content"))
            if role == "user":
                chat_prompt += f"### Input:\n{content}\n"
            elif role == "assistant":
                chat_prompt += f"### Response:\n{content}\n"
        else:
            # Pair style history: one (user, bot) pair per exchange.
            user_msg, bot_msg = turn
            chat_prompt += (
                f"### Input:\n{_text(user_msg)}\n### Response:\n{_text(bot_msg)}\n"
            )

    # Add the current user message
    chat_prompt += f"### Input:\n{message}\n### Response:\n"

    return chat_prompt

# --- Streaming generation: takes the new message plus the chat history ---
# This function is a generator so the UI can render the reply token-by-token
def generate_response(message, history):
    """Stream the model's reply to ``message`` for the chat UI.

    This is a generator. Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; every time new text arrives, the reply
    accumulated so far is yielded.

    Args:
        message: The current user message.
        history: Previous turns, forwarded unchanged to
            ``format_prompt_with_history``.

    Yields:
        str: The reply generated so far (grows with each yield).

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here after the stream has been closed.
    """
    # Format the prompt to include the conversation history
    prompt = format_prompt_with_history(message, history)

    # Send the inputs to the device the model reports, instead of guessing
    # between "cuda" and "cpu".
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=500,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
        repetition_penalty=1.2,
    )

    worker_errors = []

    def _run_generation():
        # If generate() fails, the streamer would never receive its stop
        # signal and the consuming loop below would block forever. Record the
        # error and close the stream so the consumer can finish and re-raise.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            streamer.end()

    # Run generation in a separate thread
    thread = Thread(target=_run_generation)
    thread.start()

    # Yield new text as it's generated
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    # The stream is exhausted, so the worker is finished or about to be.
    thread.join()
    if worker_errors:
        raise worker_errors[0]

# --- Chat UI: gr.ChatInterface wired to the streaming generator ---
# Build the widgets first so the ChatInterface call below stays short.
chat_window = gr.Chatbot(height=500)  # fixed-height message pane
question_box = gr.Textbox(
    placeholder="...یہاں اپنا سوال لکھیں",
    container=False,
    scale=7,
)

# Sample questions offered to the user as examples.
example_questions = [
    'شہر کراچی کی کیا اہمیت ہے؟',
    'صحت مند رہنے کے لئے پانچ تجاویز دیں۔',
    'پاکستان کی تاریخ پر ایک مختصر مضمون لکھیں۔',
]

iface = gr.ChatInterface(
    fn=generate_response,
    title="Urdu Chatbot 🤖 (Alif-1.0-8B)",
    description="Aap Urdu mein kuch bhi pooch sakte hain. (Ask me anything in Urdu)",
    chatbot=chat_window,
    textbox=question_box,
    examples=example_questions,
)

# NOTE(review): share=True also exposes the app through a temporary public
# URL, and debug=True keeps this cell running until it is interrupted.
# Confirm both are intended before sharing the notebook.
iface.launch(share=True, debug=True)

  chatbot=gr.Chatbot(height=500), # Sets the height of the chat window


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://ba870a536f94d40f05.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ba870a536f94d40f05.gradio.live


