In [1]:
import os
from datetime import datetime

In [2]:
# Create directories in Google Drive for persistence
DRIVE_BASE = '/content/drive/MyDrive/Colab_ML'
MODEL_CACHE = f'{DRIVE_BASE}/models'
PIP_CACHE = f'{DRIVE_BASE}/pip_cache'

os.makedirs(MODEL_CACHE, exist_ok=True)
os.makedirs(PIP_CACHE, exist_ok=True)

print(f"Google Drive mounted")
print(f"Model cache: {MODEL_CACHE}")
print(f"Pip cache: {PIP_CACHE}")

Google Drive mounted
Model cache: /content/drive/MyDrive/Colab_ML/models
Pip cache: /content/drive/MyDrive/Colab_ML/pip_cache


In [3]:
# Set HuggingFace to use Google Drive for model storage
os.environ['TRANSFORMERS_CACHE'] = MODEL_CACHE
os.environ['HF_HOME'] = MODEL_CACHE
# Configure pip to use Google Drive cache
os.environ['PIP_CACHE_DIR'] = PIP_CACHE

In [None]:
# from huggingface_hub import snapshot_download

# model_name = "Qwen/Qwen2.5-32B-Instruct"
# snapshot_download(
#     repo_id=model_name,
#     local_dir=MODEL_CACHE,
#     local_dir_use_symlinks=False,  # Important: no symlinks
#     resume_download=True,
#     max_workers=1,  # Download one file at a time for stability
#     ignore_patterns=["*.gguf", "*.bin"]  # bin is for old pytorch format, gguf is for llama.cpp
# )

# print("✓ Download complete!")

# # Verify all files are present
# print("\nVerifying files...")
# files = os.listdir(MODEL_CACHE)
# safetensor_files = [f for f in files if f.endswith('.safetensors')]
# print(f"Found {len(safetensor_files)} safetensors files")

In [8]:
# !pip list

In [None]:
!pip install --cache-dir={PIP_CACHE} bitsandbytes -q # not do this because it saves memory but slows down inference

In [6]:
!pip install --cache-dir={PIP_CACHE} flash-attn --no-build-isolation -q # used in HF transformers workflow

In [5]:
!pip install --cache-dir={PIP_CACHE} vllm -q

Collecting vllm
  Downloading vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl.metadata (17 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (217 bytes)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer==0.11.3 (from vllm)
  Downloading lm_format_enforcer-0.11.3-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.11 (from vllm)
  Downloading llguidance-0.7.30-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting outlines_core==0.2.11 (from vllm)
  Downloading outlines_core-0.2.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting diskcache==5.6.3 (from vllm)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting lark==1.2.2 (from vllm)
  Downloading lark-1.2.2-py3-none-any.whl.met

In [7]:
# HF Transformer workflow, slow

import torch
from transformers import AutoModelForCausalLM

print("\nLoading model...")

model = AutoModelForCausalLM.from_pretrained(
    MODEL_CACHE,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # 20-30% faster
    # load_in_8bit=True,
    local_files_only=True  # Use only local files, don't re-download
)

print("\n Model loaded successfully!")
print(model.hf_device_map)
print(model.config.use_cache)




Loading model...


Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]


 Model loaded successfully!


In [6]:
# vLLM workflow, faster

from vllm import LLM, SamplingParams

model = LLM(
    model=MODEL_CACHE,
    trust_remote_code=True,
    tensor_parallel_size=1,
    dtype="bfloat16",
    max_model_len=10000,  # max context length
    gpu_memory_utilization=0.9
)

INFO 10-13 17:45:19 [utils.py:233] non-default args: {'trust_remote_code': True, 'dtype': 'bfloat16', 'max_model_len': 10000, 'disable_log_stats': True, 'model': '/content/drive/MyDrive/Colab_ML/models'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 10-13 17:45:19 [model.py:547] Resolved architecture: Qwen2ForCausalLM
INFO 10-13 17:45:19 [model.py:1510] Using max model len 10000
INFO 10-13 17:45:19 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 10-13 17:56:13 [llm.py:306] Supported_tasks: ['generate']


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CACHE,
    trust_remote_code=True,
    local_files_only=True
)

In [14]:
def generate_text(prompt, system_message, max_tokens, engine='vllm'):
    """Generate text from prompt"""
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ]

    text_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    t0 = datetime.now()

    if engine=="vllm": # model is vllm
        sampling_params = SamplingParams(temperature=0, max_tokens=int(max_tokens))
        output = model.generate([text_input], sampling_params)
        output = output[0].outputs[0].text
    else: # model is HF Transformer
        model_inputs = tokenizer([text_input], return_tensors="pt").to(model.device)
        with torch.no_grad():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=int(max_tokens),
                do_sample=False, # greedy decoding - much faster
                use_cache=True, # ensure KV cache is enabled
            )
        output = tokenizer.batch_decode(
            generated_ids[:, model_inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        )[0]

    delta_t = (datetime.now() - t0).total_seconds()
    print(f"generation took {delta_t} secs")

    return output

In [24]:
# Create Gradio Interface and use as an API

import gradio as gr

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Qwen2.5 32B")
    gr.Markdown("Powerful AI model for text generation")

    with gr.Tab("💬 Text Generation"):
        gr.Markdown("### Generate text with custom prompts")

        with gr.Row():
            with gr.Column():
                gen_prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt here...",
                    lines=5
                )
                gen_system = gr.Textbox(
                    label="System Message",
                    value="You are a helpful assistant.",
                    lines=2
                )

                gen_max_tokens = gr.Slider(
                    minimum=100,
                    maximum=4000,
                    value=1000,
                    step=100,
                    label="Max Tokens"
                )

                gen_button = gr.Button("Generate", variant="primary")

            with gr.Column():
                gen_output = gr.Textbox(
                    label="Generated Text",
                    lines=15,
                    interactive=False
                )

        gen_button.click(
            fn=generate_text,
            inputs=[gen_prompt, gen_system, gen_max_tokens],
            outputs=gen_output
        )

        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", "You are a helpful assistant.", 500, 0.7, 0.9],
                ["Write a professional email requesting a meeting", "You are a professional business writer.", 300, 0.5, 0.9],
                ["Create a creative story about a time traveler", "You are a creative fiction writer.", 1000, 0.9, 0.95]
            ],
            inputs=[gen_prompt, gen_system, gen_max_tokens]
        )

print("\n" + "="*60)
print("LAUNCHING GRADIO INTERFACE")
print("="*60)

# Launch with share=True to get public URL
demo.launch(
    share=True,  # Creates public URL automatically!
    debug=False,
    server_name="0.0.0.0",
    server_port=7865
)

print("\n✓ Gradio interface launched!")
print("✓ Use the public URL above to access from anywhere")
print("✓ Share the URL with others if needed")
print("\nKeep this notebook running to maintain access!")


LAUNCHING GRADIO INTERFACE
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


<IPython.core.display.Javascript object>


✓ Gradio interface launched!
✓ Use the public URL above to access from anywhere
✓ Share the URL with others if needed

Keep this notebook running to maintain access!


In [None]:
prompt = """
Here's a job description:

.......


Here's my resume:

.......

Can you evaluate how well this I fit this job?
"""

response = generate_text(prompt, system_message="You're a helpful assistant", max_tokens=1000)



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

generation took 29.876203 secs


In [16]:
len(prompt)

7913

In [17]:
len(response)

3850

In [None]:
response