<a href="https://colab.research.google.com/github/hanhanwu/Hanhan_LangGraph_Exercise/blob/main/AI_for_BI/run_deepseek_r1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About


* DeepSeek R1 Model List: https://github.com/deepseek-ai/DeepSeek-R1

In [1]:
%%capture --no-stderr
# Install the notebook's dependencies into the kernel's environment via %pip.
# `%%capture --no-stderr` hides the installers' stdout; stderr is still displayed.
# NOTE(review): versions are unpinned, so a fresh run may pull different releases — consider pinning.
%pip install fastapi nest-asyncio pyngrok uvicorn
%pip install vllm

## Start vLLM Server

* On Google Colab, you have to use a <b>T4 GPU runtime</b>! 💖
* Reference: https://github.com/naufalhakim23/deepseek-r1-playground/blob/main/deepseek_r1_distill_qwen_fast_api.ipynb

In [2]:
# Load and run the model:
import subprocess
import time
import os
import subprocess


# Hugging Face id of the model to serve; later cells read this global too.
model = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'

# Build the command line first so it is easy to inspect before launching.
vllm_command = ['vllm', 'serve', model]  # the subcommand must directly follow `vllm`
vllm_command += [
    '--trust-remote-code',
    '--dtype', 'half',
    '--max-model-len', '16384',
    '--enable-chunked-prefill', 'false',
    '--tensor-parallel-size', '1',
]

# Launch the server in the background, in its own session, with both output
# streams piped so that a later cell can read them.
vllm_process = subprocess.Popen(
    vllm_command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    start_new_session=True,
)

In [3]:
import requests
import time
from typing import Tuple
import sys

def check_vllm_status(url: str = "http://localhost:8000/health", timeout: float = 5.0) -> bool:
    """Check if VLLM server is running and healthy.

    Args:
        url: Health endpoint to probe.
        timeout: Seconds to wait for the server before giving up, so a
            half-started server cannot stall the caller indefinitely.

    Returns:
        True only when the endpoint answers with HTTP 200. Any request-level
        failure (connection refused, timeout, ...) is reported as False.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers ConnectionError as before, plus Timeout and other transport
        # errors: all of them simply mean "not healthy yet".
        return False
    return response.status_code == 200

def _read_available(stream, max_bytes: int = 65536) -> str:
    """Return the text currently waiting on `stream` without blocking ('' if none)."""
    import os
    import select

    if stream is None:  # the process was not started with a pipe for this stream
        return ""
    try:
        fd = stream.fileno()
        ready, _, _ = select.select([fd], [], [], 0)  # zero timeout == pure poll
        if not ready:
            return ""
        chunk = os.read(fd, max_bytes)
    except (OSError, ValueError):
        # Polling is unsupported for this handle or the stream is closed.
        # Echoing output is best-effort, so skip it instead of aborting monitoring.
        return ""
    # A chunk can end in the middle of a multi-byte character; never raise on that.
    return chunk.decode('utf-8', errors='replace')


def monitor_vllm_process(vllm_process: subprocess.Popen, check_interval: int = 5) -> Tuple[bool, str, str]:
    """
    Monitor VLLM process and return status, stdout, and stderr.

    While the process is alive, the health endpoint is polled every
    `check_interval` seconds and any output the process has produced so far is
    echoed. Output is read without blocking, so a silent stdout or stderr can
    no longer stall the health polling.

    Args:
        vllm_process: The server process, normally started with piped
            stdout/stderr. Streams that are not piped are simply not echoed.
        check_interval: Seconds to sleep between health checks.

    Returns: (success, stdout, stderr)
        (True, "", "") as soon as the health check passes; otherwise, once the
        process has exited, (False, <remaining stdout>, <remaining stderr>).
    """
    print("Starting VLLM server monitoring...")

    while vllm_process.poll() is None:  # While process is still running
        if check_vllm_status():
            print("✓ VLLM server is up and running!")
            return True, "", ""

        print("Waiting for VLLM server to start...")
        time.sleep(check_interval)

        # Echo whatever the server has written since the last iteration.
        for label, stream in (("STDOUT:", vllm_process.stdout), ("STDERR:", vllm_process.stderr)):
            text = _read_available(stream)
            if text:
                print(label, text)

    # If we get here, the process has ended: collect everything that is left.
    stdout, stderr = vllm_process.communicate()
    return (
        False,
        (stdout or b"").decode('utf-8', errors='replace'),
        (stderr or b"").decode('utf-8', errors='replace'),
    )

# Block until the server is healthy or its process exits, then report the result.
try:
    success, stdout, stderr = monitor_vllm_process(vllm_process)

    if not success:
        print("\n❌ VLLM server failed to start!")
        print("\nFull STDOUT:", stdout)
        print("\nFull STDERR:", stderr)
        sys.exit(1)

except KeyboardInterrupt:
    print("\n⚠️ Monitoring interrupted by user")
    try:
        # The server runs in its own session and does not receive the
        # interrupt, so an unbounded communicate() would wait for it forever.
        stdout, stderr = vllm_process.communicate(timeout=5)
    except subprocess.TimeoutExpired as still_running:
        # The exception carries whatever output was captured before the timeout.
        stdout, stderr = still_running.stdout, still_running.stderr
        print("\nVLLM server is still running; showing the output captured so far.")
    if stdout: print("\nFinal STDOUT:", stdout.decode('utf-8', errors='replace'))
    if stderr: print("\nFinal STDERR:", stderr.decode('utf-8', errors='replace'))
    sys.exit(0)

Starting VLLM server monitoring...
Waiting for VLLM server to start...
STDOUT: INFO 02-09 03:21:00 __init__.py:190] Automatically detected platform cuda.

STDERR: 2025-02-09 03:20:55.304365: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739071255.551250    1841 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739071255.621097    1841 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-09 03:20:56.124676: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild Te

In [15]:
import requests
import json
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from fastapi.responses import StreamingResponse
import requests

# Request schema for input: a body with a single string field, `question`.
class QuestionRequest(BaseModel):
    question: str
    # There is deliberately no `model` field: the model name is taken from the
    # global `model` variable instead of being supplied per request.


def ask_model(
    question: str,
    url: str = "http://localhost:8000/v1/chat/completions",
    timeout: float = 600.0,
):
    """
    Sends a request to the model server and fetches a response.

    Args:
        question: Text sent as the single user message of the chat.
        url: Chat-completions endpoint of the model server; override it if the
            server listens on a different host or port.
        timeout: Seconds to wait for the server before giving up. The default
            is generous because the reply only arrives once generation is done.

    Returns:
        The decoded JSON body of the server's response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.RequestException: On connection failures or timeouts.
    """
    payload = {
        "model": model,  # global model name set in the server-launch cell
        "messages": [
            {
                "role": "user",
                "content": question
            }
        ]
    }

    # `json=` serialises the payload and sets the JSON Content-Type header itself.
    response = requests.post(url, json=payload, timeout=timeout)
    response.raise_for_status()  # Raise exception for HTTP errors
    return response.json()

# Usage: send one sample question and show the raw JSON reply, indented.
sample_question = "Write the code of binary search in python"
result = ask_model(sample_question)
pretty_result = json.dumps(result, indent=2)
print(pretty_result)

{
  "id": "chatcmpl-9763e4f2904c4c4288c21ff48fe2e350",
  "object": "chat.completion",
  "created": 1739071974,
  "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "reasoning_content": null,
        "content": "<think>\nAlright, I need to write the code for a binary search in Python. Let me think about how to approach this.\n\nFirst, I remember that binary search works on a sorted list. So, the list should be in order, either ascending or descending. I'll assume it's sorted in ascending order since that's the common case.\n\nThe algorithm works by repeatedly dividing the search interval in half. So, I need to keep track of low and high indices. I'll initialize low to 0 and high to the length of the list minus one.\n\nThen, while low is less than or equal to high, I calculate the middle index. Middle can be calculated as (low + high) // 2 to avoid floating points.\n\nOnce I find the middle 

In [16]:
import pprint

# Pull the reply text out of the first returned choice.
first_message = result['choices'][0]['message']
content = first_message['content']
# width=80 and depth=None are pprint's defaults, so only indent is passed.
pprint.pprint(content, indent=2)

('<think>\n'
 'Alright, I need to write the code for a binary search in Python. Let me '
 'think about how to approach this.\n'
 '\n'
 'First, I remember that binary search works on a sorted list. So, the list '
 "should be in order, either ascending or descending. I'll assume it's sorted "
 "in ascending order since that's the common case.\n"
 '\n'
 'The algorithm works by repeatedly dividing the search interval in half. So, '
 "I need to keep track of low and high indices. I'll initialize low to 0 and "
 'high to the length of the list minus one.\n'
 '\n'
 'Then, while low is less than or equal to high, I calculate the middle index. '
 'Middle can be calculated as (low + high) // 2 to avoid floating points.\n'
 '\n'
 "Once I find the middle element, if it's equal to the target, I return the "
 'index. If the middle element is less than the target, I adjust low to middle '
 "+ 1 because the target must be in the right half. If it's greater, I adjust "
 'high to middle - 1 because the 