In [1]:
!pip install fastapi uvicorn torch transformers accelerate pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import uvicorn
import threading
from pyngrok import ngrok
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Request model
class ChatRequest(BaseModel):
    message: str
    max_new_tokens: int = 200
    temperature: float = 0.7

# Response model
class ChatResponse(BaseModel):
    response: str
    status: str

model = None
tokenizer = None

In [3]:
def load_model():
    global model, tokenizer

    try:
        model_name = "jyanjain/Harshil-karia-Llama-2-7b-chat-finetune"

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True
        )

        logger.info("Model loaded successfully!")
        return True

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        return False

print("Loading model...")
success = load_model()
if success:
    print("Model loaded successfully!")
else:
    print("Failed to load model")

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Model loaded successfully!


In [7]:
def generate_response(question: str, max_new_tokens: int = 512, temperature: float = 0.7) -> str:
    try:
        prompt = f"<s>[INST] You are Harshil Karia, founder of Schbang. Answer in a single short paragraph about: {question} [/INST]"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True
            )

        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

        if "[INST]" in response:
            response = response.split("[INST]")[0].strip()
        if "[/INST]" in response:
            response = response.split("[/INST]")[0].strip()

        return response.strip()

    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return "I apologize, but I encountered an error while processing your question. Please try again."


# test_response = generate_response("Who are you?")
# print("🧪 Test Response:")
# print(test_response)

In [8]:
app = FastAPI(title="Harshil Karia Llama Chat API")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        max_new_tokens = min(max(request.max_new_tokens, 50), 512)
        temperature = min(max(request.temperature, 0.1), 1.0)

        logger.info(f"Received message: {request.message}")

        response = generate_response(request.message, max_new_tokens, temperature)

        return ChatResponse(response=response, status="success")

    except Exception as e:
        logger.error(f"Error in chat endpoint: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error")


@app.get("/")
async def root():
    return {"message": "Harshil Karia Llama Chat API is running!"}

print("FastAPI app created successfully!")

FastAPI app created successfully!


In [None]:
from pyngrok import ngrok

NGROK_AUTH_TOKEN = "2cur7HzYOxd3WEq1Z6YddReYJAo_6tReATTAeaHBQ89vtMcCw"
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

def setup_ngrok():
    try:
        ngrok.kill()

        public_url = ngrok.connect(8000)
        print(f"Public URL: {public_url}")
        print(f"Chat Endpoint: {public_url}/chat")
        return public_url
    except Exception as e:
        print(f"Error setting up ngrok: {e}")
        return None

def run_server():
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")

# Setup ngrok
public_url = setup_ngrok()

if public_url:
    server_thread = threading.Thread(target=run_server, daemon=True)
    server_thread.start()

    time.sleep(3)
    print("Server is running!")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nServer stopped!")
        ngrok.kill()
else:
    print("Failed to setup ngrok tunnel")

Public URL: NgrokTunnel: "https://778031935767.ngrok-free.app" -> "http://localhost:8000"
Chat Endpoint: NgrokTunnel: "https://778031935767.ngrok-free.app" -> "http://localhost:8000"/chat


INFO:     Started server process [1152]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
ERROR:    [Errno 98] error while attempting to bind on address ('0.0.0.0', 8000): address already in use
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


Server is running!
INFO:     103.184.104.58:0 - "OPTIONS /chat HTTP/1.1" 200 OK
INFO:     103.184.104.58:0 - "POST /chat HTTP/1.1" 200 OK
