# Script to test the HenriAI model directly on https://www.henriai.ca/

In [1]:
# Install/upgrade the Python packages this notebook imports (transformers, peft,
# bitsandbytes, fastapi, uvicorn, pydantic) plus accelerate and python-multipart.
# NOTE(review): only uvicorn is pinned to an exact version; the other packages
# resolve to the newest release allowed by `-U`, so a fresh run may install
# different versions than the recorded one.
!pip -q install -U "transformers>=4.42" "peft>=0.11.0" accelerate \
  "bitsandbytes>=0.43.0" fastapi "uvicorn==0.34.0" "pydantic>=2,<3" python-multipart
# Download the latest cloudflared Linux amd64 release binary to /usr/local/bin
# and mark it executable; it is launched later to expose the local API.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O /usr/local/bin/cloudflared
!chmod +x /usr/local/bin/cloudflared

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m127.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive
# Mount Google Drive; the adapter directory used later lives under this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import uvicorn, fastapi, sys
import os, gc, time, torch, re, subprocess, threading
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Directory (on the mounted Google Drive) holding the PEFT adapter that is
# loaded on top of the base model.
ADAPTER_DIR = "/content/drive/MyDrive/HenriAI/Models/Version 1/adapters/epoch_4"

# Local port the API server listens on; the tunnel forwards to this port.
PORT = 7860

# Browser origins that the CORS middleware accepts requests from.
ALLOWED_ORIGINS = [
    "https://www.henriai.ca",
    "https://henriai.ca",
    "http://localhost:5173",
    "http://127.0.0.1:5500",
]

def clear_memory():
    """Run the Python garbage collector and, when CUDA is available, release
    cached GPU memory and wait for pending CUDA work to finish."""
    gc.collect()
    if not torch.cuda.is_available():
        # CPU-only runtime: nothing GPU-side to release.
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Allow TF32 for CUDA matmuls and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Load the GPT-J 6B base model quantized to 4-bit (NF4, double quantization)
# with float16 as the compute dtype; `device_map="auto"` lets the library
# decide where the weights are placed.
print("Loading GPT-J 6B base in 4-bit…")
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                         bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True)
# NOTE(review): the recorded run output warns that `torch_dtype` is deprecated
# in favour of `dtype`; confirm the minimum transformers version before renaming.
base = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B",
    quantization_config=bnb,
    device_map="auto",
    torch_dtype=torch.float16
)

# Wrap the base model with the PEFT adapter stored in ADAPTER_DIR and switch
# the combined model to evaluation mode.
print(f"Loading adapter from: {ADAPTER_DIR}")
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()

# The tokenizer comes from the base checkpoint; if it defines no padding token,
# the end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Free whatever temporary memory loading left behind before serving requests.
clear_memory()
print("Model ready.")

Loading GPT-J 6B base in 4-bit…


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

Loading adapter from: /content/drive/MyDrive/HenriAI/Models/Version 1/adapters/epoch_4


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Model ready.


In [4]:
app = FastAPI()

# CORS policy: only the origins in ALLOWED_ORIGINS may call the API from a
# browser; credentials, every HTTP method and every request header are allowed.
_cors_settings = {
    "allow_origins": ALLOWED_ORIGINS,
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)

class GenerateRequest(BaseModel):
    """Request body accepted by ``POST /generate``."""
    # Question text; `generate` inserts it into the "Question: ...\nAnswer:" prompt.
    text: str
    # Forwarded to `model.generate` as `max_new_tokens`.
    max_new_tokens: int = 256
    # Sampling temperature forwarded to `model.generate`.
    temperature: float = 0.7
    # Forwarded to `model.generate` as `top_p`.
    top_p: float = 0.9

@app.get("/healthz")
def healthz():
    """Liveness probe: always answers with ``{"ok": True}``."""
    status = {"ok": True}
    return status

@app.post("/generate")
def generate(req: GenerateRequest):
    """Answer a question with the adapter-augmented model.

    The question is wrapped in a ``Question: ...\\nAnswer:`` prompt, sampled
    from with the requested temperature / top-p, and only the newly generated
    tokens are returned.

    Args:
        req: Question text plus sampling parameters.

    Returns:
        ``{"response": <answer text>, "timing": {"seconds": <float>}}``.

    Raises:
        HTTPException: 422 when a sampling parameter is out of range, 500 when
            tokenization or generation fails.
    """
    # Validate up front so client mistakes are reported as 422 instead of
    # surfacing as an opaque 500 from inside `model.generate`.
    if req.max_new_tokens < 1:
        raise HTTPException(status_code=422, detail="max_new_tokens must be >= 1")
    if req.temperature <= 0:
        raise HTTPException(status_code=422, detail="temperature must be > 0")
    if not 0.0 <= req.top_p <= 1.0:
        raise HTTPException(status_code=422, detail="top_p must be between 0 and 1")

    try:
        prompt = f"Question: {req.text}\nAnswer:"
        # NOTE(review): truncation keeps the first 512 tokens, so for a very
        # long question the trailing "Answer:" cue may be cut off — confirm
        # whether the question should be truncated before templating instead.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
        prompt_len = inputs["input_ids"].shape[-1]

        # GPT-J's published context length is 2048 tokens; cap the generation
        # budget so prompt + completion fit and a single request cannot ask
        # for an unbounded amount of GPU time.
        context_window = 2048
        new_token_budget = min(req.max_new_tokens, context_window - prompt_len)

        start = time.perf_counter()
        # inference_mode is entered here rather than via a decorator: a
        # decorator placed above @app.post wraps the function only AFTER the
        # route has been registered, so requests would bypass it.
        with torch.inference_mode(), torch.autocast(device_type="cuda"):
            out = model.generate(
                **inputs,
                max_new_tokens=new_token_budget,
                do_sample=True,
                temperature=req.temperature,
                top_p=req.top_p,
                use_cache=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id
            )
        dur = round(time.perf_counter() - start, 2)

        # Decode only the tokens produced after the prompt. Splitting the full
        # text on "Answer:" returns the wrong span whenever the user's
        # question itself contains that string.
        answer = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

        return {"response": answer, "timing": {"seconds": dur}}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Password for website stored server-side.
# The value is taken from the HENRIAI_SERVER_PASSWORD environment variable and,
# when that is unset or empty, requested interactively, so the secret is never
# saved inside the notebook. Any password that was previously committed in this
# file should be considered exposed and be rotated.
from getpass import getpass

SERVER_PASSWORD = os.environ.get("HENRIAI_SERVER_PASSWORD") or getpass("HenriAI server password: ")
if not SERVER_PASSWORD:
    # An empty password would let an empty submission authenticate.
    raise RuntimeError("SERVER_PASSWORD must not be empty")

from pydantic import BaseModel

class AuthRequest(BaseModel):
    """Request body accepted by ``POST /auth``."""
    # Password submitted by the client; `auth` compares it with SERVER_PASSWORD.
    password: str

@app.post("/auth")
def auth(req: AuthRequest):
    """Check a submitted password against ``SERVER_PASSWORD``.

    Args:
        req: Body carrying the password to verify.

    Returns:
        ``{"ok": True}`` when the password matches, ``{"ok": False}`` otherwise.
    """
    import hmac  # local import keeps this block self-contained

    # Constant-time comparison: a plain `==` returns as soon as a character
    # differs, which leaks how much of the password was guessed correctly.
    # Both sides are encoded because compare_digest only accepts ASCII `str`.
    matches = hmac.compare_digest(
        req.password.encode("utf-8"),
        SERVER_PASSWORD.encode("utf-8"),
    )
    # NOTE(review): this endpoint only reports whether the password matched;
    # nothing visible here makes /generate require a prior successful /auth.
    return {"ok": matches}


In [5]:
import uvicorn, sys

def run_server():
    """Serve ``app`` with uvicorn on all interfaces at ``PORT``; the call
    does not return while the server is running."""
    server_options = {"host": "0.0.0.0", "port": PORT, "log_level": "info"}
    uvicorn.run(app, **server_options)

# Start FastAPI in background
t = threading.Thread(target=run_server, daemon=True)
t.start()
time.sleep(2)  # give uvicorn a moment to bind the port before the tunnel starts

# Start cloudflared tunnel (stderr is merged into stdout so one pipe carries all logs)
proc = subprocess.Popen(
    ["cloudflared", "tunnel", "--url", f"http://localhost:{PORT}", "--no-autoupdate"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)

# Scan cloudflared's log lines until the public trycloudflare.com URL appears.
public_url = None
for line in iter(proc.stdout.readline, ''):
    m = re.search(r"https://[a-z0-9-]+\.trycloudflare\.com", line)
    if m:
        public_url = m.group(0)
        print("\n=== Copy into config.js ===")
        print("window.COLAB_API_URL = \"" + public_url + "\";")
        print("===========================================\n")
        break

if public_url is None:
    # The loop only ends without a match when cloudflared closed its output,
    # i.e. the tunnel never came up; do not report success in that case.
    raise RuntimeError(
        f"cloudflared stopped (exit code: {proc.poll()}) before reporting a trycloudflare.com URL"
    )


def _drain_tunnel_output(stream):
    """Read and discard the remaining cloudflared log lines."""
    for _ in iter(stream.readline, ''):
        pass


# Keep consuming the pipe in the background: if nobody reads it, cloudflared
# blocks on write once the OS pipe buffer fills and the tunnel stalls.
threading.Thread(target=_drain_tunnel_output, args=(proc.stdout,), daemon=True).start()

print("Tunnel running.")

INFO:     Started server process [3605]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:7860 (Press CTRL+C to quit)



=== Copy into config.js ===
window.COLAB_API_URL = "https://mathematics-exactly-guided-bringing.trycloudflare.com";

Tunnel running.
