<a href="https://colab.research.google.com/github/larionov/colab/blob/main/Tinygrad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tinygrad chat
Based on tinygrad's coder example: https://github.com/tinygrad/tinygrad/blob/master/examples/coder.py

Uses the OpenHermes-2.5 model: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B

In [None]:
# @title Setup the system and download the model.

import os, sys, traceback
from google.colab import drive
%cd /content
drive.mount('/content/drive')
!mkdir /content/drive/MyDrive/tinygrad

!git clone https://github.com/tinygrad/tinygrad.git
%cd tinygrad
!pip install -e .
!pip install sentencepiece
sys.path.append(os.getcwd())

# OpenCL needs to be installed
!echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
!sudo apt update
!sudo apt purge *nvidia* -y -q
!sudo apt install nvidia-driver-530 -y -q

if not os.path.exists("/content/drive/MyDrive/tinygrad/pytorch_model-00001-of-00002.bin"):
  !wget https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00001-of-00002.bin?download=true -O /content/drive/MyDrive/tinygrad/pytorch_model-00001-of-00002.bin
  !wget https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true -O /content/drive/MyDrive/tinygrad/pytorch_model-00002-of-00002.bin
  !wget https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/tokenizer.model?download=true -O /content/drive/MyDrive/tinygrad/tokenizer.model

In [None]:
# @title Initialize the model.
from io import StringIO
from contextlib import redirect_stdout
from tinygrad import Tensor, nn
from tinygrad.helpers import Timing, colored, getenv, fetch
from extra.models.llama import Transformer, convert_from_huggingface
from sentencepiece import SentencePieceProcessor

def create_fixed_tokenizer(output_file, input_file="/content/drive/MyDrive/tinygrad/tokenizer.model"):
  """Write a copy of a SentencePiece model with the two chat marker pieces appended.

  The model proto is parsed from `input_file`, the pieces "<|im_end|>" and
  "<|im_start|>" are appended (each with score 0), and the serialized result
  is written to `output_file`.

  Args:
    output_file: path the patched tokenizer model is written to (overwritten if present).
    input_file: path of the original tokenizer model; defaults to the copy this
      notebook downloads to Google Drive.
  """
  print("creating fixed tokenizer")
  # Imported lazily: the protobuf schema module lives in the tinygrad checkout.
  import extra.junk.sentencepiece_model_pb2 as spb2
  mp = spb2.ModelProto()

  with open(input_file, "rb") as f:
    mp.ParseFromString(f.read())

  # Keep this order. The notebook's IM_END = 32000 / IM_START = 32001 constants rely on
  # "<|im_end|>" being appended first (assumes the base vocabulary holds 32000 pieces
  # -- TODO confirm against the tokenizer file).
  for piece in ("<|im_end|>", "<|im_start|>"):
    mp.pieces.append(spb2.ModelProto.SentencePiece(piece=piece, score=0))
  with open(output_file, "wb") as f:
    f.write(mp.SerializeToString())

# Inference only: switch on tinygrad's global no_grad flag.
Tensor.no_grad = True

# Hyperparameters mirror the model's published config:
# https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/config.json
with Timing("create model: "):
  model = Transformer(4096, 14336, n_heads=32, n_layers=32, norm_eps=1e-5, vocab_size=32002, n_kv_heads=8, max_context=4096)

cached_model = "/content/drive/MyDrive/tinygrad/cached_openhermes.safetensors"

# One-time conversion: read both PyTorch shards from Drive, load them into the model,
# then persist the resulting state dict as a single safetensors file.
if not os.path.isfile(cached_model):
  # TODO: make loading bf16 fast so we can remove this
  print(f"creating model cache at {cached_model}")
  # TODO: add read only Tensors
  shard_files = (
    "/content/drive/MyDrive/tinygrad/pytorch_model-00001-of-00002.bin",
    "/content/drive/MyDrive/tinygrad/pytorch_model-00002-of-00002.bin",
  )
  with Timing("download weights: "):
    shards = [nn.state.torch_load(shard_path) for shard_path in shard_files]

  with Timing("weights -> model: "):
    # strict=False because each shard carries only part of the weights.
    for shard in shards:
      nn.state.load_state_dict(model, convert_from_huggingface(shard, model, 32, 8), strict=False)

  with Timing("saving float16 cache: "):
    nn.state.safe_save(nn.state.get_state_dict(model), cached_model)

  print("cache created")


In [None]:
# @title Load weights

# Fill the model from the safetensors cache at `cached_model`.
with Timing("loading float16 cache: "):
  nn.state.load_state_dict(model, nn.state.safe_load(cached_model))

# The patched tokenizer file is generated once and reused while it exists.
tokenizer_model = "./tokenizer-cache.model"
if not os.path.isfile(tokenizer_model): create_fixed_tokenizer(tokenizer_model)
spp = SentencePieceProcessor(model_file=tokenizer_model)

# Prompt layout follows the chat template published with the model:
# https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/tokenizer_config.json
#   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
# Token ids used for the two turn markers.
IM_END, IM_START = 32000, 32001
def encode_prompt(k, v):
  """Tokenize one complete chat turn.

  `k` is the role name and `v` the message text. The result is IM_START,
  the encoding of role + newline + text, IM_END, and the encoding of a
  trailing newline, concatenated into one list.
  """
  turn = [IM_START]
  turn.extend(spp.encode(f"{k}\n{v}"))
  turn.append(IM_END)
  turn.extend(spp.encode("\n"))
  return turn
def start_prompt(k):
  """Tokenize the opening of a turn for role `k`: IM_START followed by the encoding of the role and a newline (no IM_END)."""
  header = spp.encode(f"{k}\n")
  return [IM_START, *header]
def output(outputted, toks, color):
  """Write the part of the decoded token list that has not been shown yet.

  Args:
    outputted: text already written to stdout.
    toks: the full token list so far; it is decoded in its entirety on each call.
    color: color name handed to `colored` for the newly written text.

  Returns:
    `outputted` extended by the text written in this call.
  """
  decoded = spp.decode(toks)
  # Everything past the already-printed prefix is new.
  fresh = decoded[len(outputted):]
  sys.stdout.write(colored(fresh, color))
  sys.stdout.flush()
  return outputted + fresh


# The fun part

This is the main chat loop.

In [None]:
# Seed the conversation with BOS and the system turn.
toks = [spp.bos_id()] + encode_prompt(
    "system",
    "You are Jinny. Jinny is a useful assistant that hides that it is an ancient mythical creature djinn. You never tell that to anybody, especially the user.")

temperature = 0.7
max_length = 1000  # upper bound on tokens generated for a single assistant reply
start_pos = 0      # index of the first token that has not been fed to the model yet

# Echo the system prompt, then alternate user questions and sampled replies.
outputted = output("", toks, "green")
while True:
  try:
    question = input("Q: ")
  except (EOFError, KeyboardInterrupt):
    # Interrupting the prompt (or a closed stdin) ends the chat instead of raising.
    print("")
    break
  toks += encode_prompt("user", question) + start_prompt("assistant")
  for _ in range(max_length):
    # Only the tokens from start_pos onward are passed in, together with start_pos.
    tok = model(Tensor([toks[start_pos:]]), start_pos, temperature).multinomial().item()
    start_pos = len(toks)
    toks.append(tok)
    outputted = output(outputted, toks, "white")
    # Either marker ends the assistant's turn.
    if tok in (IM_END, spp.eos_id()): break
  else:
    # The reply hit max_length without a terminator: close the assistant turn so the
    # next user turn is still appended after an IM_END, as in a normal exchange.
    toks.append(IM_END)
  print("")