In [1]:
import os
import json
from argparse import ArgumentParser
from typing import List

import torch
import torch.distributed as dist
from transformers import AutoTokenizer
from safetensors.torch import load_model

from model import Transformer, ModelArgs

In [2]:
from generate import sample, generate

In [3]:
# --- Run configuration -------------------------------------------------------
# The three paths default to the values this notebook was authored with. Each
# one can be overridden through an environment variable, so the notebook can be
# run on another machine without editing this cell.
#   DEEPSEEK_CKPT_PATH   -> ckpt_path  (tokenizer + safetensors weights directory)
#   DEEPSEEK_CONFIG      -> config     (JSON file expanded into ModelArgs)
#   DEEPSEEK_INPUT_FILE  -> input_file (text file with one prompt per line)
ckpt_path = os.environ.get("DEEPSEEK_CKPT_PATH", "/home/DeepSeek-V2-Lite-Chat_converted")
config = os.environ.get("DEEPSEEK_CONFIG", "configs/config_16B.json")
input_file = os.environ.get("DEEPSEEK_INPUT_FILE", "input_file.txt")

# Sampling settings; both are forwarded positionally to generate() further down.
max_new_tokens: int = 200
temperature: float = 0.2

# --- Global torch settings -----------------------------------------------------
# Floating-point tensors created from here on default to bfloat16.
torch.set_default_dtype(torch.bfloat16)
# Limit intra-op CPU parallelism to 8 threads.
torch.set_num_threads(8)
# Fixed seed for torch's global RNG so sampled completions are repeatable.
torch.manual_seed(965)

# Parse the JSON config into ModelArgs and echo the resolved arguments.
with open(config) as config_file:
    model_kwargs = json.load(config_file)
args = ModelArgs(**model_kwargs)
print(args)

# Construct the network while CUDA is the default device for new tensors.
with torch.device("cuda"):
    model = Transformer(args)

tokenizer = AutoTokenizer.from_pretrained(ckpt_path)

# Throwaway generation issued BEFORE the checkpoint is loaded; its decoded
# result is discarded. The arguments sit in the same positional slots as the
# main generate() call: 2 new tokens, -1 in the EOS-id slot, temperature 1.0.
# NOTE(review): presumably a warm-up pass -- keep it ahead of load_model, since
# moving or removing it may change what the seeded RNG yields afterwards.
warmup_prompt = tokenizer.encode("DeepSeek")
warmup_tokens = generate(model, [warmup_prompt], 2, -1, 1.)
tokenizer.decode(warmup_tokens[0])

# single-device: the weights file name encodes rank 0 of a 1-way model-parallel split
rank = 0
world_size = 1
weights_file = os.path.join(ckpt_path, f"model{rank}-mp{world_size}.safetensors")
load_model(model, weights_file)

# Read one prompt per line. Blank / whitespace-only lines are skipped so they
# are not submitted to the model as empty prompts (and do not count against
# the batch-size limit checked below).
with open(input_file) as f:
    prompts = [text for text in map(str.strip, f) if text]

# Fail early with a clear message instead of handing generate() an empty batch.
if not prompts:
    raise ValueError(f"No prompts found in {input_file!r}")
assert len(prompts) <= args.max_batch_size, f"Number of prompts exceeds maximum batch size ({args.max_batch_size})"

# Wrap every prompt as a single-turn user message and let the tokenizer's chat
# template append the generation prompt; the result is one entry per prompt.
prompt_tokens = [
    tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True)
    for prompt in prompts
]

# Generate for the whole batch at once, then decode with special tokens removed.
completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
completions = tokenizer.batch_decode(completion_tokens, skip_special_tokens=True)

# Print each prompt next to its completion, in input order.
for prompt, completion in zip(prompts, completions):
    print("Prompt:", prompt)
    print("Completion:", completion)

ModelArgs(max_batch_size=8, max_seq_len=16384, dtype='bf16', vocab_size=102400, dim=2048, inter_dim=10944, moe_inter_dim=1408, n_layers=27, n_dense_layers=1, n_heads=16, n_routed_experts=64, n_shared_experts=2, n_activated_experts=6, n_expert_groups=1, n_limited_groups=1, score_func='softmax', route_scale=1.0, q_lora_rank=0, kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128, original_seq_len=4096, rope_theta=10000.0, rope_factor=40, beta_fast=32, beta_slow=1, mscale=0.707)
Prompt: Hello!
Completion:  Hello! How can I help you today? If you have any questions or need assistance, feel free to ask.
Prompt: How are you?
Completion:  As an AI, I do not have feelings, but I am functioning properly and ready to assist you with any questions or tasks you have.
Prompt: Good night.
Completion:  Good night! Have a great rest and pleasant dreams.
