In [1]:
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig
# import os
# os.environ["http_proxy"]="127.0.0.1:7890"
# os.environ["https_proxy"]="127.0.0.1:7890"

# Hugging Face Hub identifier of the chat checkpoint used throughout the notebook.
model_id = "Qwen/Qwen1.5-7B-Chat"

# bitsandbytes settings: load weights in 4-bit NF4, compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The {"": 0} device map assigns every module of the model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Device that tokenized inputs are moved to before calling the model.
device = "cuda"

# Sample English sentence; not referenced by the other visible cells.
text = (
    "For quantized models, we advise you to use the GPTQ, AWQ, "
    "and GGUF correspondents."
)


def format_chat_prompt(message, chat_history):
    """Render a conversation as a plain-text ``User:`` / ``Assistant:`` transcript.

    Args:
        message: The new user message that the model should answer.
        chat_history: Iterable of ``(user_message, bot_message)`` pairs
            (tuples or two-element lists), or ``None`` when there is no
            previous conversation.

    Returns:
        The prompt string. Every turn is rendered as
        ``"\\nUser: <user>\\nAssistant: <bot>"`` and the prompt ends with an
        open ``"\\nAssistant:"`` cue for the model to continue. The leading
        newline of the original format is kept so existing callers see the
        same text.
    """
    segments = []
    # A missing history is treated the same as an empty one.
    for user_message, bot_message in (chat_history or []):
        # A turn whose side is still None is rendered as empty text rather
        # than the literal string "None".
        user_text = "" if user_message is None else user_message
        bot_text = "" if bot_message is None else bot_message
        segments.append(f"\nUser: {user_text}\nAssistant: {bot_text}")
    segments.append(f"\nUser: {message}\nAssistant:")
    # Single join instead of re-formatting the growing prompt on every turn.
    return "".join(segments)

def inferenceChat(message, chat_history):
    """Answer one chat message and record the exchange in the history.

    Args:
        message: Text typed by the user.
        chat_history: List of ``(user_message, bot_message)`` pairs from the
            chatbot component, or ``None`` when the conversation is empty.
            A provided list is extended in place.

    Returns:
        A ``("", chat_history)`` tuple: the empty string clears the input
        textbox and the history now includes the new exchange.
    """
    if chat_history is None:
        chat_history = []
    # Nothing to answer: leave the conversation untouched instead of asking
    # the model to reply to an empty prompt.
    if not message or not message.strip():
        return "", chat_history

    formatted_prompt = format_chat_prompt(message, chat_history)
    inputs = tokenizer(formatted_prompt, return_tensors='pt').to(device)
    outputs = model.generate(**inputs, max_new_tokens=512)

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Decode only the continuation; otherwise the stored reply
    # would contain the whole transcript and the prompt would grow
    # recursively with every turn.
    prompt_length = inputs["input_ids"].shape[-1]
    reply_ids = outputs[0][prompt_length:]
    bot_message = tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

    chat_history.append((message, bot_message))
    return "", chat_history

def _generate_reply(system_prompt, text, outlen):
    """Run one system+user exchange through the chat model and return the reply.

    Shared implementation for the task-specific helpers below, which differ
    only in their system prompt.

    Args:
        system_prompt: Instruction placed in the ``system`` role.
        text: User-supplied text placed in the ``user`` role.
        outlen: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded reply without the prompt and without special tokens, or
        an empty string when ``text`` is ``None`` or blank.
    """
    # Skip generation entirely when there is nothing to process.
    if text is None or not str(text).strip():
        return ""

    task = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    chat_template = tokenizer.apply_chat_template(
        task,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([chat_template], return_tensors='pt').to(device)

    # Unpack the whole encoding so the attention mask reaches generate()
    # together with the input ids.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=int(outlen),  # UI widgets may deliver floats
    )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = model_inputs.input_ids.shape[1]
    reply_ids = generated_ids[:, prompt_length:]
    return tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]


def inference2C(text, outlen=512):
    """Translate an English sentence into Chinese.

    Args:
        text: English source text.
        outlen: Maximum number of new tokens to generate.

    Returns:
        The model's reply as a string ("" for blank input).
    """
    return _generate_reply(
        "You will be provided with a sentence in English, and your task is to translate it into Chinese.",
        text,
        outlen,
    )


def inference2E(text, outlen=512):
    """Translate a Chinese sentence into English.

    Args:
        text: Chinese source text.
        outlen: Maximum number of new tokens to generate.

    Returns:
        The model's reply as a string ("" for blank input).
    """
    return _generate_reply(
        "You will be provided with a sentence in Chinese, and your task is to translate it into English.",
        text,
        outlen,
    )


def inferenceRw(text, outlen=512):
    """Ask the model to rewrite the text while keeping its meaning.

    Args:
        text: Text to rewrite.
        outlen: Maximum number of new tokens to generate.

    Returns:
        The model's reply as a string ("" for blank input).
    """
    return _generate_reply(
        "Rewrite the text without changing the meaning of the sentence.",
        text,
        outlen,
    )


def inferenceGramma(text, outlen=1024):
    """Ask the model to correct grammatical errors in the text.

    Args:
        text: Text to check.
        outlen: Maximum number of new tokens to generate.

    Returns:
        The model's reply as a string ("" for blank input).
    """
    return _generate_reply(
        "Correct grammatical errors of the text, do not modify them if there are no gramatical mistakes.",
        text,
        outlen,
    )


def inferenceSummary(text, withinchar=100, outlen=512):
    """Ask the model to summarise the text within a word budget.

    Args:
        text: Text to summarise.
        withinchar: Word limit inserted into the system prompt. It is cast
            to ``int`` so a float such as ``50.0`` coming from a slider is
            rendered as ``50``.
        outlen: Maximum number of new tokens to generate.

    Returns:
        The model's reply as a string ("" for blank input).
    """
    return _generate_reply(
        "Make a summary of the text within {char} words".format(char=int(withinchar)),
        text,
        outlen,
    )


In [3]:
import gradio as gr

def _copyable_output(label):
    """Create a result textbox with a visible label and a copy button."""
    return gr.Textbox(label=label, show_label=True, show_copy_button=True)


with gr.Blocks() as demo:
    gr.Markdown("# Assistants based on Qwen1.5-7B")

    # One heading + Interface pair per single-shot task.
    gr.Markdown('## Translate English to Chinese')
    gr.Interface(
        fn=inference2C,
        inputs=[gr.Textbox(label='English to Chinese.')],
        outputs=[_copyable_output('Translation results.')],
    )

    gr.Markdown('## Translate Chinese to English')
    gr.Interface(
        fn=inference2E,
        inputs=[gr.Textbox(label='Chinese to English.')],
        outputs=[_copyable_output('Translation results.')],
    )

    gr.Markdown('## Rewrite')
    gr.Interface(
        fn=inferenceRw,
        inputs=[gr.Textbox(label='Input text')],
        outputs=[_copyable_output('Output')],
    )

    gr.Markdown('## Grammar correction')
    gr.Interface(
        fn=inferenceGramma,
        inputs=[gr.Textbox(label='Input Text')],
        outputs=[_copyable_output('Output')],
    )

    gr.Markdown('## Summary')
    gr.Interface(
        fn=inferenceSummary,
        inputs=[
            gr.Textbox(label='Input Text'),
            gr.Slider(label='Max words', value=50, maximum=200, minimum=5),
        ],
        outputs=[_copyable_output('Output')],
    )

    # Free-form chat: the Submit button and pressing Enter in the textbox
    # both trigger the same handler.
    gr.Markdown('## Just Chat')
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label='Inputs')
    btn = gr.Button('Submit')
    clear = gr.ClearButton(components=[msg, chatbot], value='Clear')
    for register in (btn.click, msg.submit):
        register(inferenceChat, inputs=[msg, chatbot], outputs=[msg, chatbot])

# Shut down demos left running by earlier executions of this cell, then
# launch. NOTE: share=True also publishes a public *.gradio.live URL.
gr.close_all()
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://a9235217295483156b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/home/tony/anaconda3/lib/python3.11/site-packages/gradio/queueing.py", line 527, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tony/anaconda3/lib/python3.11/site-packages/gradio/route_utils.py", line 261, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tony/anaconda3/lib/python3.11/site-packages/gradio/blocks.py", line 1786, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tony/anaconda3/lib/python3.11/site-packages/gradio/blocks.py", line 1338, in call_function
    prediction = await anyio.to_thread.run_sync(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tony/anaconda3/lib/python3.11/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread