### ollama聊天模板

In [None]:
import torch
import os
# Configure PyTorch's CUDA caching allocator through its environment
# variable (max_split_size_mb limits the size of blocks it will split).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# Release cached CUDA memory held by PyTorch, if the API is present.
if hasattr(torch.cuda, 'empty_cache'):
      torch.cuda.empty_cache()

import ollama
import re
# Collect the 'name' field of every entry in ollama.list()['models'].
models_list = [model['name'] for model in  ollama.list()['models']]
print('models_list:\n', models_list)

In [33]:
#文档：https://github.com/ollama/ollama/blob/main/docs/modelfile.md
#ollama会自动调用一些共享内存，即使设置了无系统回退也会这样
def process_input(question):
  """Ask the ollama chat model *question* and stream the reply.

  The question is appended to the module-level ``messages`` history as a
  user turn, and that whole history is sent with the request. The assistant
  reply is NOT appended here; the caller does that. Because this is a
  generator, nothing happens until it is first iterated.

  Yields:
    Each streamed content chunk, with runs of newlines collapsed to one.
  """
  # Models tried by the author:
  #   gemma:2b, qwen:4b-chat-v1.5-q6_K, qwen:7b-chat, qwen:14b-chat,
  #   gemma:7b-instruct-q8_0
  #   qwen:7b-chat-v1.5-q5_0   -> like 14b, only a few rounds possible
  #   qwen:32b-chat-v1.5-q4_0  -> about 30s per character
  #   qwen:14b-chat            -> multi-round on CPU, acceptable speed
  #   qwen:32b-chat-v1.5-q2_K
  # The one in use handles long multi-round chats and is fairly fast.
  chat_model = 'qwen:4b-chat-v1.5-q6_K'

  # Meaning of the options:
  # https://github.com/ggerganov/llama.cpp/tree/master/examples/main#number-of-tokens-to-predict
  generation_options = {
      'num_predict': 512,       # max tokens to generate (default 128)
      'temperature': 0.7,       # default 0.8
      # 'top_p': 0.9            # default 0.9
      # 'top_k': 20             # default 40
      'num_ctx': 2048,          # default 2048
      'num_gpu': 6,             # number of model layers run on the GPU; 0 disables the GPU
      'repeat_penalty': 1.2,    # default 1.1
      'stop': ["AI assistant:"],  # stop sequence that ends generation
  }

  messages.append({'role': 'user', 'content': question})
  print('用户：', question)
  print('Bot：', end='')

  reply_stream = ollama.chat(
      model=chat_model,
      messages=messages,
      stream=True,
      options=generation_options,
  )
  for piece in reply_stream:
    # Collapse consecutive newlines inside the chunk before handing it on.
    yield re.sub('\n+', '\n', piece['message']['content'])

# Conversation history shared by the chat loop, seeded with the system prompt.
# Author's note: including a system message makes memory run out quickly.
messages = [{'role': 'system', 'content': 'you are a help assistant'}]
def get_system(messages):
  """Return the content of the last 'system' message, or None if there is none.

  Args:
    messages: list of chat message dicts with 'role' and 'content' keys.
  """
  system_contents = [entry['content'] for entry in messages if entry['role'] == "system"]
  return system_contents[-1] if system_contents else None
# Interactive chat loop: read a question, trim old history when it grows too
# long, stream the model's reply, and record it. An empty line ends the chat.
while True:
  question = input()
  if not question:
    break
  # Completed rounds so far = number of assistant replies in the history.
  epoch = sum(1 for message in messages if message['role'] == "assistant")
  # Trim the oldest rounds once the history reaches 15 rounds.
  if epoch >= 15:
    # Remember the latest system prompt (the history may contain several).
    curr_system = get_system(messages)
    # Number of oldest rounds to drop.
    cut_num = 4
    if cut_num > epoch:
      print('Error!,裁剪轮次超过对话记录！')
      break
    # Walk forward until cut_num assistant replies have been passed;
    # mess_ind then points just past the last reply being dropped.
    dropped = 0
    mess_ind = 0
    while dropped < cut_num:
      if messages[mess_ind]['role'] == 'assistant':
        dropped += 1
      mess_ind += 1

    # Slicing already produces a new list, so no extra copy is needed.
    messages = messages[mess_ind:]
    # If trimming removed the system prompt, put it back at the front.
    if curr_system is not None and not get_system(messages):
      messages.insert(0, {'role': 'system', 'content': curr_system})

    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
    if hasattr(torch.cuda, 'empty_cache'):
      torch.cuda.empty_cache()
  # Stream the reply to the console while accumulating it for the history.
  outword = ""
  for res in process_input(question):
    outword += res
    print(res, end='', flush=True)
  messages.append({'role': 'assistant', 'content': outword})
  print()

用户： test
Bot：This is a test response from an AI language model. Is there anything specific you would like me to assist with?

用户： test
Bot：I'm sorry, but I am not able to generate content or complete tasks on my own without further guidance and information provided by the user.
If there is something specific that you need assistance with, please let me know what it is so that I can provide you with the best possible advice and support.

you are a help assistant
None
[{'role': 'system', 'content': 'you are a help assistant'}, {'role': 'user', 'content': 'test'}, {'role': 'assistant', 'content': "I'm sorry, but I am not able to generate content or complete tasks on my own without further guidance and information provided by the user.\nIf there is something specific that you need assistance with, please let me know what it is so that I can provide you with the best possible advice and support.\n"}]
用户： test
Bot：I'm sorry, but again, as an AI language model, my capabilities are limited to 

In [None]:
# # Save the chat history
# len(messages)  # should be trimmed once it reaches 24 (it was 28 here)
import pickle
# with open(r'E:\LargeModel\Language_Model\Text_generation\Gemma\chat_message.pkl', 'wb') as save_file:
#     pickle.dump(messages, save_file)

# Load the chat history back from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# created yourself.
with open(r'E:\LargeModel\Language_Model\Text_generation\Gemma\chat_message.pkl', 'rb') as save_file:
    messages = pickle.load(save_file)
print(len(messages))

### 常规输入模型转为Prompt格式--ollama似乎不需要

In [None]:
def translate(input_list, is_translate=True, is_input=True):
    '''
    Convert chat messages into prompt-format text segments.

    Every dict item (expected to have 'role' and 'content' keys) becomes
    "<|im_start|>{role}\\n{content}<|im_end|>\\n"; non-dict items are passed
    through unchanged. The input list itself is never modified.

    input_list: chat messages (dicts) and/or already-rendered strings
    is_translate: whether to convert at all; when False, input_list is
        returned as-is (the same object)
    is_input: when True, append an open "<|im_start|>assistant\\n" segment so
        the model continues with the assistant's turn

    Returns a new list of segments (or input_list itself if not is_translate).
    '''
    if not is_translate:
        return input_list
    start = "<|im_start|>"
    end = "<|im_end|>"
    # isinstance (rather than an exact type comparison) also accepts dict
    # subclasses.
    segments = [
        f"{start}{item['role']}\n{item['content']}{end}\n"
        if isinstance(item, dict) else item
        for item in input_list
    ]
    if is_input:
        segments.append(f"{start}assistant\n")
    return segments

# Demo: render a short conversation in prompt format.
# First as model input (ends with an open assistant turn) ...
messages = [
    {'role': 'system', 'content': '你被邀请来陪用户聊天'},
    {'role': 'user', 'content': "你能跟我聊天吗"},
]
prompt_text = ''.join(translate(messages))
print(prompt_text)
print('*' * 40)
# ... then as a finished transcript, after recording the assistant's reply.
outword = '对不起，我不知道你的名字是什么。'
messages.append({'role': 'assistant', 'content': outword})
transcript_text = ''.join(translate(messages, is_input=False))
print(transcript_text)

### ai回复并发声

In [None]:

def text_voice(engine, rate=-1, volume=-1, voice=-1):
    '''
    Apply optional speech settings to a text-to-speech engine and return it.

    A value of -1 means "leave this setting unchanged".

    engine: object exposing setProperty(name, value) and getProperty(name)
    rate: speaking rate
    volume: volume, in [0, 1]
    voice: index into the engine's 'voices' list (author's note: 0 is a male
        voice and 1 a female voice; presumably depends on the installed voices)
    '''
    unset = -1
    for prop_name, prop_value in (('rate', rate), ('volume', volume)):
        if prop_value != unset:
            engine.setProperty(prop_name, prop_value)
    if voice != unset:
        # The engine wants the voice's id, not its position in the list.
        chosen_voice = engine.getProperty('voices')[voice]
        engine.setProperty('voice', chosen_voice.id)
    return engine

# Author's note: pyttsx3 fails to speak.
import pyttsx3
# Create the speech engine and set its speaking rate to 130.
engine = text_voice(pyttsx3.init(), rate=130)
# Read back the current settings for display.
r = engine.getProperty('rate')
ve = engine.getProperty('volume')
vs = engine.getProperty('voices')
print('目前语速是：',r,'\t','语音音量是:',ve)


import ollama
# import speech
# Stream a reply from the model, printing each chunk as it arrives and
# speaking the text phrase by phrase through the module-level `engine`.
stream = ollama.chat(
    model='gemma:2b',
    messages=[{'role': 'user', 'content': 'do you have a happy memorary'}],
    stream=True,
)
outword2=""  # the full reply received so far
outword=""   # text accumulated since the last spoken phrase
outword3=""  # the most recent chunk

for chunk in stream:
    outword3 = chunk['message']['content']
    outword2 += outword3
    outword += outword3
    # A chunk that is exactly one punctuation mark closes the current phrase.
    if outword3 in {',','.','。','，','!','！', '?', '？',':', '：'}:
        # Drop newlines and markdown asterisks before speaking.
        outword = outword.replace('\n','').replace('*','')
        # speech.say(outword)
        engine.say(outword)
        # Block until the phrase has been spoken. Run the engine only when
        # something was queued, not on every chunk.
        engine.runAndWait()
        outword=""
    # Print the chunk.
    print(outword3, end='', flush=True)

# Speak whatever is left when the reply does not end with a punctuation chunk.
outword = outword.replace('\n','').replace('*','')
if outword.strip():
    engine.say(outword)
    engine.runAndWait()
outword=""