In [20]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Install llama-cpp-python pinned to 0.3.4, adding the extra index that serves the cu122 wheels.
!python3 -m pip install --no-cache-dir llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122
# Install the packages used by the web-search helpers below (versions are not pinned here).
!python3 -m pip install googlesearch-python bs4 charset-normalizer requests-html lxml_html_clean

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu122
Collecting llama-cpp-python==0.3.4
  Downloading https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl (445.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m445.2/445.2 MB[0m [31m295.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: llama-cpp-python
Successfully installed llama-cpp-python-0.3.4
Found existing installation: llama_cpp_python 0.3.4
Uninstalling llama_cpp_python-0.3.4:
  Successfully uninstalled llama_cpp_python-0.3.4
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 MB[0m [31m222.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [

In [22]:
import torch
# Fail fast when the notebook is not attached to a CUDA-capable runtime.
if torch.cuda.is_available():
    print('You are good to go!')
else:
    raise Exception('You are not using the GPU runtime. Change it first or you will suffer from the super slow inference speed!')

You are good to go!


### 架好LLM和inference function `generate_response`

In [23]:
# Ask PyTorch to drop its cached CUDA memory before the model is loaded.
# NOTE(review): presumably intended to free VRAM for the llama.cpp model; this call only
# touches PyTorch's own cache -- confirm it has the desired effect.
torch.cuda.empty_cache()

In [24]:
from llama_cpp import Llama
# Inference helper; the model itself is loaded onto the GPU in LLMAgent.__init__.

def generate_response(
    _model: "Llama",
    _messages: "list[dict[str, str]]",
    max_tokens: int = 512,
    temperature: float = 0,
    repeat_penalty: float = 2.0,
) -> str:
    '''
    Run one chat completion on the model and return the reply text.

    Args:
        _model: Object exposing ``create_chat_completion`` (a ``llama_cpp.Llama``).
        _messages: Chat messages, a list of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Maximum number of tokens the model may generate.
        temperature: Sampling randomness. 0 means no randomness, so the same
            input gives the same output every time.
        repeat_penalty: Penalty applied to repeated tokens. The default of 2.0
            penalizes repetition strongly.

    Returns:
        The ``content`` of the first choice's message.
    '''
    _output = _model.create_chat_completion(
        _messages,
        # stop=["<|eot_id|>", "<|end_of_text|>"],  # Llama special tokens; generation would halt on them
        max_tokens=max_tokens,
        temperature=temperature,
        repeat_penalty=repeat_penalty,
    )["choices"][0]["message"]["content"]
    return _output

### 網路搜尋函式，讓agent可以上網找資訊

In [25]:
from typing import List
from googlesearch import search as _search
from bs4 import BeautifulSoup
from charset_normalizer import detect
import asyncio
from requests_html import AsyncHTMLSession
import urllib3
urllib3.disable_warnings()

async def worker(s: "AsyncHTMLSession", url: str):
    '''
    Fetch one URL and return its body text, or None if it cannot be used.

    A HEAD request is sent first; the page is only downloaded when the
    reported Content-Type contains ``text/html``. Each request is limited
    to 10 seconds and certificate verification is disabled.

    Args:
        s: Session whose ``head`` / ``get`` calls return awaitables.
        url: Address to fetch.

    Returns:
        The response text, or None for non-HTML content, timeouts, and any
        other request error.
    '''
    try:
        header_response = await asyncio.wait_for(s.head(url, verify=False), timeout=10)
        if 'text/html' not in header_response.headers.get('Content-Type', ''):
            return None
        r = await asyncio.wait_for(s.get(url, verify=False), timeout=10)
        return r.text
    except Exception:
        # Best effort: a failing page is skipped. `Exception` (not a bare except) so that
        # task cancellation and KeyboardInterrupt still propagate.
        return None

async def get_htmls(urls):
    '''
    Fetch all URLs concurrently with one shared session.

    Args:
        urls: Iterable of addresses to fetch.

    Returns:
        A list aligned with ``urls``; each entry is whatever ``worker``
        returned for that URL (page text or None).
    '''
    session = AsyncHTMLSession()
    try:
        return await asyncio.gather(*(worker(session, url) for url in urls))
    finally:
        # Release the session's connections instead of leaking one session per search.
        closing = session.close()
        if asyncio.iscoroutine(closing):
            # AsyncHTMLSession.close is expected to be a coroutine; the check keeps this
            # working if an installed version exposes a synchronous close instead.
            await closing

# keyword 關鍵字 / 回覆數量 n_results (不能太高喔 => 會收到 HTTP 429 error)
async def search(keyword: str, n_results: int=3) -> List[tuple]:
    '''
    Search the web for ``keyword`` and return text from the first usable pages.

    Warning: You may suffer from HTTP 429 errors if you search too many times in a period of time.
    The rate limit is not explicitly announced by Google, so keep ``n_results`` small; apart from
    changing the IP or waiting for the ban to lift there is not much that can be done.

    Args:
        keyword: Query string; only the first 100 characters are used.
        n_results: Maximum number of pages to return.

    Returns:
        Up to ``n_results`` ``(url, text)`` tuples, in search-result order. ``text`` is the
        page text with all whitespace removed.
    '''
    keyword = keyword[:100]
    # Ask for twice as many URLs as needed, since some of them will turn out to be unusable.
    urls = list(_search(keyword, n_results * 2, lang="zh", unique=True))
    # Download the pages; non-HTML or failing URLs come back as None.
    htmls = await get_htmls(urls)

    url_text_pairs = []
    for url, html in zip(urls, htmls):
        if len(url_text_pairs) >= n_results:
            # Enough pages collected; skip parsing the rest.
            break
        if html is None:
            continue
        # Parse the HTML into a BeautifulSoup object.
        bs_object = BeautifulSoup(html, 'html.parser')
        # Keep only pages whose re-encoded content is detected as utf-8.
        if detect(bs_object.encode()).get('encoding') == 'utf-8':
            # Drop every whitespace character to make the text as compact as possible.
            text_content = ''.join(bs_object.get_text().split())
            url_text_pairs.append((url, text_content))

    return url_text_pairs

In [30]:
class LLMAgent():
    '''
    An agent that answers a query with a local LLM, using web search results as references.

    Attributes:
        role_description: Who the agent should act like (used as the system prompt).
        task_description: What task the agent should solve.
        llm_path: Path of the model file passed to ``Llama``.
        n_ctx: Context window size, in tokens, the model is loaded with.
        llm: The loaded ``Llama`` model.
        tokenizer: Tokenizer obtained from ``llm``; used to measure and truncate prompts.
    '''

    # Tokens of the context window kept free of prompt text (room for the reply).
    RESPONSE_RESERVE_TOKENS = 4096
    # The system prompt is cut to at most this many tokens.
    SYSTEM_PROMPT_MAX_TOKENS = 512
    # Number of web pages requested per query.
    N_SEARCH_RESULTS = 3

    def __init__(self, role_description: str, task_description: str, llm_path: str, n_ctx: int = 16384):
        '''
        Load the model and remember the agent's role and task.

        Args:
            role_description: Who this agent should act like, e.g. a history expert.
            task_description: Instruction describing the task to solve.
            llm_path: Path of the model file to load.
            n_ctx: Context window size in tokens (default 16384).
        '''
        self.role_description = role_description
        self.task_description = task_description
        self.llm_path = llm_path
        self.n_ctx = n_ctx
        self.llm = Llama(model_path=self.llm_path, verbose=False, n_gpu_layers=-1, n_ctx=self.n_ctx)
        self.tokenizer = self.llm.tokenizer()

    def _truncate_references(self, search_results, token_budget: int) -> list:
        '''
        Turn ``(url, text)`` pairs into reference strings that together fit ``token_budget``.

        Each result gets an equal share of the budget that is still left, so tokens
        unused by a short page are available to the pages after it. The budget is
        reduced by the number of tokens actually kept, and never goes negative.
        '''
        references = []
        remaining = max(token_budget, 0)
        for index, (url, text) in enumerate(search_results):
            share = remaining // (len(search_results) - index)
            kept = self.tokenizer.encode(f"({url}) {text}")[:share]
            references.append(self.tokenizer.decode(kept))
            remaining -= len(kept)
        return references

    async def inference(self, message: str) -> str:
        '''
        Search the web for ``message``, build the prompt, and return the model's answer.

        Args:
            message: The user's query; also used as the search keyword.

        Returns:
            The reply text produced by ``generate_response``.
        '''
        search_results = await search(message, n_results=self.N_SEARCH_RESULTS)

        # Fixed part of the user prompt; the references are appended after it.
        base_content = (
            f"Task: {self.task_description}\n"
            f"Query: {message}\n"
            f"Reference Data:\n"
        )

        # Tokens available for the references = context window - reply reserve - fixed prompt.
        max_input_tokens = self.n_ctx - self.RESPONSE_RESERVE_TOKENS
        used_tokens = len(self.tokenizer.encode(base_content))
        reference_info = self._truncate_references(search_results or [], max_input_tokens - used_tokens)

        messages = [
            {
                "role": "system",
                "content": self.tokenizer.decode(
                    self.tokenizer.encode(
                        f"{self.role_description}\nCurrent knowledge cutoff: May 2025"
                    )[:self.SYSTEM_PROMPT_MAX_TOKENS]
                )
            },
            {
                "role": "user",
                "content": base_content + "\n".join(reference_info)
            }
        ]
        return generate_response(self.llm, messages)

上網搜尋功能可以了

In [31]:
# search_results = await search("名偵探柯南：貝克街的亡靈講了什麼？", n_results = 3)
# reference_info = "\n".join(
#     f"({url}) {text}" for url, text in search_results
# ) if search_results else "No search results found"
# print(reference_info)


## Test
因為Qwen-14b太大了，只要load進去GPU，RAM就佔了10.8GB，每次都要重新restart清空GPU。

In [32]:
# Question to ask the agent.
message = "名偵探柯南：貝克街的亡靈講了什麼？"

# Role: a film critic who is good at explaining movies.
role_1 = "很會解釋電影的影評"
# Task: explain the Conan movie in Traditional Chinese.
task_2 = "用繁體中文解釋柯南電影"
# Build the agent from the role and task defined above; the model file is read from Google Drive.
Agent1 = LLMAgent(role_1, task_2, llm_path='/content/drive/MyDrive/Colab Notebooks/ML_models/DarkIdol-Llama-3.1-8B-Instruct-1.2-Uncensored-IQ2_M.gguf')



llama_new_context_with_model: n_ctx_per_seq (16384) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [33]:
# Ask the agent the question; the cell displays the returned answer string.
await Agent1.inference("名偵探柯南：貝克街的亡靈講了什麼？")

'《名侦探柯南：贝克街的亡灵》是一部动画电影，讲述了两位主角——工藤新一（也就是江户川小兰）和他的朋友们在体验由“茧”构建的人造世界时遇到的事件。他们选择穿越到19世纪伦敦去破解历史上的悬案之一：“开膛手杰克”的连环杀人。这部电影以一位名叫泽田弘树的天才儿童，在相继研发出“DNA追踪系统”和 “诺亚方舟AI ”之后自殺離世为開場，讲述了柯南等人的体验过程。'

原本的:
《名偵探柯南：貝克街的亡靈》是1986年上映的一部經典動畫電影，也是所有漫迷心中的白月光。這是一部充滿了對福爾摩斯致敬意味的作品。\n\n**一、故事背景與設定**\n影片將舞台設置在20世紀初的大正時代日本和現代的東京兩大時空交織下進行探案解謎的故事，柯南一行人被卷入了一個涉及虛擬城市“米花市”的神秘案件。這個城市的建立是為了慶祝一個名為吉田鶴右衛門的人而命名。\n\n**二、主要情節**\n1. **現代東京部分：尋找真相的開始與發展過程中的重重困難阻礙柯南等人解開了連環殺人案，並揭露出幕後黑手。同時還阻止了一場針對米花市數據庫的大規模破壞行動。\n2.“大正時代”日本的部分則是通過電腦模擬的方式進行虛構的探險，在這個過程中他們遇到了許多困難和挑戰。\n\n**三、主題思想**\n1. **對真相與歷史責任感：影片強調了對於過去錯誤行為應該承擔起相應的责任，並努力去修正它們。\n2.“科技”雙刃劍的概念也得到了很好的體現。一方面它可以幫助人們解決問題；另一方面也可能會帶來新的挑戰和風險。\n\n**四、角色塑造**\n1. 柯南：他展露出極高的智慧與推理能力，在整個案件中起到了關鍵作用，並且表現出了對於真相的執著追求。\n2.“工藤新一”則是通過柯楠這個形象來進行自我成長。他在面對困難時始終保持樂觀積極的心態。\n\n總之，《名偵探コナン：ベイカー街の亡霊》是一部兼具娛樂性和教育意義的作品，它不僅僅是一個簡單的推理故事還讓我們看到了人性中的光明面以及對於過去錯誤行為應該承擔起相應責任的重要性。

## 加入web search
《名侦探柯南：贝克街的亡灵》是20世纪初一部极具影响力的电影，它巧妙地将现实与虚拟世界结合，并通过一个引人入胜的故事探讨了社会问题和人性。这部电影不仅是一部娱乐作品,更是一次对经典文学致敬。\n\n**一、故事背景**\n\n影片以天才少年泽田弘树的自杀为开端——他因无法适应日本教育体系而前往美国，最终却在完成“诺亚方舟”人工智能后选择结束自己的生命。“茧”的虚拟现实游戏本应是场娱乐体验,但因为"諾亞之船"(Noah\'s Ark)的人工智能系统入侵，“50名孩子中只要有一个通关便放过他们”，否则将用特殊电磁波杀害所有参与者。柯南一行人选择了1893年伦敦的贝克街，试图解开开膛手杰克试图谋杀艾琳·爱德华斯的历史悬案。\n\n**二、主要情节**\n\n- **现实世界**\n  - 柯楠等人受邀参加“茧”的发布会,但游戏开始后却陷入危机。\n  \n    在现实中，“诺亚方舟”是泽田弘树创造的人工智能，它入侵了虚拟系统，并切断所有与外界的联系。柯南的父亲——著名侦探作家优作也出席活动并负责监修历史事件。\n\n- **虚幻世界**\n  - 柯楠一行人选择1893年伦敦作为游戏场景。\n    在这里,他们遇到了哈德森太太、莫里亚蒂教授和开膛手杰克。柯南通过推理发现，被杀害的女性是凶手的母亲，并且他因为怨恨而杀死了她。\n\n**三、“诺亞之船”的目的**\n\n“諾亜方舟”希望改变日本社会现状,避免世袭制导致的社会问题。“茧"游戏参与者多为富二代、权贵后代。在现实世界中，他们骄纵跋扈；但在虚拟游戏中却不得不放下身段与柯南等人合作。\n\n**四、“诺亞之船”的启示**\n\n1. **教育体制的反思**\n   - 日本现行制度无法包容像泽田弘树这样极具天赋但性格孤僻的孩子。他被老师和同学视为怪胎，最终只能选择离开日本前往美国。\n2 . 社会阶层固化\n    富二代们在现实世界中骄奢淫逸,但在虚拟游戏中却不得不放下身段