[NeuralChat] Integrate PhotoAI backend into NeuralChat (#689)
Showing 21 changed files with 755 additions and 355 deletions.
...extension_for_transformers/neural_chat/pipeline/plugins/retrieval/indexing/html_parser.py (193 additions & 0 deletions)
@@ -0,0 +1,193 @@
# !/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
from urllib.parse import urlparse, urlunparse
import multiprocessing
import urllib3
import langid
import PyPDF2
from bs4 import BeautifulSoup
import os
import re
from .context_utils import uni_pro


urllib3.disable_warnings()


class Crawler:
    def __init__(self, pool=None):
        if pool:
            assert isinstance(pool, (str, list, tuple)), 'url pool should be str, list or tuple'
        self.pool = pool
        # browser-like request headers; adjacent string literals avoid embedding
        # continuation-line whitespace in the header values
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                      'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
        }
        self.fetched_pool = set()

    def get_sublinks(self, soup):
        sublinks = []
        for links in soup.find_all('a'):
            sublinks.append(str(links.get('href')))
        return sublinks

    def get_hyperlink(self, soup, base_url):
        sublinks = []
        for links in soup.find_all('a'):
            link = str(links.get('href'))
            # skip missing hrefs and in-page anchors
            if link == 'None' or link.startswith('#'):
                continue
            suffix = link.split('/')[-1]
            if '.' in suffix and suffix.split('.')[-1] not in ['html', 'htmld']:
                continue
            link_parse = urlparse(link)
            base_url_parse = urlparse(base_url)
            if link_parse.path == '':
                continue
            if link_parse.netloc != '':
                # keep the crawler within the same domain
                if link_parse.netloc != base_url_parse.netloc:
                    continue
                sublinks.append(link)
            else:
                # resolve relative links against the base URL
                sublinks.append(urlunparse((base_url_parse.scheme,
                                            base_url_parse.netloc,
                                            link_parse.path,
                                            link_parse.params,
                                            link_parse.query,
                                            link_parse.fragment)))
        return sublinks

    def fetch(self, url, headers=None, max_times=5):
        if not headers:
            headers = self.headers
        # prepend a scheme once if the url does not already have one
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://' + url
        while max_times:
            print(f'start fetch {url}...')
            try:
                response = requests.get(url, headers=headers, verify=True)
                if response.status_code != 200:
                    print(f'fail to fetch {url}, response status code: {response.status_code}')
                else:
                    return response
            except Exception as e:
                print(f'fail to fetch {url}, caused by {e}')
            max_times -= 1
        return None

    def process_work(self, sub_url, work):
        response = self.fetch(sub_url)
        if response is None:
            return []
        self.fetched_pool.add(sub_url)
        soup = self.parse(response.text)
        base_url = self.get_base_url(sub_url)
        sublinks = self.get_hyperlink(soup, base_url)
        if work:
            work(sub_url, soup)
        return sublinks

    def crawl(self, pool, work=None, max_depth=10, workers=10):
        url_pool = set()
        for url in pool:
            base_url = self.get_base_url(url)
            response = self.fetch(url)
            if response is None:
                continue
            soup = self.parse(response.text)
            sublinks = self.get_hyperlink(soup, base_url)
            self.fetched_pool.add(url)
            url_pool.update(sublinks)
        depth = 0
        while len(url_pool) > 0 and depth < max_depth:
            print(f'current depth {depth} ...')
            mp = multiprocessing.Pool(processes=workers)
            results = []
            for sub_url in url_pool:
                if sub_url not in self.fetched_pool:
                    results.append(mp.apply_async(self.process_work, (sub_url, work)))
            mp.close()
            mp.join()
            url_pool = set()
            for result in results:
                sublinks = result.get()
                url_pool.update(sublinks)
            depth += 1

    def parse(self, html_doc):
        soup = BeautifulSoup(html_doc, 'lxml')
        return soup

    def download(self, url, file_name):
        print(f'download {url} into {file_name}...')
        try:
            r = requests.get(url, stream=True, headers=self.headers, verify=True)
            with open(file_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=512):
                    if chunk:
                        f.write(chunk)
        except Exception as e:
            print(f'fail to download {url}, caused by {e}')

    def get_base_url(self, url):
        result = urlparse(url)
        return urlunparse((result.scheme, result.netloc, '', '', '', ''))

    def clean_text(self, text):
        text = text.strip().replace('\r', '\n')
        text = re.sub(' +', ' ', text)
        text = re.sub('\n+', '\n', text)
        text = text.split('\n')
        return '\n'.join([i for i in text if i and i != ' '])


def load_html_data(url):
    crawler = Crawler()
    res = crawler.fetch(url)
    if res is None:
        return None
    soup = crawler.parse(res.text)
    all_text = crawler.clean_text(soup.select_one('body').text)
    main_content = ''
    # prefer the page's main/container block over the full body text
    for element_name in ['main', 'container']:
        main_block = None
        if soup.select(f'.{element_name}'):
            main_block = soup.select(f'.{element_name}')
        elif soup.select(f'#{element_name}'):
            main_block = soup.select(f'#{element_name}')
        if main_block:
            for element in main_block:
                text = crawler.clean_text(element.text)
                if text not in main_content:
                    main_content += f'\n{text}'
    main_content = crawler.clean_text(main_content)

    main_content = main_content.replace('\n', '')
    main_content = uni_pro(main_content)
    main_content = re.sub(r'\s+', ' ', main_content)

    # {'text': all_text, 'main_content': main_content}

    return main_content
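
For context, a minimal usage sketch of the new parser follows. It is not part of the commit; the import path is an assumption based on the file location shown above, and the example URL and callback are hypothetical.

# Usage sketch (assumed import path; example URL and callback are hypothetical).
from intel_extension_for_transformers.neural_chat.pipeline.plugins.retrieval.indexing.html_parser import (
    Crawler,
    load_html_data,
)

def report_page(url, soup):
    # hypothetical callback: receives each fetched URL and its parsed BeautifulSoup tree
    print(url, len(soup.get_text()))

if __name__ == '__main__':
    # extract the cleaned main content of a single page
    print(load_html_data('https://example.com/docs/index.html'))
    # crawl same-domain sublinks two levels deep with 4 worker processes
    Crawler().crawl(['https://example.com/docs/'], work=report_page, max_depth=2, workers=4)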