[NeuralChat] Integrate PhotoAI backend into NeuralChat (#689)
letonghan committed Nov 20, 2023
1 parent becf6bf commit da138c4
Showing 21 changed files with 755 additions and 355 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/unit-test-neuralchat.yml
@@ -103,7 +103,9 @@ jobs:
&& pip install yacs uvicorn optimum optimum[habana] \
&& pip install sentence_transformers unstructured markdown rouge_score \
&& pip install einops \
&& pip install --upgrade accelerate"
&& pip install --upgrade accelerate \
&& pip install urllib3 \
&& pip install langid "
- name: Run UT
run: |
@@ -77,7 +77,7 @@ export IMAGE_SERVER_IP="your.server.ip"

# Configure photoai.yaml

You can customize the configuration file 'photoai.yaml' to match your environment setup. Here's a table to help you understand the configurable options:
You can customize the configuration file `photoai.yaml` to match your environment setup. Here's a table to help you understand the configurable options:

| Item | Value |
| ------------------- | ---------------------------------------|
@@ -91,6 +91,23 @@
| tasks_list | ['voicechat', 'photoai'] |


# Configure Environment Variables

Configure all of the environment variables in the file `run.sh` using `export XXX=xxx`. Here's a table of all the variables that need to be configured; a minimal sketch of how the backend can read them follows the table.

| Variable | Value |
| ------------------- | ---------------------------------------|
| MYSQL_HOST | 127.0.0.1 if you deploy MySQL on the local server |
| MYSQL_USER | default: 'root' |
| MYSQL_PASSWORD | password of the specified user |
| MYSQL_PORT | default: 3306 |
| MYSQL_DB | default: 'ai_photos' |
| IMAGE_SERVER_IP | The IP of the server where user-uploaded images are stored |
| IMAGE_ROOT_PATH | Local path under which uploaded images are stored |
| RETRIEVAL_FILE_PATH | Local path where retrieval files are stored |
| GOOGLE_API_KEY | Your Google API key, used to extract GPS information from images |
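
As a quick reference, here is a minimal Python sketch of how a backend process could pick these variables up once `run.sh` has exported them. The helper name and the fallback defaults are illustrative assumptions, not the actual PhotoAI code.

```python
import os

# Hypothetical helper that gathers the PhotoAI settings exported in run.sh.
# Keys mirror the table above; the defaults follow the documented ones.
def load_photoai_env():
    return {
        "mysql_host": os.environ.get("MYSQL_HOST", "127.0.0.1"),
        "mysql_user": os.environ.get("MYSQL_USER", "root"),
        "mysql_password": os.environ["MYSQL_PASSWORD"],       # no sensible default
        "mysql_port": int(os.environ.get("MYSQL_PORT", "3306")),
        "mysql_db": os.environ.get("MYSQL_DB", "ai_photos"),
        "image_server_ip": os.environ["IMAGE_SERVER_IP"],
        "image_root_path": os.environ["IMAGE_ROOT_PATH"],
        "retrieval_file_path": os.environ["RETRIEVAL_FILE_PATH"],
        "google_api_key": os.environ["GOOGLE_API_KEY"],
    }


if __name__ == "__main__":
    print(load_photoai_env())
```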


# Run the PhotoAI server
To start the PhotoAI server, use the following command:

@@ -21,10 +21,10 @@
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 9000
port: 7000

model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
device: "auto"
model_name_or_path: "./Llama-2-7b-chat-hf"
device: "cpu"

asr:
enable: true
@@ -49,5 +49,12 @@ ner:
spacy_model: "en_core_web_lg"
bf16: true

retrieval:
enable: true
args:
input_path: "./photoai_retrieval_docs/default/input_path"
persist_dir: "./photoai_retrieval_docs/default/persist_dir"
response_template: "We cannot find suitable content to answer your query."
append: False

tasks_list: ['voicechat', 'photoai']
tasks_list: ['textchat', 'voicechat', 'retrieval', 'photoai']
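
For context, a NeuralChat backend consumes a YAML like this through `NeuralChatServerExecutor`, the same pattern the askdoc launcher further down in this diff uses. A hypothetical photoai entry point could look like the sketch below; the import path and file names are assumptions, not code from this commit.

```python
# Hypothetical photoai launcher, mirroring the askdoc example in this commit.
# The import path is an assumption; adjust it to the actual NeuralChat layout.
from intel_extension_for_transformers.neural_chat.server.neuralchat_server import NeuralChatServerExecutor


def main():
    server_executor = NeuralChatServerExecutor()
    server_executor(
        config_file="./photoai.yaml",  # the configuration shown above
        log_file="./photoai.log")


if __name__ == "__main__":
    main()
```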
@@ -35,4 +35,4 @@ export MYSQL_PASSWORD="root"
export MYSQL_HOST="127.0.0.1"
export MYSQL_DB="ai_photos"

numactl -l -C 0-55 python -m photoai 2>&1 | tee run.log
nohup numactl -l -C 0-55 python -m photoai 2>&1 &
@@ -21,8 +21,8 @@
def main():
server_executor = NeuralChatServerExecutor()
server_executor(
config_file="./askgm.yaml",
log_file="./askgm.log")
config_file="./askdoc.yaml",
log_file="./askdoc.log")


if __name__ == "__main__":
@@ -20,7 +20,7 @@
#################################################################################
# SERVER SETTING #
#################################################################################
host: 127.0.0.1
host: 0.0.0.0
port: 8000

model_name_or_path: "./Llama-2-7b-chat-hf"
@@ -29,13 +29,10 @@ device: "auto"
retrieval:
enable: true
args:
input_path: "./docs"
persist_dir: "./example_persist"
input_path: "./askdoc_docs"
persist_dir: "./askdoc_persist"
response_template: "We cannot find suitable content to answer your query, please contact AskGM to find help. Mail: ask.gm.zizhu@intel.com."
append: True

safety_cheker:
enable: true
append: False

tasks_list: ['textchat', 'retrieval']

@@ -16,15 +16,15 @@
# limitations under the License.

# Kill the existing process and re-run
ps -ef |grep 'askgm' |awk '{print $2}' |xargs kill -9
ps -ef |grep 'askdoc' |awk '{print $2}' |xargs kill -9

# KMP
export KMP_BLOCKTIME=1
export KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,compact,1,0

# OMP
export OMP_NUM_THREADS=56
export OMP_NUM_THREADS=52
export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so

# tc malloc
Expand All @@ -35,4 +35,4 @@ export MYSQL_PASSWORD="root"
export MYSQL_HOST="127.0.0.1"
export MYSQL_DB="fastrag"

numactl -l -C 0-55 askdoc -m askgm 2>&1 | tee run.log
nohup numactl -l -C 0-51 python -m askdoc 2>&1 &
@@ -0,0 +1,193 @@
# !/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
from urllib.parse import urlparse, urlunparse
import multiprocessing
import urllib3
import langid
import PyPDF2
from bs4 import BeautifulSoup
import os
import re
from .context_utils import uni_pro


urllib3.disable_warnings()

class Crawler:
    def __init__(self, pool=None):
        if pool:
            assert isinstance(pool, (str, list, tuple)), 'url pool should be str, list or tuple'
        self.pool = pool
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng, \
                       */*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, \
                           like Gecko) Chrome/113.0.0.0 Safari/537.36'
        }
        self.fetched_pool = set()

    def get_sublinks(self, soup):
        sublinks = []
        for links in soup.find_all('a'):
            sublinks.append(str(links.get('href')))
        return sublinks

    def get_hyperlink(self, soup, base_url):
        sublinks = []
        for links in soup.find_all('a'):
            link = str(links.get('href'))
            if link.startswith('#') or link is None or link == 'None':
                continue
            suffix = link.split('/')[-1]
            if '.' in suffix and suffix.split('.')[-1] not in ['html', 'htmld']:
                continue
            link_parse = urlparse(link)
            base_url_parse = urlparse(base_url)
            if link_parse.path == '':
                continue
            if link_parse.netloc != '':
                # keep the crawler within the same domain
                if link_parse.netloc != base_url_parse.netloc:
                    continue
                sublinks.append(link)
            else:
                sublinks.append(urlunparse((base_url_parse.scheme,
                                            base_url_parse.netloc,
                                            link_parse.path,
                                            link_parse.params,
                                            link_parse.query,
                                            link_parse.fragment)))
        return sublinks

    def fetch(self, url, headers=None, max_times=5):
        if not headers:
            headers = self.headers
        # retry up to max_times before giving up
        while max_times:
            if not url.startswith('http://') and not url.startswith('https://'):
                url = 'http://' + url
            print(f'start fetch {url}...')
            try:
                response = requests.get(url, headers=headers, verify=True)
                if response.status_code != 200:
                    print(f'fail to fetch {url}, response status code: {response.status_code}')
                else:
                    return response
            except Exception as e:
                print(f'fail to fetch {url}, caused by {e}')
            max_times -= 1
        return None

    def process_work(self, sub_url, work):
        response = self.fetch(sub_url)
        if response is None:
            return []
        self.fetched_pool.add(sub_url)
        soup = self.parse(response.text)
        base_url = self.get_base_url(sub_url)
        sublinks = self.get_hyperlink(soup, base_url)
        if work:
            work(sub_url, soup)
        return sublinks

    def crawl(self, pool, work=None, max_depth=10, workers=10):
        url_pool = set()
        for url in pool:
            base_url = self.get_base_url(url)
            response = self.fetch(url)
            soup = self.parse(response.text)
            sublinks = self.get_hyperlink(soup, base_url)
            self.fetched_pool.add(url)
            url_pool.update(sublinks)
        depth = 0
        while len(url_pool) > 0 and depth < max_depth:
            print(f'current depth {depth} ...')
            mp = multiprocessing.Pool(processes=workers)
            results = []
            for sub_url in url_pool:
                if sub_url not in self.fetched_pool:
                    results.append(mp.apply_async(self.process_work, (sub_url, work)))
            mp.close()
            mp.join()
            url_pool = set()
            for result in results:
                sublinks = result.get()
                url_pool.update(sublinks)
            depth += 1

    def parse(self, html_doc):
        soup = BeautifulSoup(html_doc, 'lxml')
        return soup

    def download(self, url, file_name):
        print(f'download {url} into {file_name}...')
        try:
            r = requests.get(url, stream=True, headers=self.headers, verify=True)
            with open(file_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=512):
                    if chunk:
                        f.write(chunk)
        except Exception as e:
            print(f'fail to download {url}, caused by {e}')

    def get_base_url(self, url):
        result = urlparse(url)
        return urlunparse((result.scheme, result.netloc, '', '', '', ''))

    def clean_text(self, text):
        text = text.strip().replace('\r', '\n')
        text = re.sub(' +', ' ', text)
        text = re.sub('\n+', '\n', text)
        text = text.split('\n')
        return '\n'.join([i for i in text if i and i != ' '])





def load_html_data(url):
    crawler = Crawler()
    res = crawler.fetch(url)
    if res is None:
        return None
    soup = crawler.parse(res.text)
    all_text = crawler.clean_text(soup.select_one('body').text)
    main_content = ''
    for element_name in ['main', 'container']:
        main_block = None
        if soup.select(f'.{element_name}'):
            main_block = soup.select(f'.{element_name}')
        elif soup.select(f'#{element_name}'):
            main_block = soup.select(f'#{element_name}')
        if main_block:
            for element in main_block:
                text = crawler.clean_text(element.text)
                if text not in main_content:
                    main_content += f'\n{text}'
    main_content = crawler.clean_text(main_content)

    main_content = main_content.replace('\n', '')
    main_content = main_content.replace('\n\n', '')
    main_content = uni_pro(main_content)
    main_content = re.sub(r'\s+', ' ', main_content)

    # {'text': all_text, 'main_content': main_content}

    return main_content
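
A brief usage sketch for the crawler above. The URLs and the import path are illustrative placeholders, not part of this commit; the real module path depends on where this file lives in the NeuralChat package.

```python
# Illustrative usage of the Crawler / load_html_data defined above.
from crawler import Crawler, load_html_data


def print_title(url, soup):
    # Callback invoked for every fetched page during a crawl.
    title = soup.title.string if soup.title else 'no title'
    print(f'fetched {url}: {title}')


if __name__ == "__main__":
    # Extract the cleaned main content of a single page.
    text = load_html_data('https://example.com')
    print(text[:200] if text else 'fetch failed')

    # Crawl the same site two levels deep with a small worker pool.
    crawler = Crawler()
    crawler.crawl(['https://example.com'], work=print_title, max_depth=2, workers=4)
```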