labring · yangchuansheng · Nov 15, 2023 · Nov 7, 2023 · Nov 7, 2023 · Nov 10, 2023
diff --git a/.github/workflows/fastgpt-image-personal.yml b/.github/workflows/fastgpt-image-personal.yml
@@ -0,0 +1,52 @@
+# Builds the FastGPT app image and pushes it to the GitHub Container Registry
+# namespace of the repository owner. The job is skipped in labring/FastGPT, so it
+# only runs in other repositories (e.g. personal forks).
+# Requires a `GH_PAT` secret that is allowed to push packages to ghcr.io.
+name: Build FastGPT images in Personal warehouse
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - 'projects/app/**'
+      - 'packages/**'
+    branches:
+      - 'main'
+jobs:
+  build-fastgpt-images:
+    # NOTE(review): GitHub has announced retirement of the ubuntu-20.04 hosted
+    # runner image - confirm and bump to a supported image if needed.
+    runs-on: ubuntu-20.04
+    if: github.repository != 'labring/FastGPT'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          driver-opts: network=host
+      - name: Cache Docker layers
+        uses: actions/cache@v3
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GH_PAT }}
+      - name: Set DOCKER_REPO_TAGGED
+        run: |
+          # Image references must be lowercase, but an owner name may contain
+          # capital letters, so lowercase it with bash's ${var,,} expansion.
+          echo "DOCKER_REPO_TAGGED=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/fastgpt:latest" >> "$GITHUB_ENV"
+      - name: Build and publish image
+        env:
+          DOCKER_REPO_TAGGED: ${{ env.DOCKER_REPO_TAGGED }}
+        run: |
+          # The source label points at the repository this workflow runs in, so the
+          # package is linked correctly even when the fork is not named "FastGPT".
+          docker buildx build \
+          --build-arg name=app \
+          --label "org.opencontainers.image.source=https://github.com/${{ github.repository }}" \
+          --label "org.opencontainers.image.description=fastgpt image" \
+          --push \
+          --cache-from=type=local,src=/tmp/.buildx-cache \
+          --cache-to=type=local,dest=/tmp/.buildx-cache \
+          -t "${DOCKER_REPO_TAGGED}" \
+          -f Dockerfile \
+          .
diff --git a/.github/workflows/fastgpt-image.yml b/.github/workflows/fastgpt-image.yml
@@ -5,8 +5,6 @@ on:
     paths:
       - 'projects/app/**'
       - 'packages/**'
-    branches:
-      - 'main'
     tags:
       - 'v*.*.*'
 jobs:
@@ -53,9 +51,8 @@ jobs:
           docker buildx build \
           --build-arg name=app \
           --platform linux/amd64,linux/arm64 \
-          --label "org.opencontainers.image.source=  https://github.com/  ${{ github.repository_owner }}/FastGPT" \
+          --label "org.opencontainers.image.source=https://github.com/${{ github.repository_owner }}/FastGPT" \
           --label "org.opencontainers.image.description=fastgpt image" \
-          --label "org.opencontainers.image.licenses=Apache" \
           --push \
           --cache-from=type=local,src=/tmp/.buildx-cache \
           --cache-to=type=local,dest=/tmp/.buildx-cache \

diff --git a/.github/workflows/preview-image.yml b/.github/workflows/preview-image.yml
@@ -24,7 +24,7 @@ jobs:
         with:
           driver-opts: network=host
       - name: Cache Docker layers
-        uses: actions/cache@v2
+        uses: actions/cache@v3
         with:
           path: /tmp/.buildx-cache
           key: ${{ runner.os }}-buildx-${{ github.sha }}
@@ -48,6 +48,7 @@ jobs:
           --label "org.opencontainers.image.source=  https://github.com/  ${{ github.repository_owner }}/FastGPT" \
           --label "org.opencontainers.image.description=fastgpt-pr image" \
           --label "org.opencontainers.image.licenses=Apache" \
+          --push \
           --cache-from=type=local,src=/tmp/.buildx-cache \
           --cache-to=type=local,dest=/tmp/.buildx-cache \
           -t ${DOCKER_REPO_TAGGED} \

diff --git a/docSite/content/docs/installation/upgrading/46.md b/docSite/content/docs/installation/upgrading/46.md
@@ -7,20 +7,20 @@ toc: true
 weight: 836
 ---
 
-未正式发布。
+# V4.6 版本加入了简单的团队功能，可以邀请其他用户进来管理资源。该版本升级后无法执行旧的升级脚本，且无法回退。
 
-V4.6 版本加入了简单的团队功能，可以邀请其他用户进来管理资源。该版本升级后无法执行旧的升级脚本，且无法回退。
+# 1. 更新镜像并变更配置文件
 
-## 1. 更新镜像并变更配置文件
-
-更新镜像至 latest 或者 v4.6 版本。商业版镜像更新至 V0.2.
+更新镜像至 latest 或者 v4.6 版本。商业版镜像更新至 V0.2.1
 
 最新配置可参考: [V46版本最新 config.json](/docs/development/configuration)，商业镜像配置文件也更新，参考最新的飞书文档。
 
 
-## 2. 执行初始化 API
+# 2. 执行初始化 API
+
+发起 2 个 HTTP 请求（{{rootkey}} 替换成环境变量里的`rootkey`，{{host}}替换成自己域名）
 
-发起 1 个 HTTP 请求（{{rootkey}} 替换成环境变量里的`rootkey`，{{host}}替换成自己域名）
+**该初始化接口可能速度很慢，返回超时不用管，注意看日志即可。需要注意的是，需确保 initv46 成功后，再执行 initv46-2。**
 
 1. https://xxxxx/api/admin/initv46
 
@@ -30,16 +30,25 @@ curl --location --request POST 'https://{{host}}/api/admin/initv46' \
 --header 'Content-Type: application/json'
 ```
 
+2. https://xxxxx/api/admin/initv46-2
+
+```bash
+curl --location --request POST 'https://{{host}}/api/admin/initv46-2' \
+--header 'rootkey: {{rootkey}}' \
+--header 'Content-Type: application/json'
+```
+
 初始化内容：
 1. 创建默认团队
 2. 初始化 Mongo 所有资源的团队字段
 3. 初始化 Pg 的字段
+4. 初始化 Mongo Data
 
-**该初始化接口可能速度很慢，返回超时不用管，注意看日志即可**
-
-
-## 功能介绍
 
-### Fast GPT V4.6
+# V4.6功能介绍
 
 1. 新增 - 团队空间
+2. 新增 - 多路向量（多个向量映射一组数据）
+3. 新增 - tts语音
+4. 线上环境新增 - ReRank向量召回，提高召回精度
+5. 优化 - 知识库导出，可直接触发流下载，无需等待转圈圈
diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts
@@ -0,0 +1,131 @@
+import { getErrText } from '../error/utils';
+import { countPromptTokens } from './tiktoken';
+
+/**
+ * Split a text into chunks of roughly `maxLen` size.
+ *
+ * The text is split recursively with ever finer separators: markdown headings
+ * (`#` to `####`), blank lines, single newlines, sentence-ending punctuation and
+ * finally commas. When no separator is left the text is cut into fixed-size slices.
+ *
+ * Sizes are checked with `text.length` for the early exit and the final slicing,
+ * and with `countPromptTokens` while pieces are being merged.
+ *
+ * @param props.text - text to split
+ * @param props.maxLen - size of one chunk (original note: max 3500)
+ * @param props.overlapLen - size of the text shared by neighbouring chunks; must be
+ *   smaller than `maxLen`. Defaults to 20% of `maxLen`.
+ * @returns `chunks` and `tokens`, the sum of `countPromptTokens(chunk, 'system')`
+ *   over all chunks
+ * @throws Error built from `getErrText(err)` when splitting fails
+ */
+export const splitText2Chunks = (props: { text: string; maxLen: number; overlapLen?: number }) => {
+  const { text = '', maxLen, overlapLen = Math.floor(maxLen * 0.2) } = props;
+  const tempMarker = 'SPLIT_HERE_SPLIT_HERE';
+
+  // Separators, tried from coarse to fine. Every pattern must capture the whole
+  // separator in group 1: the replacement below re-inserts it through `$1`, and a
+  // pattern without a group would write a literal "$1" and lose the separator.
+  const stepReg: Record<number, RegExp> = {
+    0: /^(#\s[^\n]+)\n/gm,
+    1: /^(##\s[^\n]+)\n/gm,
+    2: /^(###\s[^\n]+)\n/gm,
+    3: /^(####\s[^\n]+)\n/gm,
+
+    4: /(\n\n)/g,
+    5: /([\n])/g,
+    // The former `(?!<[^a-zA-Z])` prefix was a lookahead that could never fail
+    // directly in front of `\.`, so dropping it keeps the matched set unchanged.
+    6: /([。]|\.\s)/g,
+    7: /([！？]|!\s|\?\s)/g,
+    8: /([；]|;\s)/g,
+    9: /([，]|,\s)/g
+  };
+
+  const splitTextRecursively = ({
+    text = '',
+    step,
+    lastChunk,
+    overlayChunk
+  }: {
+    text: string;
+    step: number;
+    lastChunk: string;
+    overlayChunk: string;
+  }) => {
+    // NOTE(review): an inherited `lastChunk` is not part of this early result, nor
+    // of the fixed-size slices below - confirm whether callers can lose it.
+    if (text.length <= maxLen) {
+      return [text];
+    }
+    const reg = stepReg[step];
+    // heading steps put the marker in front of the match, all others behind it
+    const isMarkdownSplit = step < 4;
+
+    if (!reg) {
+      // No separator left: cut fixed-size slices. An overlap outside (0, maxLen)
+      // would give a zero/negative stride (endless loop) or leave gaps, so it is
+      // ignored in that case.
+      const overlap = overlapLen > 0 && overlapLen < maxLen ? overlapLen : 0;
+      const stride = Math.max(maxLen - overlap, 1);
+      const chunks: string[] = [];
+      for (let i = 0; i < text.length; i += stride) {
+        chunks.push(text.slice(i, i + maxLen));
+      }
+      return chunks;
+    }
+
+    // split text by special char
+    const splitTexts = text
+      .replace(reg, isMarkdownSplit ? `${tempMarker}$1` : `$1${tempMarker}`)
+      .split(`${tempMarker}`)
+      .filter((part) => part);
+
+    let chunks: string[] = [];
+    for (let i = 0; i < splitTexts.length; i++) {
+      const part = splitTexts[i];
+      let chunkToken = countPromptTokens(lastChunk, '');
+      const textToken = countPromptTokens(part, '');
+
+      // next chunk is too large / new chunk is too large(The current chunk must be smaller than maxLen)
+      if (textToken >= maxLen || chunkToken + textToken > maxLen * 1.4) {
+        // last chunk is too large, push it to chunks, not add to next chunk
+        if (chunkToken > maxLen * 0.7) {
+          chunks.push(lastChunk);
+          lastChunk = '';
+          overlayChunk = '';
+        }
+        // chunk is small, insert to next chunks
+        const innerChunks = splitTextRecursively({
+          text: part,
+          step: step + 1,
+          lastChunk,
+          overlayChunk
+        });
+        if (innerChunks.length === 0) continue;
+        chunks = chunks.concat(innerChunks);
+        lastChunk = '';
+        overlayChunk = '';
+        continue;
+      }
+
+      // size less than maxLen, push text to last chunk
+      lastChunk += part;
+      chunkToken += textToken; // Definitely less than 1.4 * maxLen
+
+      // size over lapLen, push it to next chunk
+      if (
+        overlapLen !== 0 &&
+        !isMarkdownSplit &&
+        chunkToken >= maxLen - overlapLen &&
+        textToken < overlapLen
+      ) {
+        overlayChunk += part;
+      }
+      if (chunkToken >= maxLen) {
+        chunks.push(lastChunk);
+        lastChunk = overlayChunk;
+        overlayChunk = '';
+      }
+    }
+
+    /*
+      Flush the pending chunk unless it is only the overlap tail of the chunk that
+      was pushed last. It must also be flushed when nothing has been pushed yet,
+      otherwise a text that never reaches maxLen tokens is dropped completely.
+    */
+    const tailChunk = chunks[chunks.length - 1];
+    if (lastChunk && (tailChunk === undefined || !tailChunk.endsWith(lastChunk))) {
+      chunks.push(lastChunk);
+    }
+
+    return chunks;
+  };
+
+  try {
+    const chunks = splitTextRecursively({ text, step: 0, lastChunk: '', overlayChunk: '' });
+
+    const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
+
+    return {
+      chunks,
+      tokens
+    };
+  } catch (err) {
+    throw new Error(getErrText(err));
+  }
+};
diff --git a/...c/global/common/tiktoken/cl100k_base.json → ...l/common/string/tiktoken/cl100k_base.json b/...c/global/common/tiktoken/cl100k_base.json → ...l/common/string/tiktoken/cl100k_base.json
diff --git a/...s/app/src/global/common/tiktoken/index.ts → ...es/global/common/string/tiktoken/index.ts b/...s/app/src/global/common/tiktoken/index.ts → ...es/global/common/string/tiktoken/index.ts
@@ -1,8 +1,8 @@
 /* Only the token of gpt-3.5-turbo is used */
-import type { ChatItemType } from '@fastgpt/global/core/chat/type';
+import type { ChatItemType } from '../../../core/chat/type';
 import { Tiktoken } from 'js-tiktoken/lite';
-import { adaptChat2GptMessages } from '@/utils/common/adapt/message';
-import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constant';
+import { adaptChat2GptMessages } from '../../../core/chat/adapt';
+import { ChatCompletionRequestMessageRoleEnum } from '../../../core/ai/constant';
 import encodingJson from './cl100k_base.json';
 
 /* init tikToken obj */
@@ -55,17 +55,6 @@ export function countMessagesTokens({ messages }: { messages: ChatItemType[] })
   return totalTokens;
 }
 
-export function sliceTextByTokens({ text, length }: { text: string; length: number }) {
-  const enc = getTikTokenEnc();
-
-  try {
-    const encodeText = enc.encode(text);
-    return enc.decode(encodeText.slice(0, length));
-  } catch (error) {
-    return text.slice(0, length);
-  }
-}
-
 /* slice messages from top to bottom by maxTokens */
 export function sliceMessagesTB({
   messages,

diff --git a/packages/global/common/string/tiktoken/type.d.ts b/packages/global/common/string/tiktoken/type.d.ts
@@ -0,0 +1,5 @@
+import type { Tiktoken } from 'js-tiktoken';
+
+/*
+  Declares a global variable `TikToken` of type `Tiktoken`.
+  NOTE(review): presumably holds the shared encoder instance created in
+  tiktoken/index.ts - confirm there.
+*/
+declare global {
+  var TikToken: Tiktoken;
+}
diff --git a/packages/global/common/string/tools.ts b/packages/global/common/string/tools.ts
@@ -1,13 +1,15 @@
 import crypto from 'crypto';
 
+/*
+  Check whether a string looks like a link.
+  Matches (case-insensitively) values that start with `http://`, `https://`,
+  `://`, `www.` or `/`; the prefix must be followed by a character other than
+  whitespace, `/`, `$`, `.`, `?`, `#` and by at least one more character.
+  Returns false for empty or undefined input.
+*/
 export function strIsLink(str?: string) {
   if (!str) return false;
   if (/^((http|https)?:\/\/|www\.|\/)[^\s/$.?#].[^\s]*$/i.test(str)) return true;
   return false;
 }
 
-export const hashStr = (psw: string) => {
-  return crypto.createHash('sha256').update(psw).digest('hex');
+/* Hash a string: returns the SHA-256 digest of `str` as a hex string. */
+export const hashStr = (str: string) => {
+  return crypto.createHash('sha256').update(str).digest('hex');
 };
 
 /* simple text, remove chinese space and extra \n */
@@ -20,3 +22,16 @@ export const simpleText = (text: string) => {
 
   return text;
 };
+
+/**
+ * Replace every `{{key}}` placeholder in `text` with the matching value of `obj`.
+ *
+ * Keys and values are treated literally: regex metacharacters in a key and `$`
+ * sequences in a value (e.g. `$1`, `$&`) have no special meaning. Entries whose
+ * value is neither a string nor a number are skipped. Keys are applied in order,
+ * so a value that itself contains a placeholder of a later key is expanded too.
+ *
+ * @param text - template text
+ * @param obj - placeholder name to value map
+ * @returns the text with placeholders replaced, or `''` when the result is empty
+ */
+export function replaceVariable(text: string, obj: Record<string, string | number>) {
+  // `obj || {}` keeps a missing map a no-op, as the former `for...in` loop did
+  for (const [key, val] of Object.entries(obj || {})) {
+    if (typeof val !== 'string' && typeof val !== 'number') continue;
+
+    // split/join replaces all occurrences without any regex or `$` interpretation
+    text = text.split(`{{${key}}}`).join(String(val));
+  }
+  return text || '';
+}
diff --git a/packages/global/core/ai/api.d.ts b/packages/global/core/ai/api.d.ts
@@ -0,0 +1,5 @@
+/*
+  Request body of a re-rank call: one query plus the candidate texts, each
+  identified by an `id`.
+  NOTE(review): presumably the candidates are ranked against `query` - confirm
+  with the re-rank service.
+*/
+export type PostReRankProps = {
+  query: string;
+  inputs: { id: string; text: string }[];
+};
+/*
+  Response of a re-rank call: a list of `{ id, score }` items.
+  NOTE(review): presumably `id` refers to an entry of `PostReRankProps.inputs` - confirm.
+*/
+export type PostReRankResponse = { id: string; score: number }[];
diff --git a/...cts/app/src/utils/common/adapt/message.ts → packages/global/core/chat/adapt.ts b/...cts/app/src/utils/common/adapt/message.ts → packages/global/core/chat/adapt.ts
@@ -1,7 +1,7 @@
-import type { ChatItemType } from '@fastgpt/global/core/chat/type.d';
-import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants';
-import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constant';
-import type { ChatMessageItemType } from '@fastgpt/global/core/ai/type.d';
+import type { ChatItemType } from '../../core/chat/type.d';
+import { ChatRoleEnum } from '../../core/chat/constants';
+import { ChatCompletionRequestMessageRoleEnum } from '../../core/ai/constant';
+import type { ChatMessageItemType } from '../../core/ai/type.d';
 
 const chat2Message = {
   [ChatRoleEnum.AI]: ChatCompletionRequestMessageRoleEnum.Assistant,

diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts
@@ -0,0 +1,20 @@
+import { DatasetDataIndexItemType } from './type';
+
+/* ================= dataset ===================== */
+
+/* ================= collection ===================== */
+
+/* ================= data ===================== */
+/*
+  Raw search hit: string identifiers plus a numeric score.
+  NOTE(review): the `Pg` prefix and snake_case fields suggest this mirrors a
+  Postgres result row - confirm against the query that produces it.
+*/
+export type PgSearchRawType = {
+  id: string;
+  team_id: string;
+  tmb_id: string;
+  collection_id: string;
+  data_id: string;
+  score: number;
+};
+/*
+  One data chunk to push into a dataset. `a` and `indexes` are optional; each
+  index entry is a `DatasetDataIndexItemType` without its `dataId` field.
+*/
+export type PushDatasetDataChunkProps = {
+  q: string; // embedding content
+  a?: string; // bonus content
+  indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
+};