# 数据处理

In [None]:
import os
# Point both HTTP and HTTPS traffic at the proxy listening on 127.0.0.1:1083.
# These are set before the installs below so that processes started from this
# kernel see them in their environment.
os.environ["http_proxy"] = "http://127.0.0.1:1083"
os.environ["https_proxy"] = "http://127.0.0.1:1083"

# IPython shell escapes ("!"): install the listed packages one at a time.
! pip install langchain_community
! pip install langchain
! pip install transformers
! pip install unstructured
! pip install markdown
! pip install markdownify
! pip install sentence-transformers
! pip install clickhouse-connect
# -U upgrades langchain_qdrant if it is already installed.
! pip install -U langchain_qdrant

## 查看目录中文档的最大长度

In [None]:
import os

root_dir = "/root/jd_docs"

# Largest file size found anywhere below root_dir (symlinked directories are
# not followed); stays 0 when the walk yields no files at all.
size = max(
    (
        os.path.getsize(os.path.join(folder, name))
        for folder, _subdirs, names in os.walk(root_dir, followlinks=False)
        for name in names
    ),
    default=0,
)
size

## 查看目录中json content字段最大长度

In [None]:
import os
import json

root_dir = "/root/jd_docs"

# Length of the longest "content" value seen so far across all files.
size = 0
for root, dirs, files in os.walk(root_dir, followlinks=False):
    for file in files:
        path = os.path.join(root, file)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # machine's locale, and release the handle as soon as parsing is done.
        with open(path, 'r', encoding="utf-8") as docfile:
            doc_data = json.load(docfile)
        content = doc_data["content"]
        content_length = len(content)
        if size < content_length:
            # Show the type of each new record holder (helps spot non-string content).
            print(type(content))
            size = content_length

size

## 去标签实验

In [None]:
from markdownify import markdownify as md
import re


def is_html(content):
    """Report whether *content* looks like HTML.

    The check is purely positional: the text counts as HTML only when its
    very first character is ``<``.
    """
    return content.startswith('<')


def replace_unicode(match):
    """Turn a regex match whose first group is a hex code point into that character."""
    hex_digits = match.group(1)
    return chr(int(hex_digits, 16))


# Alternative sample input: "/root/jd_docs/account-assets/withdrawal.md"
# The context manager closes the file even if a later step raises, and the
# explicit encoding keeps decoding independent of the machine's locale.
with open("/root/jd_docs/application-load-balancer/TLSSecurityPolicy-management.md", encoding="utf-8") as f:
    lines = f.read()


# Replace literal \uXXXX escape sequences with the characters they encode.
result = re.sub(r"\\[uU]([0-9a-fA-F]{4})", replace_unicode, lines)


# If the text is HTML, convert it to markdown first.
if is_html(result):
    result = md(result)
    print(result)

# Replace every remaining HTML tag with a single space.
pattern = re.compile(r'<[^>]+>', re.S)
result = pattern.sub(' ', result)

# Drop literal backslash-t / backslash-n sequences (two-character escapes in
# the text, not real tab or newline characters) before printing.
print(result.replace('\\t', '').replace('\\n', ''))

## 文本转markdown

In [None]:
from markdownify import markdownify

# Run markdownify on a sample answer that carries markdown-style markers (** and backticks) to inspect what it returns.
markdownify("要对 MySQL 进行性能测试，可以按照以下步骤进行：1. **准备环境**：   - 安装必要的工具，如 `sysbench`，用于执行性能测试。   - 创建云数据库实例，如华北-北京可用区A的 MySQL 5.7 服务器。   - 选择合适的云主机规格和镜像，如 8C 32GB 的 CentOS 7.4 64位。   - 为数据库实例创建一个名为 `sbtest` 的库，并为创建的账号授予新建库的读写权限。   - 使用 `sysbench` 创建一个包含表结构、数据格式和 SQL 样式的测试样本。2. **测试环境设置**：   - 在 `sysbench` 中，设置数据库实例的参数，如最大连接数（最大 IOPS）、表大小、数据库驱动、MySQL 服务器地址、用户名、密码等。   - 设置测试用例的参数，如表结构、数据量大小、客户端数量、查询类型（如 get、set、lpush、mset 等）、查询时间（如 1800 秒）和并发数（如 32 个线程）。3. **性能压测**：   - 使用 `sysbench` 创建一个包含表结构、数据格式和 SQL 样式的测试样本，并设置测试参数。   - 使用 `sysbench` 进行性能压测，包括查询、更新和删除操作，以生成数据负载并进行压力测试。   - 使用 `sysbench` 输出查询时间、并发数、TPS 和 QPS 的统计信息，以评估数据库性能。4. **环境清理**：   - 清理测试环境，包括删除测试样本、关闭数据库实例、删除 Redis 服务器等。5. **性能指标测试**：   - 使用 `redis-benchmark` 测试 Redis 性能，包括获取、设置、LPush、MSet（10 keys）、SADD 等操作的性能指标。   - 根据测试结果，评估 Redis 性能瓶颈，如查询时间、并发数、TPS 和 QPS 的瓶颈，以及数据库性能瓶颈。通过以上步骤，可以对 MySQL 进行性能测试，评估其在高并发、大量数据和复杂查询场景下的性能表现，为优化 MySQL 服务器配置和性能优化提供参考。")

## 文本分割，拆分为定长文本


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
import os
os.environ["http_proxy"] = "http://127.0.0.1:1083"
os.environ["https_proxy"] = "http://127.0.0.1:1083"

root_dir = "/root/jd_docs"

# Load every file matching **/*.json below root_dir, using TextLoader for each one.
loader = DirectoryLoader(root_dir, glob="**/*.json", loader_cls=TextLoader)
docs = loader.load()

# Preview the first two loaded documents.
for new_doc in docs[:2]:
    print(new_doc)

## 解析自定义json到docs

In [1]:
# Add the parent directory to the module search path so custom modules can be imported
import sys
sys.path.append("..")

In [2]:
from libs.jd_doc_json_loader import JD_DOC_Loader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import DirectoryLoader


root_dir = "/root/jd_docs"

# Load every file matching **/*.json below root_dir, using the project's JD_DOC_Loader for each one.
loader = DirectoryLoader(root_dir, glob="**/*.json", loader_cls=JD_DOC_Loader)
docs = loader.load()

# Preview the first two loaded documents.
print(docs[:2])



### Split docs with MarkdownHeaderTextSplitter

In [5]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, MarkdownTextSplitter
# Split on level 1-3 headings. Each tuple is (heading prefix, metadata key);
# the key repeats the prefix so the heading line can be rebuilt below as
# key + value.
headers_to_split_on = [
    ("#", "#"),
    ("##", "##"),
    ("###", "###"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on)

# One new Document per heading-delimited chunk. The source documents in
# `docs` are left untouched: assigning to an alias of `doc` would overwrite
# its page_content with each successive chunk and keep none of the chunks.
chunked_documents = []
for doc in docs:
    for chunk in markdown_splitter.split_text(doc.page_content):
        # Re-attach the headings the splitter moved into the chunk metadata.
        head = "".join(f"{k}{v}\n" for k, v in chunk.metadata.items())
        new_doc = type(doc)(
            page_content=head + chunk.page_content,
            # Copy so chunks do not share one metadata dict with the source.
            metadata=dict(doc.metadata),
        )
        chunked_documents.append(new_doc)
        print(new_doc)
        print("-------------------------------------------")

page_content='###示例说明\n* 通过请求对象:\n[http://cdn.example.com/video/standard/1K.html?fa=121&jd=121](\\"http://cdn.example.com/video/standard/1K.html?fa=121&jd=121\\")\n* 密钥设为：jdcloud1234 (由用户自行设置)\n* 鉴权配置文件失效日期为：2020年06月18日00:00:00,计算出来的秒数为1592409600\n* 则CDN服务器会构造一个用于计算signature的签名字符\n/video/standard/1K.html-1592409600-0-0-jdcloud1234\n* CDN服务器会根据该签名字符串计算signature:\nsignature = md5sum(\\"/video/standard/1K.html-1592409600-0-0-jdcloud1234\\") =06d97bc9e43ded48d991994006cfa127\n* 则请求时url为：\n[http://cdn.example.com/video/standard/1K.html?fa=121&jd=121&auth\\_token=1592409600-0-0-06d97bc9e43ded48d991994006cfa127](\\"http://cdn.example.com/video/standard/1K.html?fa=121&jd=121&auth_token=1592409600-0-0-06d97bc9e43ded48d991994006cfa127\\")\n计算出来的 signature 与用户请求中带的 signature =06d97bc9e43ded48d991994006cfa127值一致，于是鉴权通过。' metadata={'source': 'https://docs.jdcloud.com/cn/live-video/api/playurl', 'title': '播放鉴权规则', 'product': '视频直播'}
-------------------------------------------
-----------------------

## 存入clickhouse向量数据库

In [6]:
import langchain_community.vectorstores.clickhouse as clickhouse
from langchain.embeddings import HuggingFaceEmbeddings
import os
os.environ["http_proxy"] = "http://127.0.0.1:1083"
os.environ["https_proxy"] = "http://127.0.0.1:1083"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name="/root/models/moka-ai-m3e-large", model_kwargs=model_kwargs)

# embed_documents takes a list of texts and returns one vector per text.
# Passing a bare string makes it iterate over the string character by
# character, so all page contents are handed over as one list and the vectors kept.
doc_vectors = embeddings.embed_documents([doc.page_content for doc in docs])


# Disabled follow-up: write the documents into ClickHouse. The password is
# read from the environment here rather than written into the notebook.
# settings = clickhouse.ClickhouseSettings(
#     table="jd_docs_m3e_with_url", username="default",
#     password=os.environ["CLICKHOUSE_PASSWORD"], host="10.0.1.94")

# docsearch = clickhouse.Clickhouse.from_documents(
#     docs, embeddings, config=settings)

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name /root/models/moka-ai-m3e-large. Creating a new one with MEAN pooling.


TypeError: 'HuggingFaceEmbeddings' object is not callable

## 验证ck向量库

In [None]:
import langchain_community.vectorstores.clickhouse as clickhouse
from langchain.embeddings import HuggingFaceEmbeddings

model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name="/root/models/moka-ai-m3e-large", model_kwargs=model_kwargs)

# NOTE(review): the database password is hard-coded in the notebook; move it
# to an environment variable or secret store and rotate it before sharing.
settings = clickhouse.ClickhouseSettings(
    table="jd_docs_m3e_with_url", username="default", password="Git785230", host="10.0.1.94")
ck_db = clickhouse.Clickhouse(embeddings, config=settings)

# Only return documents whose similarity score reaches the 0.9 threshold.
ck_retriever = ck_db.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.9})
ck_retriever.get_relevant_documents("阿里云好不好")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
import clickhouse_connect

model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name="/root/models/moka-ai-m3e-large", model_kwargs=model_kwargs)

# Embed the probe question and render the vector as a comma-separated list
# that is spliced into the SQL array literal below.
query_vector = embeddings.embed_query("对象存储api如何使用")
q_emb_str = ",".join(str(component) for component in query_vector)

# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment instead.
client = clickhouse_connect.get_client(
    host='10.0.1.94', username='default', password='Git785230')

# Three rows ordered by cosine distance to the query vector; `dist` is the
# alias declared in the ORDER BY clause.
q_str = f"""
        SELECT document,metadata,dist
        FROM default.jd_docs_m3e_with_url where dist < 20
         ORDER BY cosineDistance(embedding, [{q_emb_str}])
            AS dist 
        LIMIT 3 
        """

query_result = client.query(q_str)

# Each row is (document, metadata, dist); only the distance is printed.
for _document, _metadata, dist in query_result.result_rows:
    print(dist)

## 使用qdrant向量库

In [None]:
from langchain_qdrant import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name="/root/models/moka-ai-m3e-large",
    model_kwargs=model_kwargs,
)

url = "http://10.0.1.94:6333"

# Embed `docs` with the model above and store them in the "my_documents"
# collection on the Qdrant server at `url`.
qdrant = Qdrant.from_documents(
    documents=docs,
    embedding=embeddings,
    url=url,
    prefer_grpc=True,
    collection_name="my_documents",
)

In [None]:
from typing import List
from langchain_qdrant import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings

model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name="/root/models/moka-ai-m3e-large",
    model_kwargs=model_kwargs,
)

url = "http://10.0.1.94:6333"

# Attach to the already-populated "my_documents" collection instead of re-ingesting.
qdrant = Qdrant.from_existing_collection(
    embedding=embeddings,
    collection_name="my_documents",
    url=url,
)