In [1]:
from config import api_key_qwen
from aiModel import QwenModel
from ai import call_model


def table_sum_agent(structure,samples):
    tools = [
        {
        "type": "function",
        "function": {
            "name": "save_summary",
            "description": "将总结信息保存到数据库中。包对对整个表格以及每一列字段的总结。",
            "parameters": {
                "type": "object",
                "properties": {
                    "table_summary": {
                        "description": "对表格功能用户的一句话总结",
                        "type": "string"
                    },
                        "column_summaries": {
                        "description": """
                        对每一个字段的一个总结，{
                            "column1_name": "Summary for column 1 based on the samples and structure.",
                            "column2_name": "Summary for column 2 based on the samples and structure."}
                            """,
                        "type": "dict"
                    }
                },
                "required": ["table_summary"]
            },
        },
        }
    ]

    role_setting = '''
    Prompt: "根据表格的结构和样本条目，总结表格的整体用途和每列存储的信息类型。为表格的总体用途生成一句详尽的总结，并为每一列创建一个详尽的总结字典。使用'save_summary'函数保存这些总结。"


任务:
1. 分析表格结构，确定其预定功能。
2. 查看样本条目，了解每一列存储的数据类型及其对表格功能的贡献。
3. 生成表格目的的详尽单句总结。
4. 创建一个字典，为每一列生成总结，说明基于结构和样本的信息类型。
5. 使用'save_summary'工具将生成的总结保存到数据库中。

工具:
- save_summary: 将生成的总结保存到数据库中的函数。该函数需要表格的一句话总结和每一列的总结字典。

预期输出:
- 表格整体的一句话总结。
- 包含每列总结的字典。
- 成功执行'save_summary'函数以保存这些总结。

确保总结的准确性和相关性以符合所提供的结构和样本数据，并遵守使用'save_summary'函数的所需格式和详细要求。

      '''
    messages = [{"role": "system","content": role_setting}]
    messages.append({"role": "user","content": '数据表结构是：' + structure})
    messages.append({"role": "user","content": '表格样本条目是：' + samples})
    
    model_name = "qwen-plus" 
    response = QwenModel(api_key=api_key_qwen, model=model_name, temperature=0.2,tools=tools)
    total_usage = 0
    call_model(response, messages)
    messages.append(response.message_to_append)
    total_usage += response.total_tokens

    max_loop_count = 10
    loop_count = 0
    while loop_count < max_loop_count and response.tool_calls:
        args = response.function_args

        if response.function_name == "save_summary":
            try:
                # function_result = save_summary(**json.loads(args))
                return args
            except Exception as e:
                function_result = {"error": str(e)}

        messages.append({
            "role": "tool",
            "content": f"{function_result}",
            "tool_call_id":response.tool_calls['id']
        })
        
        call_model(response, messages)
        total_usage += response.total_tokens
        messages.append(response.message_to_append)
    return response.content


In [2]:
import mysql.connector
import json

# 建立数据库连接
connection = mysql.connector.connect(
    host="gz-cdb-5scrcjb5.sql.tencentcdb.com",
    user="db",
    password="dbdb905905",
    database="sele",
    port=63432
)

def get_tables(cursor):
    cursor.execute("SHOW TABLES")
    return [table[0] for table in cursor.fetchall()]

def get_table_structure(cursor, table_name):
    cursor.execute(f"SHOW CREATE TABLE {table_name}")
    return cursor.fetchone()[1]

def get_sample_data(cursor, table_name):
    cursor.execute(f"SELECT * FROM {table_name} LIMIT 5")
    rows = cursor.fetchall()
    # 将行数据转换成字符串，以便传递给AI分析器
    sample_str = "\n".join([str(row) for row in rows])
    return sample_str



In [4]:


def main():
    cursor = connection.cursor()
    tables = get_tables(cursor)
    results = []

    for table in tables:
        structure = get_table_structure(cursor, table)
        samples = get_sample_data(cursor, table)
        response = table_sum_agent(structure, samples)
        table_summary, columns_summary = table_sum_agent(**json.loads(response))
        results.append({
            "table": table,
            "summary": table_summary,
            "columns": columns_summary
        })

    cursor.close()
    connection.close()

    # 将结果保存为JSON文件
    with open('database_summary.json', 'w') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

# if __name__ == "__main__":
#     main()


In [44]:
from langchain.vectorstores import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
import json

# 假设已经有一个包含数据库所有表及其字段摘要的JSON文件
with open('database_summary.json', 'r') as f:
    database_summary = json.load(f)

# 提取字段总结和对应的字段名
field_summaries = []
metadata_list = []



In [70]:
for table in database_summary:
    # 对表级总结的处理
    field_summaries.append(table["summary"])
    metadata_list.append({"field_name": "", "table_name": table["table"]})

    # 对字段级总结的处理
    for column_name, column_summary in table["columns"].items():
        field_summaries.append(column_summary)  # 直接使用字段摘要字符串
        metadata_list.append({"field_name": column_name, "table_name": table["table"]})

# 初始化 DashScope 嵌入模型
embeddings = DashScopeEmbeddings(
    model="text-embedding-v1", dashscope_api_key="sk-cbcc1fb859b1456885a270eecbec6369"
)


In [71]:
import numpy as np
# 对每个总结进行嵌入
vecs = embeddings.embed_documents(field_summaries)

In [77]:
print(len(field_summaries))

186


In [78]:
print(len(vecs))
print(type(vecs))
print(type(vecs[0]))
print(len(vecs[0]))
print(type(vecs[0][0]))

186
<class 'list'>
<class 'list'>
1536
<class 'float'>


In [79]:
# 初始化向量存储,并添加嵌入向量和元数据
vector_store = Chroma.from_texts(
    texts=field_summaries, 
    embedding=vecs, 
    metadatas=metadata_list
    )


AttributeError: 'list' object has no attribute 'embed_documents'

In [None]:

# 搜索
query = "我需要找到存储客户地址的字段"
results = vector_store.similarity_search(query, k=2)

print(f"相关字段名:")
for result in results:
    metadata = result.metadata
    if metadata["field_name"]:  # 确保不显示表级总结
        print(f"{metadata['table_name']}.{metadata['field_name']}")  # 显示表名和字段名


In [66]:
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# 初始化嵌入模型
embeddings = OpenAIEmbeddings(openai_api_key="sk-proj-CmkZhJbSwXJv3FNYMv49T3BlbkFJHpq8bV7zuQOpe7ikfiSN")

# 准备文本数据
texts = [
    "This is the first sentence.",
    "Here is another sentence.",
    "One more sentence for the example."
]

# 获取文本嵌入向量
vecs = embeddings.embed_documents(texts)

# 初始化 Chroma 向量存储
vector_store = Chroma.from_texts(
    texts=texts,
    embedding=vecs,
    metadatas=[{"source": "example"}] * len(texts)
)

# 进行相似性搜索
query = "What is an example sentence?"
docs = vector_store.similarity_search(query)

print(docs[0].page_content)


AttributeError: 'list' object has no attribute 'embed_documents'

In [65]:
import chromadb
print(f"Chroma version: {chromadb.__version__}")


Chroma version: 0.5.0
