### Дашборд для вектороной БД
Еще один способ проанализировать данные, а именно - со строны сформированной векторной БД (metadata в качестве полей фильтрации).


In [3]:
# импорт библиотек
import pandas as pd

from dash import Dash, dcc, html, Input, Output

from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

import plotly.express as px
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams

### Интерактивные Дашборды

In [10]:
# Подключение к Qdrant
client = QdrantClient(url="http://192.168.137.253:6333")
collection_name = "documents_collection"

# Инициализация модели SentenceTransformer
model_name = "deepvk/USER-bge-m3"
model = SentenceTransformer(model_name)

# Функция для извлечения данных из Qdrant
def fetch_data():
    scroll_result = client.scroll(
        collection_name=collection_name,
        scroll_filter=None,
        limit=100  # Ограничение на количество точек
    )
    
    points, _ = scroll_result  # Получаем список точек
    data = []
    for point in points:
        payload = point.payload
        data.append({
            "id": point.id,
            "content": payload.get("content"),
            "start_page": payload.get("start_page"),
            "end_page": payload.get("end_page"),
            "basin": payload.get("basin"),
            "Region": payload.get("file"),
            "level": payload.get("level"),
            "vector": point.vector
        })
    return pd.DataFrame(data)

# Генерация векторов для точек, у которых вектора отсутствуют
def generate_and_upload_vectors():
    points_to_update = []
    for point in client.scroll(collection_name=collection_name, limit=100)[0]:
        payload = point.payload
        if point.vector is None and payload.get("content"):
            text = payload["content"]
            vector = model.encode(text).tolist()
            points_to_update.append({
                "id": point.id,
                "vector": vector
            })

    if points_to_update:
        client.upsert(collection_name=collection_name, points=points_to_update)

# Проверяем и обновляем вектора
generate_and_upload_vectors()

# Извлечение данных
df = fetch_data()

# Проверка данных
print("Пример данных:")
print(df.head())

# Создание Dash приложения
app = Dash(__name__)

app.layout = html.Div([
    html.H1("Qdrant Dashboard"),
    
    # Добавляем Dropdown для фильтрации по бассейнам
    dcc.Dropdown(
        id="basin-filter",
        options=[{"label": row["basin"], "value": row["basin"]} for _, row in df.iterrows()],
        multi=True,
        placeholder="Фильтруйте по бассейну"
    ),
    
    # Добавляем Dropdown для фильтрации по уровням
    dcc.Dropdown(
        id="level-filter",
        options=[{"label": str(level), "value": level} for level in df['level'].unique()],
        multi=True,
        placeholder="Фильтруйте по уровню"
    ),
    
    # Добавляем Dropdown для фильтрации по файлам
    dcc.Dropdown(
        id="file-filter",
        options=[{"label": row["Region"], "value": row["Region"]} for _, row in df.iterrows()],
        multi=True,
        placeholder="Фильтруйте по файлу"
    ),
    
    # График для количества документов по бассейнам
    dcc.Graph(id="filtered-doc-count"),
    
    # График для распределения длин текстов
    dcc.Graph(id="content-length-dist"),
    
    # График для распределения количества страниц
    dcc.Graph(id="page-count-dist"),
])

@app.callback(
    Output("filtered-doc-count", "figure"),
    [Input("basin-filter", "value"),
     Input("level-filter", "value"),
     Input("file-filter", "value")]
)
def update_filtered_doc_count(selected_basins, selected_levels, selected_files):
    filtered_df = df
    if selected_basins:
        filtered_df = filtered_df[filtered_df["basin"].isin(selected_basins)]
    if selected_levels:
        filtered_df = filtered_df[filtered_df["level"].isin(selected_levels)]
    if selected_files:
        filtered_df = filtered_df[filtered_df["Region"].isin(selected_files)]
    
    filtered_stats = filtered_df.groupby("basin").size().reset_index(name='Document_Count')
    return px.bar(filtered_stats, x="basin", y="Document_Count", title="Количество документов по бассейнам")

@app.callback(
    Output("content-length-dist", "figure"),
    [Input("basin-filter", "value"),
     Input("level-filter", "value"),
     Input("file-filter", "value")]
)
def update_content_length_dist(selected_basins, selected_levels, selected_files):
    filtered_df = df
    if selected_basins:
        filtered_df = filtered_df[filtered_df["basin"].isin(selected_basins)]
    if selected_levels:
        filtered_df = filtered_df[filtered_df["level"].isin(selected_levels)]
    if selected_files:
        filtered_df = filtered_df[filtered_df["Region"].isin(selected_files)]
    
    filtered_df['Content_Length'] = filtered_df['content'].apply(lambda x: len(x) if x else 0)
    return px.histogram(filtered_df, x="Content_Length", nbins=30, title="Распределение длин текстов")

@app.callback(
    Output("page-count-dist", "figure"),
    [Input("basin-filter", "value"),
     Input("level-filter", "value"),
     Input("file-filter", "value")]
)
def update_page_count_dist(selected_basins, selected_levels, selected_files):
    filtered_df = df
    if selected_basins:
        filtered_df = filtered_df[filtered_df["basin"].isin(selected_basins)]
    if selected_levels:
        filtered_df = filtered_df[filtered_df["level"].isin(selected_levels)]
    if selected_files:
        filtered_df = filtered_df[filtered_df["Region"].isin(selected_files)]
    
    filtered_df['Page_Count'] = filtered_df['end_page'] - filtered_df['start_page'] + 1
    return px.histogram(filtered_df, x="Page_Count", nbins=30, title="Распределение количества страниц")

app.run_server(debug=True, port=8051)


Пример данных:
                                     id  \
0  00107d59-a173-4c07-9ed5-ef57e347e719   
1  0026eb60-ff4d-46fd-9430-a76244d5d5de   
2  002be76a-4750-4264-8cff-76ee866e5195   
3  00a1c933-c434-48fe-b50c-7f0a21e5bba0   
4  00d50f92-fe0e-4acb-892a-edf847ef4896   

                                             content  start_page  end_page  \
0  Продолжение рисунка 4.2 108 30 31 32 33 34 35 ...         105       117   
1  4 0.24 0.24 0.24 0.24 0.24 0.24 Фб 2.40 Монито...         258       258   
2  30 2.4 Характерные особенности антропогенной т...          30        37   
3  Введение В Книге 3 «Схемы комплексного использ...           4         4   
4  ,60 104,70 105,60 105,90 104,10 104,00 103,90 ...          30        40   

    basin        Region  level vector  
0     Дон  Книга_4.json      2   None  
1     Дон  Книга_6.json      3   None  
2  Кубань  Книга_2.json      2   None  
3  Печора  Книга_3.json      1   None  
4    Сура  Книга_1.json      2   None  




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/