In [None]:
import os
import clickhouse_connect
from dotenv import load_dotenv


# qdrant_client = QdrantClient("http://127.0.0.1:6333")

# Load connection settings from a local .env file into the process environment.
load_dotenv()

# CLICKHOUSE_PORT is optional: when it is unset or empty, pass 0 so that
# clickhouse_connect falls back to its own default port instead of this cell
# crashing with `int(None)`.
_clickhouse_port = os.getenv("CLICKHOUSE_PORT")

# ClickHouse client shared by every query cell in this notebook.
ch_client = clickhouse_connect.get_client(
    host=os.getenv("CLICKHOUSE_HOST"),
    port=int(_clickhouse_port) if _clickhouse_port else 0,
    username=os.getenv("CLICKHOUSE_USERNAME"),
    password=os.getenv("CLICKHOUSE_PASSWORD"),
)


In [None]:
from config import EMBEDDING_MODEL_PATH
from emcache import OllamaBackend
from emseo.storage import VectorStoreEmbedding

# Embedding backend created from OllamaBackend with a localhost base URL.
backend = OllamaBackend(base_url="http://127.0.0.1:11434")
# Vector store built on that backend; "keywords" is passed as the collection prefix.
# NOTE(review): a later cell rebinds both `backend` and `storage` to a HuggingFace
# backend, so on a top-to-bottom run these two objects are replaced — confirm
# which backend is intended.
storage = VectorStoreEmbedding(backend, collection_prefix="keywords")
# Disabled: restoring the embedder's weights from a file under EMBEDDING_MODEL_PATH.
# storage.embedder.load_state_dict(
#     torch.load(EMBEDDING_MODEL_PATH / f"{storage.collection_name}.pt")
# )

In [None]:
from emcache.huggingface import HuggingFaceBackend
from emseo.storage import VectorStoreEmbedding


# Embedding backend built from the HuggingFace model name below.
# Previously tried: model_name="heydariAI/persian-embeddings".
backend = HuggingFaceBackend(model_name="intfloat/multilingual-e5-large")

# Vector store on top of that backend, with "keywords" as the collection prefix.
# Disabled option kept for reference: qdrant_url="http://185.8.172.121:6333/"
storage = VectorStoreEmbedding(backend, collection_prefix="keywords")


In [None]:
from qdrant_client import QdrantClient

# Qdrant endpoint: overridable through the QDRANT_URL environment variable so the
# server address is not pinned in the notebook; when the variable is unset the
# previously hard-coded address is used. `os` is imported in the first cell.
qdrant_client = QdrantClient(os.getenv("QDRANT_URL", "http://185.226.93.137:6333"))

In [None]:
# Name of the Qdrant collection used by the cells that call `qdrant_client` directly.
# NOTE(review): the name mentions "maux-gte-persian-v3", while the active backend
# cell uses "intfloat/multilingual-e5-large" — confirm this is the intended collection.
collection_name = "keywords_huggingface_xmanii_maux-gte-persian-v3"

In [None]:
from qdrant_client.models import Distance, PointStruct, QueryRequest, VectorParams


# Create the collection only when it is missing, so that re-running the notebook
# does not fail on an already existing collection.
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # NOTE(review): must equal the output dimension of the embedding
            # model whose vectors are stored here — confirm 768 is right.
            size=768,
            distance=Distance.COSINE,
        ),
    )

In [None]:
# TODO: `vectors` is still a placeholder (Ellipsis) — supply the real vectors
# before running this cell.
qdrant_client.upload_collection(
    collection_name=collection_name,
    vectors=...,
    # The argument was previously left empty, which made the whole cell a
    # SyntaxError. None attaches no payload; replace it with the payload records.
    payload=None,
)

In [None]:
# Display the collection information returned by Qdrant. The returned object is
# already the info record; the previous trailing `.info()` call was dropped
# because that record exposes no such method.
qdrant_client.get_collection(collection_name)

### Initialize Qdrant

In [None]:
from pathlib import Path
import uuid
import torch
import tqdm.auto as tqdm
from qdrant_client.http.models import PointStruct
from qdrant_client.http.exceptions import ApiException

import time

# Number of keywords sent to the vector store per insert call.
chunk_size = 128
# Retry policy for inserts: exponential backoff, capped, then give up.
max_insert_attempts = 10
max_backoff_seconds = 30.0

query_websites = Path("queries/website_list.sql").read_text(encoding="utf-8")
query_keywords = Path("queries/keywords_by_website.sql").read_text(encoding="utf-8")
websites = ch_client.query_df(query_websites)


def _add_texts_with_retry(texts):
    """Insert `texts` into the vector store, retrying on Qdrant API errors.

    Every failed attempt is printed and followed by an exponentially growing
    pause (1s, 2s, 4s, ... capped at `max_backoff_seconds`) so that a struggling
    server is not hit again immediately.

    Args:
        texts: list of keyword strings forwarded to `storage.add_texts`.

    Raises:
        ApiException: re-raised from the last attempt once
            `max_insert_attempts` attempts have failed.
    """
    for attempt in range(1, max_insert_attempts + 1):
        try:
            storage.add_texts(
                texts=texts,
                # payloads=payloads[i : i + chunk_size],
            )
            return
        except ApiException as e:
            print(f"Error: {e}")
            if attempt == max_insert_attempts:
                raise
            time.sleep(min(2.0 ** (attempt - 1), max_backoff_seconds))


website_pbar = tqdm.tqdm(websites["website"], desc="Websites")
for website in website_pbar:
    website_pbar.set_description(f"Reading {website}")

    website_keywords = ch_client.query_df(
        query_keywords, parameters={"website": website}
    )
    # Nothing to embed or insert for a website without keywords; skipping also
    # avoids averaging over an empty tensor below.
    if website_keywords.empty:
        continue

    keywords = website_keywords["keyword"].tolist()
    website_pbar.set_description(f"Embedding {website}")
    keyword_embedding = storage.embedder.embed(keywords).embedding
    average_position = website_keywords["average_position"]

    # Weight every keyword embedding by the inverse of its average position.
    # NOTE(review): a position of 0 would produce an infinite weight — confirm
    # the query never returns one.
    weights = 1 / torch.tensor(average_position.array).unsqueeze(-1)
    weighted_embeddings = weights * keyword_embedding

    # Mean of the weighted embeddings, rescaled to unit length.
    website_embedding = weighted_embeddings.mean(dim=0)
    website_embedding = website_embedding / website_embedding.norm()

    # The website-level point is computed but currently not stored:
    # website_point = PointStruct(
    #     id=uuid.uuid5(uuid.NAMESPACE_URL, website).hex,
    #     vector=website_embedding.tolist(),
    #     payload={"website": website},
    # )
    # storage.add_points([website_point])
    website_pbar.set_description(f"Inserting {website}")

    # Payloads are not attached at the moment:
    # website_keywords["website"] = website
    # payloads = website_keywords.to_dict(orient="records")
    # This can be sped up using upload collection

    # The context manager closes the inner bar even if an insert finally fails.
    with tqdm.tqdm(
        total=len(keywords),
        desc="Adding keywords",
        leave=False,
    ) as insertion_pbar:
        # `keywords` is built once above and sliced here, instead of converting
        # the whole column to a list again for every chunk.
        for i in range(0, len(keywords), chunk_size):
            keyword_chunk = keywords[i : i + chunk_size]
            _add_texts_with_retry(keyword_chunk)
            insertion_pbar.update(len(keyword_chunk))

# diginoy.com
# 988 kw

                 website
12           digiato.com
13           diginoy.com
14   donya-e-eqtesad.com
15          drpharmo.com
16           faradeed.ir
..                   ...
96   www.titrebartar.com
97          www.zoomg.ir
98         www.zoomit.ir
99           zoomlife.ir
100         zoomtech.org

[89 rows x 1 columns]


Websites:   0%|          | 0/89 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/988 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: Server disconnected without sending a response.
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: Server disconnected without sending a response.
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/423 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/552 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/42 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1957 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/851 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/170 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/968 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/79 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/168 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1081 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/303 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/74 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/31 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/538 [00:00<?, ?it/s]

Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/96 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1096 [00:00<?, ?it/s]

Error: Server disconnected without sending a response.


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1008 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/168 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1196 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/971 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/6 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/267 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/1911 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/665 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2185 [00:00<?, ?it/s]

Error: Server disconnected without sending a response.
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/587 [00:00<?, ?it/s]

Error: Server disconnected without sending a response.
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/2143 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: Server disconnected without sending a response.
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/186 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2315 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/401 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1043 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/657 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/337 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/170 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1051 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1551 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1020 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/669 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/237 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: Server disconnected without sending a response.


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out
Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/832 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1535 [00:00<?, ?it/s]

Error: timed out


Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Error: timed out


In [None]:
# Load the jupyternotify IPython extension.
%load_ext jupyternotify

  from pkg_resources import resource_filename


<IPython.core.display.Javascript object>

In [None]:
# Run the `notify` magic (presumably provided by the jupyternotify extension loaded above — confirm).
%notify

<IPython.core.display.Javascript object>

In [None]:
# Show the `points_count` field of the info object returned by the vector store.
storage.info().points_count

In [None]:
# %%time
# Query
from pathlib import Path

import pandas as pd
import numpy as np


# Search the vector store for the query text and take the 32 best hits.
results = storage.search(query="خرید ", top_k=32)

# One record per hit whose payload carries the stored keyword text.
# `payload or {}` tolerates hits returned without a payload, and the explicit
# column list keeps the frame usable below even when no hit qualifies.
result = pd.DataFrame(
    [
        {
            "similarity": r.score,
            # "average_position": r.payload["average_position"],
            # "website": r.payload["website"],
            "keyword": r.payload["text"],
        }
        for r in results.points
        if "text" in (r.payload or {})
    ],
    columns=["similarity", "keyword"],
)
# Website-level hits are not collected at the moment; kept as an empty frame.
websites = pd.DataFrame([])

# `Path` is imported in this cell so the query does not depend on the ingestion
# cell having been executed first.
query_ranks = Path("queries/website_by_keyword.sql").read_text(encoding="utf-8")

# Look up the websites for the matched keywords, passing keywords and their
# similarities as two parallel arrays.
ranks = ch_client.query_df(
    query_ranks,
    parameters={
        "keywords": result["keyword"].tolist(),
        "similarity": result["similarity"].tolist(),
    },
).rename(columns={"q.similarity": "similarity"})


# Per-row score: similarity divided by the average position.
# Alternative denominator tried: np.log(ranks["average_position"] + 1)
ranks["score"] = ranks["similarity"] / ranks["average_position"]
# Mean score per website, highest first.
ranked_websites = ranks.groupby("website")["score"].mean().sort_values(ascending=False)
ranked_websites.head(20)

In [None]:
# Display the current value of `result`.
result

In [None]:
import numpy as np


# Per-row score: similarity divided by the average position.
# Alternative denominator tried: np.log(ranks["average_position"] + 1)
ranks["score"] = ranks["similarity"].div(ranks["average_position"])

# Mean score per website, highest first; show the top 20.
ranked_websites = (
    ranks.groupby("website")["score"]
    .mean()
    .sort_values(ascending=False)
)
ranked_websites.head(20)

In [None]:
# Display `ranks` ordered by score, highest first.
ranks.sort_values("score", ascending=False)

In [None]:
import numpy as np


# NOTE(review): this cell reads result["average_position"] and groups by "website",
# but the query cell above builds `result` with only "similarity" and "keyword"
# (the other two fields are commented out there) — confirm this cell is still
# meant to run, or whether it should operate on `ranks`.
# NOTE(review): the exponent 768 is not explained here — confirm its intent.
result["score"] = result["similarity"] ** 768 / np.log(result["average_position"] + 1)
# Average of score per website
ranked_websites = result.groupby("website")["score"].mean().sort_values(ascending=False)
ranked_websites.head(20)

In [None]:
# Debug aid: show what `schema()` returns for the search result object.
results.schema()

In [None]:
# Print score and website for every hit whose payload has a "website" entry.
for r in results.points:
    if "website" not in r.payload:
        continue
    print(f"{r.score:<.4f} | {r.payload['website']}")


In [None]:
# Debug aid: shape of the tensor built from `average_position`, constructed the
# same way as in the ingestion loop.
torch.tensor(average_position.array).shape

In [None]:
# Debug aid: shape of the keyword embeddings from the ingestion loop.
# `keyword_embedding` is already the `.embedding` value of the embed result,
# so the shape is read from it directly (the extra `.embedding` hop was removed).
keyword_embedding.shape

In [None]:
# Joins the matched keywords (with their scores) against ahrefs.keywords so each
# keyword is listed with the websites and average positions recorded for it.
debug_query = """
WITH
    {keywords:Array(String)} AS keywords,
    {scores:Array(Float32)} AS scores
SELECT
    q.keyword,
    q.score,
    d.website,
    d.average_position
FROM
(
    SELECT
        arrayJoin(arrayEnumerate(keywords)) AS idx,
        keywords[idx] AS keyword,
        scores[idx] AS score
) AS q
LEFT OUTER JOIN
(
    SELECT
        keyword,
        website,
        average_position
    FROM ahrefs.keywords
) AS d
ON q.keyword = d.keyword
"""


def _payload_keyword(point):
    """Return the keyword stored on a search hit, or None when it has none.

    The "keyword" payload key is tried first, then "text", which is the key the
    main query cell reads; hits stored under either key are therefore usable.
    """
    payload = point.payload or {}
    return payload.get("keyword", payload.get("text"))


# Pair every hit's keyword with its score in one pass so the two arrays sent to
# ClickHouse stay index-aligned; hits without a keyword are skipped.
_scored_keywords = [(_payload_keyword(r), r.score) for r in results.points]
_scored_keywords = [(kw, score) for kw, score in _scored_keywords if kw is not None]

# NOTE: `result` is rebound here from the earlier DataFrame to a plain list of scores.
result = [score for _, score in _scored_keywords]
related_keywords = [kw for kw, _ in _scored_keywords]

websites = ch_client.query_df(
    debug_query, parameters={"keywords": related_keywords, "scores": result}
)
print(websites)


In [None]:
# Per-website average position over the related keywords, weighted by `volume`;
# rows are sorted ascending by that value and limited to 20.
query = """
SELECT
    website,
    sum(average_position * volume) / sum(volume) AS weighted_avg_position
FROM ahrefs.keywords
WHERE keyword IN {keywords:Array(String)}
GROUP BY website
ORDER BY weighted_avg_position ASC
LIMIT 20
"""

# `related_keywords` comes from the debug cell above and is bound to the
# `keywords` query parameter.
df = ch_client.query_df(query, parameters={"keywords": related_keywords})
df