In [1]:
import os
import clickhouse_connect
from dotenv import load_dotenv


# qdrant_client = QdrantClient("http://127.0.0.1:6333")

load_dotenv()

ch_client = clickhouse_connect.get_client(
    host=os.getenv("CLICKHOUSE_HOST"),
    port=int(os.getenv("CLICKHOUSE_PORT")),
    username=os.getenv("CLICKHOUSE_USERNAME"),
    password=os.getenv("CLICKHOUSE_PASSWORD"),
)


In [2]:
from config import EMBEDDING_MODEL_PATH
from emcache import OllamaBackend
from emseo.storage import VectorStoreEmbedding

backend = OllamaBackend(base_url="http://127.0.0.1:11434")
storage = VectorStoreEmbedding(backend, collection_prefix="keywords")
# storage.embedder.load_state_dict(
#     torch.load(EMBEDDING_MODEL_PATH / f"{storage.collection_name}.pt")
# )

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=11434): Max retries exceeded with url: /api/embeddings (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001323DF37290>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
from emcache.huggingface import HuggingFaceBackend
from emseo.storage import VectorStoreEmbedding


# Initialize
# backend = HuggingFaceBackend(model_name="heydariAI/persian-embeddings")
backend = HuggingFaceBackend(model_name="intfloat/multilingual-e5-large")
storage = VectorStoreEmbedding(backend, collection_prefix="keywords")


### Initialize Qdrant

In [None]:
from pathlib import Path
import uuid
import torch
import tqdm.auto as tqdm
from qdrant_client.http.models import PointStruct

chunk_size = 128
query_websites = Path("queries/website_list.sql").read_text(encoding="utf-8")
query_keywords = Path("queries/keywords_by_website.sql").read_text(encoding="utf-8")
websites = ch_client.query_df(query_websites)

website_pbar = tqdm.tqdm(websites["website"], desc="Websites")
for website in website_pbar:
    website_pbar.set_description(f"Reading {website}")

    website_keywords = ch_client.query_df(
        query_keywords, parameters={"website": website}
    )

    keywords = website_keywords["keyword"].tolist()
    website_pbar.set_description(f"Embedding {website}")
    keyword_embedding = storage.embedder.embed(keywords).embedding
    average_position = website_keywords["average_position"]

    weights = 1 / torch.tensor(average_position.array).unsqueeze(-1)
    weighted_embeddings = weights * keyword_embedding

    # Normalize the average
    website_embedding = weighted_embeddings.mean(dim=0)
    website_embedding = website_embedding / website_embedding.norm()

    # website_point = PointStruct(
    #     id=uuid.uuid5(uuid.NAMESPACE_URL, website).hex,
    #     vector=website_embedding.tolist(),
    #     payload={"website": website},
    # )

    # storage.add_points([website_point])
    website_pbar.set_description(f"Inserting {website}")
    # continue

    insertion_pbar = tqdm.tqdm(
        total=len(website_keywords),
        desc="Adding keywords",
        leave=False,
    )

    # Create a new column filled with the website name
    # website_keywords["website"] = website

    # payloads = website_keywords.to_dict(orient="records")
    # This 
    for i in range(0, len(website_keywords), chunk_size):
        keywords = website_keywords["keyword"].tolist()[i : i + chunk_size]
        storage.add_texts(texts=keywords)
        insertion_pbar.update(len(keywords))

    insertion_pbar.close()

Websites:   0%|          | 0/101 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/851 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/74 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/587 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/186 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1051 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/107 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/113 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/154 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/400 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/442 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/988 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/423 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/552 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/42 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1957 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/170 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/968 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/79 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/168 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1081 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/303 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/31 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/538 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/96 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1096 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1008 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/168 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1196 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/971 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/6 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/267 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1911 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/665 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2185 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2143 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2315 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/401 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1043 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/657 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/337 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/170 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1551 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1020 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/669 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/237 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/832 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/1535 [00:00<?, ?it/s]

Adding keywords:   0%|          | 0/2500 [00:00<?, ?it/s]

In [117]:
storage.info().points_count

93178

In [None]:
# %%time
# Query
import pandas as pd
import numpy as np


results = storage.search(query="خرید ", top_k=32)


result = []
websites = []
for r in results.points:
    # related_keywords.append(r.payload["text"])
    if "keyword" not in r.payload:
        # print(f"{r.score:<.4f} -> {r.payload['website']}")
        websites.append({"website": r.payload["website"], "similarity": r.score})

    if "keyword" in r.payload:
        # print(
        #     f"{r.score:<.4f} $ {r.payload['average_position']:>6.2f} | {r.payload['keyword']} -> {r.payload['website']}"
        # )

        result.append(
            {
                "similarity": r.score,
                "average_position": r.payload["average_position"],
                "website": r.payload["website"],
                "keyword": r.payload["keyword"],
            }
        )

    # print(f"{r.score:<.4f} | {r.payload}")

result = pd.DataFrame(result)
websites = pd.DataFrame(websites)
# result
# websites

query_ranks = Path("queries/website_by_keyword.sql").read_text(encoding="utf-8")

ranks = ch_client.query_df(
    query_ranks,
    parameters={
        "keywords": result["keyword"].tolist(),
        "similarity": result["similarity"].tolist(),
    },
).rename(columns={"q.similarity": "similarity"})


ranks["score"] = (
    ranks["similarity"] / ranks["average_position"]
)  # np.log(ranks["average_position"] + 1)
ranked_websites = ranks.groupby("website")["score"].mean().sort_values(ascending=False)
ranked_websites.head(20)

website
www.zoomit.ir          0.302259
arzdigital.com         0.131988
fararu.com             0.111949
www.iranjib.ir         0.111643
digiato.com            0.087437
ana.ir                 0.032320
www.sharghdaily.com    0.031230
www.gsm.ir             0.014423
www.fardanews.com      0.013792
www.55online.news      0.013015
www.baharnews.ir       0.012895
roozno.com             0.012281
zoomtech.org           0.011006
cryptonegar.com        0.010977
mihanblockchain.com    0.010754
diginoy.com            0.010447
Name: score, dtype: float32

In [127]:
result

Unnamed: 0,similarity,average_position,website,keyword
0,0.957,5.222222,www.zoomit.ir,خرید لپ تاپ
1,0.952356,5.285714,www.zoomit.ir,خرید لپتاپ
2,0.938953,4.2,www.zoomit.ir,خرید تبلت
3,0.922697,3.111111,www.zoomit.ir,لب تاپ
4,0.92173,5.125,www.zoomit.ir,خرید موبایل
5,0.921361,3.625,www.zoomit.ir,لب تاب
6,0.91893,3.75,www.zoomit.ir,قیمت لپ تاپ
7,0.91016,1.0,www.zoomit.ir,راهنمای خرید لپ تاپ
8,0.910124,5.2,www.zoomit.ir,خرید گوشی
9,0.909331,3.4,www.zoomit.ir,لپ تاب


In [114]:
import numpy as np


ranks["score"] = (
    ranks["similarity"] / ranks["average_position"]
)  # np.log(ranks["average_position"] + 1)
ranked_websites = ranks.groupby("website")["score"].mean().sort_values(ascending=False)
ranked_websites.head(20)

website
arzdigital.com            0.690105
www.tgju.org              0.583497
www.iranjib.ir            0.320383
www.shomanews.com         0.221259
tejaratnews.com           0.178851
www.sharghdaily.com       0.155436
www.entekhab.ir           0.154838
diginoy.com               0.127491
mihanblockchain.com       0.117071
www.iscanews.ir           0.115437
www.eghtesadonline.com    0.104757
www.etemadonline.com      0.075478
www.tasnimnews.com        0.059003
donya-e-eqtesad.com       0.053102
www.parsine.com           0.017386
iraneconomist.com         0.014681
aftabnews.ir              0.014344
mihansignal.com           0.014311
utofx.com                 0.012829
www.hamyarcrypto.com      0.012355
Name: score, dtype: float32

In [107]:
ranks.sort_values("score", ascending=False)

Unnamed: 0,keyword,similarity,website,average_position,score
5,هندزفری,0.922499,www.zoomit.ir,3.909091,0.235988
14,هندزفری گردنی,0.912025,www.zoomit.ir,4.2,0.217149
8,هندزفری سامسونگ,0.915632,www.zoomit.ir,4.333333,0.2113
12,هندزفری بلوتوثی,0.912829,www.zoomit.ir,4.777778,0.191057
9,هندزفری بی سیم,0.915287,www.zoomit.ir,4.875,0.187751
13,خرید موبایل,0.912325,www.zoomit.ir,5.125,0.178015
18,خرید گوشی,0.897348,www.zoomit.ir,5.2,0.172567
15,خرید مانیتور,0.899764,www.zoomit.ir,6.833333,0.131673
17,خرید گوشی,0.897348,www.gsm.ir,37.666668,0.023823
1,عکس هندزفری,0.924241,ana.ir,54.0,0.017116


In [78]:
import numpy as np


result["score"] = result["similarity"] ** 768 / np.log(result["average_position"] + 1)
# Average of score per website
ranked_websites = result.groupby("website")["score"].mean().sort_values(ascending=False)
ranked_websites.head(20)

website
anzalweb.ir            9.241214e-30
www.gsm.ir             5.803781e-30
www.zoomit.ir          1.005721e-30
zoomtech.org           3.515494e-34
digiato.com            1.008411e-38
zoomlife.ir            8.112825e-40
arzdigital.com         9.361835e-41
www.fardanews.com      5.920407e-41
jamejamonline.ir       3.788663e-41
asemooni.com           5.250295e-42
ana.ir                 2.161713e-42
www.entekhab.ir        2.140458e-42
www.sharghdaily.com    7.009531e-43
mihanblockchain.com    5.854597e-44
www.iranjib.ir         4.263496e-44
www.tgju.org           5.425591e-45
www.tabnak.ir          2.631182e-45
techrato.com           7.516922e-46
utofx.com              3.720903e-46
footofan.com           2.195695e-46
Name: score, dtype: float64

In [17]:
results.schema()

C:\Users\20mah\AppData\Local\Temp\ipykernel_19240\1561052547.py:1: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results.schema()


{'$defs': {'ScoredPoint': {'description': 'Search result',
   'properties': {'id': {'anyOf': [{'type': 'integer'}, {'type': 'string'}],
     'description': 'Search result',
     'title': 'Id'},
    'version': {'description': 'Point version',
     'title': 'Version',
     'type': 'integer'},
    'score': {'description': 'Points vector distance to the query vector',
     'title': 'Score',
     'type': 'number'},
    'payload': {'anyOf': [{'additionalProperties': True, 'type': 'object'},
      {'type': 'null'}],
     'default': None,
     'description': 'Payload - values assigned to the point',
     'title': 'Payload'},
    'vector': {'anyOf': [{'items': {'type': 'number'}, 'type': 'array'},
      {'items': {'items': {'type': 'number'}, 'type': 'array'},
       'type': 'array'},
      {'additionalProperties': {'anyOf': [{'items': {'type': 'number'},
          'type': 'array'},
         {'$ref': '#/$defs/SparseVector'},
         {'items': {'items': {'type': 'number'}, 'type': 'array'},
   

In [None]:
for r in results.points:
    # related_keywords.append(r.payload["text"])
    if "website" in r.payload:
        print(f"{r.score:<.4f} | {r.payload['website']}")


tensor([-0.0045,  0.0029,  0.0013,  ...,  0.0047, -0.0102,  0.0093])

In [None]:
torch.tensor(average_position.array).shape

torch.Size([2500])

In [53]:
keyword_embedding.embedding.shape

torch.Size([2500, 1024])

In [None]:
debug_query = """
WITH
    {keywords:Array(String)} AS keywords,
    {scores:Array(Float32)} AS scores
SELECT
    q.keyword,
    q.score,
    d.website,
    d.average_position
FROM
(
    SELECT
        arrayJoin(arrayEnumerate(keywords)) AS idx,
        keywords[idx] AS keyword,
        scores[idx] AS score
) AS q
LEFT OUTER JOIN
(
    SELECT
        keyword,
        website,
        average_position
    FROM ahrefs.keywords
) AS d
ON q.keyword = d.keyword
"""

result = [r.score for r in results.points]
related_keywords = [r.payload["keyword"] for r in results.points]

websites = ch_client.query_df(
    debug_query, parameters={"keywords": related_keywords, "scores": result}
)
print(websites)


             keyword     score                 website  average_position
0          ایده تولد  0.946327          www.chetor.com          2.750000
1              توحید  0.937184        www.beytoote.com          7.000000
2         سوره توحید  0.894493        www.beytoote.com          2.666667
3         سوره توحید  0.894493         www.delgarm.com         14.000000
4       ماه های تولد  0.891508  www.tasvirezendegi.com          4.000000
..               ...       ...                     ...               ...
367  تولد برای خواهر  0.813686  www.tasvirezendegi.com          1.000000
368     احادیث کوتاه  0.813636                  ana.ir         52.000000
369     احادیث کوتاه  0.813636           www.talab.org          6.000000
370     شایلی محمودی  0.813529            persianv.com         10.000000
371     شایلی محمودی  0.813529       www.niksalehi.com          5.000000

[372 rows x 4 columns]


In [None]:
query = """
SELECT
    website,
    sum(average_position * volume) / sum(volume) AS weighted_avg_position
FROM ahrefs.keywords
WHERE keyword IN {keywords:Array(String)}
GROUP BY website
ORDER BY weighted_avg_position ASC
LIMIT 20
"""

df = ch_client.query_df(query, parameters={"keywords": related_keywords})
df

Unnamed: 0,website,weighted_avg_position
0,www.zoomit.ir,1.0
1,www.tgju.org,2.684793
2,digiato.com,4.40625
3,arzdigital.com,6.445607
4,cryptonegar.com,6.734848
5,www.kojaro.com,6.746941
6,www.beytoote.com,6.893842
7,safarpin.com,7.0
8,www.chetor.com,7.74109
9,www.asriran.com,9.616824
