In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

master_path = "domains_master.csv"
df = pd.read_csv(master_path)

required_cols = {"domain", "cluster_keyword"}

# clusters
keywords = sorted(df["cluster_keyword"].dropna().unique().tolist())
centers = {k: f"{k}.com" for k in keywords}

# Embedding model
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", device="cpu")

# Encode all unique domains
unique_domains = df["domain"].unique().tolist()
domain_emb = model.encode(unique_domains, convert_to_tensor=True, show_progress_bar=True)

# Map domain to embedding row index
domain_idx = {d: i for i, d in enumerate(unique_domains)}

# Encode centers
center_texts = [centers[k] for k in keywords]
center_emb = model.encode(center_texts, convert_to_tensor=True, show_progress_bar=False)
center_idx = {k: i for i, k in enumerate(keywords)}

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

TypeError: 'float' object is not callable

In [2]:
# Compute embedding_distance for each row:
# distance = (1 - cosine_similarity(domain, its own center))*(1/0.6) CITAR
distances = np.empty(len(df), dtype=float)

for i, row in df.iterrows():
    k = row["cluster_keyword"]
    d = row["domain"]

    if pd.isna(k) or k not in centers:
        distances[i] = np.nan
        continue

    di = domain_idx[d]
    ci = center_idx[k]

    sim = util.cos_sim(domain_emb[di], center_emb[ci]).item()
    distances[i] = (1.0 - sim)*(1/0.6)

df["embedding_distance"] = distances

# Save
out_path = "domains_master_with_qwen.csv"
df.to_csv(out_path, index=False)

print("Saved:", out_path)
print(df[["domain", "cluster_keyword", "embedding_distance"]].head(10))


Saved: domains_master_with_qwen.csv
                   domain cluster_keyword  embedding_distance
0                chat.com            chat       -1.986821e-07
1  top-chat-solutions.com            chat        4.593615e-01
2              chatmy.com            chat        2.239104e-01
3         elitechatai.com            chat        5.670574e-01
4       quickchathelp.com            chat        3.305179e-01
5           chatmarket.ai            chat        4.825898e-01
6       securechat360.org            chat        5.126576e-01
7             chatbest.co            chat        3.418442e-01
8            truechat.org            chat        2.587932e-01
9      prochatservice.com            chat        3.308956e-01


In [3]:
master_qwen = pd.read_csv("domains_master_with_qwen.csv")[["domain", "embedding_distance"]]

# Chat panel (all)
chat_all = pd.read_csv("chat_panel_all.csv")
chat_all = chat_all.drop(columns=["embedding_distance"], errors="ignore").merge(master_qwen, on="domain", how="left")
chat_all.to_csv("chat_panel_all_with_qwen.csv", index=False)

# Chat panel (sold)
chat_sold = pd.read_csv("chat_sales_panel.csv")
chat_sold = chat_sold.drop(columns=["embedding_distance"], errors="ignore").merge(master_qwen, on="domain", how="left")
chat_sold.to_csv("chat_sales_panel_with_qwen.csv", index=False)

# Cross-section (sold)
cross_sold = pd.read_csv("multi_keyword_cross_section.csv")
cross_sold = cross_sold.drop(columns=["embedding_distance"], errors="ignore").merge(master_qwen, on="domain", how="left")
cross_sold.to_csv("multi_keyword_cross_section_with_qwen.csv", index=False)