is blocking (like exclude based on length difference an option?)
for the ones with low cosine similarity, just manually search?

Parameter k = how many neighbors you return, Parameter ef = How many neighbors you explore


In [5]:
import pandas as pd
# read test_pairs_all.parquet in data folder
df = pd.read_parquet('data/processed/test_pairs_all.parquet')
# take only first 10000 rows
df = df.head(10000)
print(df.head())

  fraudulent_name   real_name  label
0      mekwajisyo  meiwajisyo    1.0
1      meiawijsyo  meiwajisyo    1.0
2     meiwa-ijsoy  meiwajisyo    1.0
3      męiwajisyo  meiwajisyo    1.0
4      meįwąjisyo  meiwajisyo    1.0


In [6]:
df_real = df[['real_name']].drop_duplicates().reset_index(drop=True)
print(df_real.head())

           real_name
0         meiwajisyo
1   webfactorydesign
2             125wyt
3           aeonbank
4  filmesonlineagora


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=1
)

X_real = vectorizer.fit_transform(df_real["real_name"]).astype("float32")
X_real = X_real.toarray()


In [16]:
import hnswlib

dim = X_real.shape[1]

index = hnswlib.Index(space="cosine", dim=dim)
index.init_index(
    max_elements=len(X_real),
    ef_construction=200,
    M=16
)

index.add_items(X_real, ids=df_real.index.to_numpy())
index.set_ef(int(0.1*len(X_real)))

In [17]:
from rapidfuzz.distance import Levenshtein

def lev_sim(a, b):
    return 1 - Levenshtein.distance(a, b) / max(len(a), len(b))

X_query = vectorizer.transform(df["real_name"]).astype("float32")
X_query = X_query.toarray()

In [18]:
K = int(0.05*len(X_real))

results = []

for i, q_vec in enumerate(X_query):
    labels, distances = index.knn_query(q_vec, k=K)
    candidate_ids = labels[0]

    query_norm = df.loc[i, "real_name"]

    scored = []
    for cid in candidate_ids:
        real_norm = df_real.loc[cid, "real_name"]
        score = lev_sim(query_norm, real_norm)
        scored.append((cid, score))

    best_cid, best_score = max(scored, key=lambda x: x[1])

    results.append({
        "query_name": df.loc[i, "fraudulent_name"],
        "actual_real_name": df.loc[i, "real_name"],
        "best_real_name": df_real.loc[best_cid, "real_name"],
        "lev_similarity": best_score
    })


In [19]:
def decision(score):
    if score >= 0.90:
        return "block"
    elif score >= 0.80:
        return "review"
    return "allow"

out = pd.DataFrame(results)
out["decision"] = out["lev_similarity"].apply(decision)

In [20]:
print(out.head(20))

           query_name  actual_real_name    best_real_name  lev_similarity  \
0          mekwajisyo        meiwajisyo        meiwajisyo             1.0   
1          meiawijsyo        meiwajisyo        meiwajisyo             1.0   
2         meiwa-ijsoy        meiwajisyo        meiwajisyo             1.0   
3          męiwajisyo        meiwajisyo        meiwajisyo             1.0   
4          meįwąjisyo        meiwajisyo        meiwajisyo             1.0   
5          meiwàjišÿo        meiwajisyo        meiwajisyo             1.0   
6          meiwajiryò        meiwajisyo        meiwajisyo             1.0   
7          leiwâjiŝyo        meiwajisyo        meiwajisyo             1.0   
8         rneiwaijsyô        meiwajisyo        meiwajisyo             1.0   
9    wdbfactorydesign  webfactorydesign  webfactorydesign             1.0   
10  webfacttorydesgin  webfactorydesign  webfactorydesign             1.0   
11   webfictorydesgkn  webfactorydesign  webfactorydesign             1.0   

In [21]:
# print accuracy as percentage of rows where actual_real_name == best_real_name
accuracy = (out["actual_real_name"] == out["best_real_name"]).mean()
print(f"Accuracy: {accuracy:.4%}")

Accuracy: 96.3100%


In [22]:
# print rows where actual_real_name != best_real_name
mismatches = out[out["actual_real_name"] != out["best_real_name"]]
print(mismatches)

     query_name actual_real_name best_real_name  lev_similarity decision
1395        z6r               z6           nzxt        0.250000    allow
1396        zbc               z6           nzxt        0.250000    allow
1397       zbdi               z6           nzxt        0.250000    allow
1398         ẕ6               z6           nzxt        0.250000    allow
1399         ẓ6               z6           nzxt        0.250000    allow
...         ...              ...            ...             ...      ...
9013         ƀɱ               bm            fbo        0.333333    allow
9014         ƀɱ               bm            fbo        0.333333    allow
9015        bɱm               bm            fbo        0.333333    allow
9016         ɱƀ               bm            fbo        0.333333    allow
9017          ɍ               bm            fbo        0.333333    allow

[369 rows x 5 columns]
