In [27]:
import pandas as pd
import numpy as np
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import matplotlib.pyplot as plt
from matplotlib import font_manager

In [28]:
import unicodedata
import torch
import torch.nn.functional as F
import pandas as pd
import tqdm
from PIL import Image, ImageDraw, ImageFont
from matplotlib import font_manager
from transformers import AutoProcessor, SiglipModel
from sklearn.metrics import roc_auc_score

# Summer 2025

## Text -> glyph -> embedding

In [29]:
def _get_unicode_font(font_size=14):
    """
    Load the font used for glyph rendering.

    DejaVu Sans (bundled with matplotlib) is located through
    ``font_manager.findfont`` and loaded at ``font_size``.  If locating or
    loading it raises, PIL's built-in default font is returned instead and a
    ``RuntimeWarning`` is emitted so the degraded rendering is not silent.

    Args:
        font_size: Size forwarded to ``ImageFont.truetype``.

    Returns:
        The font object from ``ImageFont.truetype``, or PIL's default font
        on failure.
    """
    import warnings  # local import: only used on the fallback path

    try:
        path = font_manager.findfont("DejaVu Sans", fallback_to_default=True)
        return ImageFont.truetype(path, font_size)
    except Exception as exc:
        # Best-effort fallback is kept on purpose, but it is "not ideal" for
        # Unicode text, so make it visible instead of swallowing the error.
        warnings.warn(
            f"Could not load DejaVu Sans ({exc!r}); falling back to PIL's default font.",
            RuntimeWarning,
            stacklevel=2,
        )
        return ImageFont.load_default()


def generate_glyph_image(text, image_size=(224, 224), font_size=14):
    """
    Render a name as white text on a black RGB canvas.

    Args:
        text: String to draw.  It is NFC-normalised before rendering.
        image_size: ``(width, height)`` of the canvas in pixels.
        font_size: Size forwarded to ``_get_unicode_font``.  The default (14)
            reproduces the previously hard-coded rendering.

    Returns:
        The "RGB" ``PIL.Image.Image`` with the text drawn on it.
    """
    text = unicodedata.normalize('NFC', text)
    # Three-channel RGB canvas filled with black.
    image = Image.new("RGB", image_size, color=(0, 0, 0))
    draw = ImageDraw.Draw(image)
    font = _get_unicode_font(font_size)

    # Measure the text so it can be positioned around the canvas centre.
    bbox = draw.textbbox((0, 0), text, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]
    # NOTE(review): the box's own origin offset (bbox[0], bbox[1]) is not
    # subtracted, so the text may sit slightly off-centre.  Left as-is so the
    # rendered images stay identical to earlier runs -- confirm if it matters.
    x = (image_size[0] - text_width) // 2
    y = (image_size[1] - text_height) // 2

    # Draw the text in white.
    draw.text((x, y), text, font=font, fill=(255, 255, 255))

    return image

In [30]:
# --- 2) Load model/processor ---
# The vision-only SigLIP class is loaded here (SiglipVisionModel); the image
# embedding used later in this notebook is read from its `pooler_output`.
from transformers import SiglipVisionModel

# Single source for the checkpoint id so model and processor cannot drift apart.
SIGLIP_CHECKPOINT = "google/siglip-base-patch16-224"
model = SiglipVisionModel.from_pretrained(SIGLIP_CHECKPOINT)
processor = AutoProcessor.from_pretrained(SIGLIP_CHECKPOINT)

In [47]:
from math import ceil
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F

import torch
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move the weights to that device, then switch the module to evaluation mode.
model = model.to(device)
model = model.eval()

def embed_names(names, batch_size=64):
    """
    Embed every distinct name as a glyph image.

    Names are converted with ``str``, de-duplicated and sorted, rendered with
    ``generate_glyph_image``, preprocessed by the module-level ``processor``
    and run through the module-level ``model`` on ``device``.  The model's
    ``pooler_output`` is L2-normalised along the last dimension.

    Args:
        names: Iterable of names; each item is converted with ``str``.
        batch_size: Number of images per forward pass (must be >= 1).

    Returns:
        dict mapping each distinct name string to its normalised embedding
        tensor (one row of the batch output, left on ``device``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    uniq = sorted(set(map(str, names)))
    name_to_emb = {}

    for start in tqdm(range(0, len(uniq), batch_size),
                      desc="Embedding images",
                      unit="batch",
                      dynamic_ncols=True):

        chunk = uniq[start:start + batch_size]
        # Render one batch at a time so only `batch_size` images are held in
        # memory, instead of one image per distinct name.
        pil_batch = [generate_glyph_image(n) for n in chunk]

        # Preprocess the batch exactly once.
        batch = processor(images=pil_batch, return_tensors="pt")

        # Sanity check over the whole batch (previously only the last image
        # of each batch was actually inspected).
        if not torch.isfinite(batch["pixel_values"]).all().item():
            print("Non-finite tensor_img detected!")

        batch = {k: v.to(device) for k, v in batch.items()}

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            out = model(**batch)
            embs = out.pooler_output

        if not torch.isfinite(embs).all().item():
            print("Non-finite embeddings detected!")

        # Unit-length vectors; eps guards against division by a zero norm.
        embs = F.normalize(embs, dim=-1, eps=1e-8)

        for n, e in zip(chunk, embs):
            name_to_emb[n] = e

    return name_to_emb


In [48]:
from math import ceil
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F


In [76]:
# Colab-only step: `files.upload()` asks the user for file(s) to upload;
# its return value is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving test_pairs_10k.parquet to test_pairs_10k.parquet


In [77]:
# Read the labelled name pairs from disk once; `final_df` starts as an
# independent copy of the same table rather than a second read of the file.
PAIRS_PATH = '/content/test_pairs_10k.parquet'
test_data = pd.read_parquet(PAIRS_PATH)
final_df = test_data.copy()


In [78]:
# Pool both name columns into one Series, then embed each distinct name.
all_names = pd.concat(
    [test_data[col] for col in ("fraudulent_name", "real_name")],
    ignore_index=True,
)
name2emb = embed_names(all_names.tolist(), batch_size=128)

Rendering glyphs:   0%|          | 0/15975 [00:00<?, ?img/s]

Embedding images:   0%|          | 0/125 [00:00<?, ?batch/s]

In [79]:
# create pandas dataframe with all_names and their embeddings

import numpy as np
import pandas as pd

# Keys with every leading hyphen removed (lstrip drops all of them, not just
# the first character).
names = [name.lstrip('-') for name in name2emb]

# Stack the vectors first and copy to host memory once, instead of moving
# and converting each embedding separately.
embs = torch.stack(list(name2emb.values())).detach().cpu().numpy()

# One row per name, one `emb_<i>` column per embedding dimension.
df = pd.DataFrame(embs, columns=[f"emb_{i}" for i in range(embs.shape[1])])
df.insert(0, "name", names)

# True if any embedding value is NaN.
print(np.isnan(embs).any())

False


In [80]:
# Preview the first five rows of the pair table.
print(final_df.head(5))

  fraudulent_name     real_name  label
0             îto           ito    1.0
1      sigmaphoto     iansphoto    0.0
2    jagatrev1ewg   jagatreview    1.0
3        tempsite   sempreinter    0.0
4     worlďsnoker  worldsnooker    1.0


In [81]:
# Put both sides of the join on the same key normalisation: string keys with
# leading hyphens removed and surrounding whitespace trimmed.
df["name"] = df["name"].astype(str).str.lstrip('-').str.strip()
final_df["fraudulent_name"] = final_df["fraudulent_name"].str.lstrip('-').str.strip()
final_df["real_name"] = final_df["real_name"].str.lstrip('-').str.strip()

# Identify embedding columns
emb_cols = [c for c in df.columns if c.startswith("emb_")]
D = len(emb_cols)

# Keep one row per key: names that collide after normalisation would
# otherwise duplicate rows of final_df in the left joins below.
emb_lookup = df[["name"] + emb_cols].drop_duplicates(subset="name", keep="first")

# Make the cell safe to re-run: drop embedding columns left by an earlier
# execution so the merges do not create suffixed duplicates.
stale_cols = [c for c in final_df.columns if c.startswith(("fraud_emb_", "real_emb_"))]
final_df = final_df.drop(columns=stale_cols)


def _prefixed_embeddings(key_col, prefix):
    """Return the embedding table keyed by `key_col` with `<prefix>_emb_*` columns."""
    renames = {c: f"{prefix}_{c}" for c in emb_cols}
    renames["name"] = key_col
    return emb_lookup.rename(columns=renames)


# Attach the embedding of the fraudulent name to each pair.
fraud_emb_df = _prefixed_embeddings("fraudulent_name", "fraud")
final_df = final_df.merge(
    fraud_emb_df,
    on="fraudulent_name",
    how="left",
    validate="many_to_one",  # fail loudly if the lookup ever has duplicate keys
)

# Attach the embedding of the real name to each pair.
real_emb_df = _prefixed_embeddings("real_name", "real")
final_df = final_df.merge(
    real_emb_df,
    on="real_name",
    how="left",
    validate="many_to_one",
)
print(final_df.head())

  fraudulent_name     real_name  label  fraud_emb_0  fraud_emb_1  fraud_emb_2  \
0             îto           ito    1.0    -0.026772    -0.004786    -0.020721   
1      sigmaphoto     iansphoto    0.0     0.007331    -0.017264     0.005130   
2    jagatrev1ewg   jagatreview    1.0     0.003873     0.005813     0.013747   
3        tempsite   sempreinter    0.0     0.015121     0.015807     0.019678   
4     worlďsnoker  worldsnooker    1.0    -0.022249    -0.052664     0.011284   

   fraud_emb_3  fraud_emb_4  fraud_emb_5  fraud_emb_6  ...  real_emb_758  \
0     0.009235    -0.022265     0.004612    -0.033148  ...     -0.044480   
1    -0.021434     0.005155    -0.000049    -0.046164  ...     -0.041599   
2     0.040721    -0.068396     0.054043     0.018278  ...     -0.004613   
3    -0.022666    -0.016875    -0.000962     0.009534  ...     -0.016473   
4     0.018898     0.014253     0.013480     0.033856  ...     -0.008150   

   real_emb_759  real_emb_760  real_emb_761  real_emb_76

In [82]:
# Per column: True where that column holds at least one missing value.
print(final_df.isnull().any(axis=0))

# Write the merged table to parquet without storing the row index.
final_df.to_parquet('test_pairs_with_siglip_embeddings.parquet', index=False)

fraudulent_name    False
real_name          False
label              False
fraud_emb_0        False
fraud_emb_1        False
                   ...  
real_emb_763       False
real_emb_764       False
real_emb_765       False
real_emb_766       False
real_emb_767       False
Length: 1539, dtype: bool


In [83]:
# Pairs with a missing embedding value on either side (fraud_emb_* or
# real_emb_*), not only on the fraudulent-name side.
missing_emb = final_df.filter(regex=r"^(?:fraud|real)_emb_").isna().any(axis=1)
nan_rows = final_df[missing_emb]
print(nan_rows[["fraudulent_name", "real_name"]])


Empty DataFrame
Columns: [fraudulent_name, real_name]
Index: []
