In [24]:
import numpy as np
import pandas as pd

def load_npz_chunks(path, chunk_size=10_000):
    """Load every array of an ``.npz`` archive into one DataFrame, one row per key.

    Each archive member has its leading axis removed with ``squeeze(0)`` and
    becomes one row. The member's key, with a trailing ``".png"`` stripped if
    present, is stored in the first column, ``"id"``.

    Args:
        path: Location of the ``.npz`` archive, passed straight to ``np.load``.
        chunk_size: Number of keys converted into a DataFrame at a time.

    Returns:
        A DataFrame with an ``"id"`` column followed by one column per array
        element, rows in archive key order. An archive without members yields
        an empty frame that has only the ``"id"`` column.

    Raises:
        ValueError: If ``chunk_size`` is less than 1, or if a member's first
            axis does not have length 1 (raised by ``squeeze(0)``).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    dfs = []

    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once all rows are read. mmap_mode is not passed
    # because np.load only applies it to plain .npy files, not to archives.
    with np.load(path) as data:
        keys = data.files

        for i in range(0, len(keys), chunk_size):
            batch_keys = keys[i:i+chunk_size]

            rows = [data[k].squeeze(0) for k in batch_keys]
            ids = [k[:-4] if k.endswith(".png") else k for k in batch_keys]

            df_chunk = pd.DataFrame(rows)
            df_chunk.insert(0, "id", ids)
            dfs.append(df_chunk)

    if not dfs:
        # pd.concat rejects an empty list; return an empty table instead.
        return pd.DataFrame(columns=["id"])

    return pd.concat(dfs, ignore_index=True)

# Read the whole embedding archive into one table: an "id" column followed by
# the embedding values, one row per archive key.
df = load_npz_chunks(path="image_embeddings_train.npz")

In [25]:
import os

# Directory that holds the processed training pairs. The default is the
# original author's local checkout; set ITAU_PROCESSED_DATA_DIR to point the
# notebook at the data on any other machine without editing this cell.
PROCESSED_DATA_DIR = os.environ.get(
    "ITAU_PROCESSED_DATA_DIR",
    "/Users/a../Itau-group-2/data/processed",
)
df_full = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, "train_pairs_medium_100k.parquet"))

In [26]:
# Sanity-check the loaded pairs: row count first, then the first five rows.
print(df_full.shape[0])
print(df_full.head(n=5))

100000
  fraudulent_name   real_name  label
0         egynęsș     egynews    1.0
1         za-nlcs       ranks    1.0
2         carroup     carsoup    1.0
3      jqcpkotjoy  jackpotjoy    1.0
4    tticketsņowf  ticketsnow    1.0


In [None]:
# Row count of the raw embedding table, followed by a preview of its first rows.
print(df["id"].size)
print(df.head(n=5))
# Keep only the first row encountered for each id.
df_unique = df.drop_duplicates(subset="id")


689697
                 id         0         1         2         3         4  \
0  viralvirålvideos  0.008351 -0.021340  0.052327 -0.004821 -0.029295   
1         lŏventi5w  0.024624 -0.013869 -0.002390  0.000390 -0.007365   
2         myįstored -0.005690 -0.003292 -0.003006  0.008450 -0.010322   
3         homechoie  0.021408 -0.000490  0.011311  0.018344 -0.012699   
4            jjımmʂ -0.006141 -0.010790  0.001257 -0.025195 -0.015342   

          5         6         7         8  ...       758       759       760  \
0  0.011599  0.043667 -0.009937 -0.012610  ... -0.009243  0.413550 -0.057422   
1  0.007263  0.032505  0.040346 -0.041110  ... -0.034923  0.360712 -0.029610   
2  0.024960  0.010066  0.021797  0.002236  ...  0.021149  0.457478 -0.001737   
3 -0.000879 -0.007694  0.017150 -0.024049  ... -0.017508  0.409301  0.003718   
4  0.020401  0.019057  0.010168 -0.049876  ... -0.006622  0.378064 -0.013442   

        761       762       763       764       765       766       767  

In [15]:
# Left-join the embedding row whose id equals each pair's fraudulent_name onto
# the pair table, then drop the helper key column "id".
df_embeddings = df_full.merge(
    df_unique,
    how="left",
    left_on="fraudulent_name",
    right_on="id",
).drop(columns="id")
print(df_embeddings.shape[0])
print(df_embeddings.head(n=5))

NameError: name 'df_full' is not defined

In [32]:
# Count the rows of df_embeddings that contain a NaN in at least one column,
# and print the total row count for comparison.
nan_count = df_embeddings.isna().any(axis="columns").sum()
print(f"Number of NaN rows in df_embeddings: {nan_count}")
print(df_embeddings.shape[0])

Number of NaN rows in df_embeddings: 0
100000


In [34]:
# Give the embedding columns readable names: every column after the first three
# becomes embedding_0, embedding_1, ... in positional order. The renamed frame
# is bound to a new name, df4_embeddings; df_embeddings itself is not modified.
embedding_columns = df_embeddings.columns[3:]
new_column_names = dict(
    zip(
        embedding_columns,
        (f"embedding_{pos}" for pos in range(len(embedding_columns))),
    )
)
df4_embeddings = df_embeddings.rename(columns=new_column_names)
print(df4_embeddings.head(n=5))

  fraudulent_name   real_name  label  embedding_0  embedding_1  embedding_2  \
0         egynęsș     egynews    1.0    -0.010370    -0.027957     0.011073   
1         za-nlcs       ranks    1.0     0.017507    -0.000764     0.033904   
2         carroup     carsoup    1.0     0.004613    -0.029075     0.011657   
3      jqcpkotjoy  jackpotjoy    1.0     0.003160    -0.019422     0.003183   
4    tticketsņowf  ticketsnow    1.0     0.004346     0.011437     0.019909   

   embedding_3  embedding_4  embedding_5  embedding_6  ...  embedding_758  \
0    -0.007775    -0.024626     0.000642    -0.008451  ...       0.020267   
1    -0.014927    -0.023493     0.044845     0.011360  ...      -0.010337   
2     0.001511    -0.018553    -0.011622    -0.004981  ...      -0.003396   
3     0.011719    -0.031880     0.045235     0.032722  ...       0.012257   
4    -0.001371    -0.022954     0.025783     0.024468  ...      -0.055358   

   embedding_759  embedding_760  embedding_761  embedding_762 

In [37]:
# Write the renamed table to a parquet file, leaving the row index out.
df4_embeddings.to_parquet(
    path="emb_train_pairs_medium_100k.parquet",
    index=False,
)

In [27]:
import pandas as pd

# Restore the embeddings table from the parquet file written earlier.
df4_embeddings = pd.read_parquet(path="emb_train_pairs_medium_100k.parquet")

In [28]:
# Optional: uncomment the next line to iterate on a 1,000-row sample.
# df4_embeddings = df4_embeddings.head(1000)
print(df4_embeddings.head(n=5))

  fraudulent_name   real_name  label  embedding_0  embedding_1  embedding_2  \
0         egynęsș     egynews    1.0    -0.010370    -0.027957     0.011073   
1         za-nlcs       ranks    1.0     0.017507    -0.000764     0.033904   
2         carroup     carsoup    1.0     0.004613    -0.029075     0.011657   
3      jqcpkotjoy  jackpotjoy    1.0     0.003160    -0.019422     0.003183   
4    tticketsņowf  ticketsnow    1.0     0.004346     0.011437     0.019909   

   embedding_3  embedding_4  embedding_5  embedding_6  ...  embedding_758  \
0    -0.007775    -0.024626     0.000642    -0.008451  ...       0.020267   
1    -0.014927    -0.023493     0.044845     0.011360  ...      -0.010337   
2     0.001511    -0.018553    -0.011622    -0.004981  ...      -0.003396   
3     0.011719    -0.031880     0.045235     0.032722  ...       0.012257   
4    -0.001371    -0.022954     0.025783     0.024468  ...      -0.055358   

   embedding_759  embedding_760  embedding_761  embedding_762 

In [29]:
import numpy as np
import pandas as pd

# Work on a copy so df4_embeddings stays as loaded.
df = df4_embeddings.copy()

# Names of the fraudulent-name embedding columns and of their real-name twins
# (same suffix, "real_embedding_" prefix).
embedding_columns = [name for name in df.columns if name.startswith("embedding_")]
real_embedding_columns = [
    name.replace("embedding_", "real_embedding_") for name in embedding_columns
]

# Lookup table keyed by name: one row per distinct, non-null fraudulent_name,
# carrying that name's embedding. The key is relabelled "real_name" and the
# embedding columns "real_embedding_*" so the table can be joined on real_name.
# Note: a real_name only finds an embedding here if the same string also occurs
# as a fraudulent_name somewhere in this table.
lookup = df.loc[:, ["fraudulent_name", *embedding_columns]]
lookup = lookup.dropna(subset=["fraudulent_name"])
lookup = lookup.drop_duplicates(subset=["fraudulent_name"])
lookup = lookup.rename(
    columns={
        "fraudulent_name": "real_name",
        **dict(zip(embedding_columns, real_embedding_columns)),
    }
)

# Attach the real-name embedding to every pair; pairs without a match get NaN.
df = df.merge(lookup, how="left", on="real_name")

# Discard pairs with any missing real-embedding value and renumber the rows.
df = df.dropna(subset=real_embedding_columns).reset_index(drop=True)

print(df.shape[0])
print(df.head(n=5))

37986
  fraudulent_name   real_name  label  embedding_0  embedding_1  embedding_2  \
0         za-nlcs       ranks    1.0     0.017507    -0.000764     0.033904   
1    tticketsņowf  ticketsnow    1.0     0.004346     0.011437     0.019909   
2     code_layers  codelayers    1.0    -0.014691    -0.000852    -0.032053   
3            prom       mrdom    0.0     0.012328    -0.000572     0.033226   
4        zetawiki    catawiki    0.0     0.036106     0.005013     0.010049   

   embedding_3  embedding_4  embedding_5  embedding_6  ...  \
0    -0.014927    -0.023493     0.044845     0.011360  ...   
1    -0.001371    -0.022954     0.025783     0.024468  ...   
2     0.010591    -0.016126     0.016612     0.034613  ...   
3    -0.032742    -0.021779     0.003562     0.008447  ...   
4     0.002892    -0.032494     0.012415     0.008203  ...   

   real_embedding_758  real_embedding_759  real_embedding_760  \
0           -0.017455            0.412029           -0.028946   
1           -0.0

In [34]:
# Write the pair table with both embedding groups to parquet, without the row index.
df.to_parquet(
    path="df_with_real_embeddings.parquet",
    index=False,
)

In [37]:
# Columns whose positional index in df should be reported: the first and last
# column of each embedding group.
columns_to_find = ['embedding_0', 'embedding_767', 'real_embedding_0', 'real_embedding_767']

# Print each column's position, or a notice when the column is absent.
for name in columns_to_find:
    if name not in df.columns:
        print(f"{name} not found in DataFrame")
        continue
    print(f"Index of {name}: {df.columns.get_loc(name)}")

Index of embedding_0: 3
Index of embedding_767: 770
Index of real_embedding_0: 771
Index of real_embedding_767: 1538
