## CLUSTERING APPROACH




In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import os
import numpy as np
from recordlinkage.preprocessing import phonetic
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

Mounted at /content/drive


In [2]:
# Load test data
account_test = pd.read_csv('/content/drive/MyDrive/EPFL/LAUZHACK/data/account_booking_test.csv')
account_test.drop_duplicates(subset='transaction_reference_id', inplace=True)
external_test = pd.read_csv('/content/drive/MyDrive/EPFL/LAUZHACK/data/external_parties_test.csv')
df_test = pd.merge(account_test, external_test)



  from tqdm.autonotebook import tqdm, trange


In [None]:

# Preprocess test data
df_test["phonetic_parsed_name"] = phonetic(df_test["parsed_name"], "soundex")
df_test["phonetic_parsed_address_street_name"] = phonetic(df_test["parsed_address_street_name"], "soundex")
df_test["phonetic_parsed_address_city"] = phonetic(df_test["parsed_address_city"], "soundex")
df_test["initials"] = (df_test["parsed_name"].str[0] + df_test["parsed_name"].str[-1]).fillna("")

# Initialize SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for 1.4M test entries
print("Generating embeddings...")
test_embeddings = model.encode(df_test['parsed_name'].astype(str) + ' ' +
                                df_test['parsed_address_street_name'].astype(str) + ' ' +
                                df_test['parsed_address_city'].astype(str),
                                batch_size=64, show_progress_bar=True)
np.save('/content/drive/MyDrive/EPFL/LAUZHACK/data/test_embeddings.npy', test_embeddings)


Generating embeddings...


Batches:   0%|          | 0/23152 [00:00<?, ?it/s]

In [2]:
# Load pre-saved embeddings
print("Loading embeddings...")
test_embeddings = np.load('/content/drive/MyDrive/EPFL/LAUZHACK/data/test_embeddings.npy')

# Ensure float64 for KMeans compatibility
test_embeddings = np.array(test_embeddings, dtype='float64')
np.save('/content/drive/MyDrive/EPFL/LAUZHACK/data/test_embeddings_float64.npy', test_embeddings)

Loading embeddings...


In [23]:
account_test = pd.read_csv('/content/drive/MyDrive/EPFL/LAUZHACK/data/account_booking_test.csv')
account_test.drop_duplicates(subset='transaction_reference_id', inplace=True)
external_test = pd.read_csv('/content/drive/MyDrive/EPFL/LAUZHACK/data/external_parties_test.csv')
df_test = pd.merge(account_test, external_test)
df_test["phonetic_parsed_name"] = phonetic(df_test["parsed_name"], "soundex")
df_test["phonetic_parsed_address_street_name"] = phonetic(df_test["parsed_address_street_name"], "soundex")
df_test["phonetic_parsed_address_city"] = phonetic(df_test["parsed_address_city"], "soundex")
df_test["initials"] = (df_test["parsed_name"].str[0] + df_test["parsed_name"].str[-1]).fillna("")
# Define parameters
embedding_file = '/content/drive/MyDrive/EPFL/LAUZHACK/data/test_embeddings.npy'
batch_size = 10000  # Number of embeddings to process at a time
num_clusters = 1400
cluster_dir = '/content/drive/MyDrive/EPFL/LAUZHACK/data/clusters_test'


# Initialize MiniBatchKMeans for incremental clustering
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=0, batch_size=batch_size)

# Load and process embeddings in chunks
print("Loading embeddings and performing clustering in chunks...")
embedding_shape = np.load(embedding_file, mmap_mode='r').shape  # Get shape without loading all into memory
num_samples = embedding_shape[0]

# Load the embeddings using memmap for efficient memory usage
embeddings = np.load(embedding_file, mmap_mode='r')

# Incrementally fit the KMeans model
for i in range(0, num_samples, batch_size):
    batch_end = min(i + batch_size, num_samples)
    embedding_batch = embeddings[i:batch_end]
    kmeans.partial_fit(embedding_batch)  # Incremental training
    print(f"Processed batch {i // batch_size + 1}/{(num_samples + batch_size - 1) // batch_size}")

# Predict clusters for the full dataset
print("Predicting clusters...")
cluster_labels = []
for i in range(0, num_samples, batch_size):
    batch_end = min(i + batch_size, num_samples)
    embedding_batch = embeddings[i:batch_end]
    cluster_labels.extend(kmeans.predict(embedding_batch))  # Predict in chunks

# Add cluster assignments to the original DataFrame
df_test['cluster'] = cluster_labels

# Save MiniBatchKMeans model for reuse
joblib.dump(kmeans, '/content/drive/MyDrive/EPFL/LAUZHACK/data/kmeans_test_model.pkl')

# Save clusters into separate CSVs
print("Saving clusters...")
os.makedirs(cluster_dir, exist_ok=True)

for cluster_id in df_test['cluster'].unique():
    cluster_df = df_test[df_test['cluster'] == cluster_id]
    if not cluster_df.empty:  # Avoid saving empty clusters
        cluster_df.to_csv(os.path.join(cluster_dir, f'cluster_{cluster_id}.csv'), index=False)

print("Clustering completed and saved.")


Loading embeddings and performing clustering in chunks...
Processed batch 1/149
Processed batch 2/149
Processed batch 3/149
Processed batch 4/149
Processed batch 5/149
Processed batch 6/149
Processed batch 7/149
Processed batch 8/149
Processed batch 9/149
Processed batch 10/149
Processed batch 11/149
Processed batch 12/149
Processed batch 13/149
Processed batch 14/149
Processed batch 15/149
Processed batch 16/149
Processed batch 17/149
Processed batch 18/149
Processed batch 19/149
Processed batch 20/149
Processed batch 21/149
Processed batch 22/149
Processed batch 23/149
Processed batch 24/149
Processed batch 25/149
Processed batch 26/149
Processed batch 27/149
Processed batch 28/149
Processed batch 29/149
Processed batch 30/149
Processed batch 31/149
Processed batch 32/149
Processed batch 33/149
Processed batch 34/149
Processed batch 35/149
Processed batch 36/149
Processed batch 37/149
Processed batch 38/149
Processed batch 39/149
Processed batch 40/149
Processed batch 41/149
Processe

In [7]:
# Load test data
account_test = pd.read_csv('/content/drive/MyDrive/EPFL/LAUZHACK/data/account_booking_test.csv')
account_test.drop_duplicates(subset='transaction_reference_id', inplace=True)
external_test = pd.read_csv('/content/drive/MyDrive/EPFL/LAUZHACK/data/external_parties_test.csv')
df_test = pd.merge(account_test, external_test)
df_test["phonetic_parsed_name"] = phonetic(df_test["parsed_name"], "soundex")
df_test["phonetic_parsed_address_street_name"] = phonetic(df_test["parsed_address_street_name"], "soundex")
df_test["phonetic_parsed_address_city"] = phonetic(df_test["parsed_address_city"], "soundex")
df_test["initials"] = (df_test["parsed_name"].str[0] + df_test["parsed_name"].str[-1]).fillna("")


In [13]:
kmeans = joblib.load('/content/drive/MyDrive/EPFL/LAUZHACK/data/kmeans_test_model.pkl')
cluster_dir = '/content/drive/MyDrive/EPFL/LAUZHACK/data/clusters_test'
result_file = '/content/drive/MyDrive/EPFL/LAUZHACK/results/matches.csv'


df_test[['parsed_name', 'parsed_address_street_name', 'parsed_address_city']] = df_test[
    ['parsed_name', 'parsed_address_street_name', 'parsed_address_city']
].fillna('')
model = SentenceTransformer('all-MiniLM-L6-v2')
matches = []

# Process each test entry
for index, row in df_test.iterrows():
    # Generate embedding for the current test entry
    test_entry = row['parsed_name'] + ' ' + row['parsed_address_street_name'] + ' ' + row['parsed_address_city']
    test_embedding = model.encode([test_entry])  # Generate single-entry embedding

    # Convert embedding to float64 for KMeans compatibility
    test_embedding = test_embedding.astype('float32')

    # Predict the closest cluster
    closest_cluster_id = kmeans.predict(test_embedding)[0]

    # Load the corresponding cluster DataFrame
    cluster_path = os.path.join(cluster_dir, f'cluster_{closest_cluster_id}.csv')
    if not os.path.exists(cluster_path):
        print(f"Cluster file not found: {cluster_path}")
        continue

    cluster_df = pd.read_csv(cluster_path)

    # Add a fallback blocking key if 'initials' is not available
    if 'initials' not in cluster_df.columns:
        cluster_df['blocking_key'] = cluster_df['parsed_name'].str[0].fillna('')
        indexer = recordlinkage.Index()
        indexer.block('blocking_key')
    else:
        indexer = recordlinkage.Index()
        indexer.block('initials')

    candidate_links = indexer.index(cluster_df)

    compare = recordlinkage.Compare()
    compare.string('parsed_name', 'parsed_name', label='parsed_name')
    compare.exact('phonetic_parsed_name', 'phonetic_parsed_name', label="phonetic_parsed_name")
    compare.string('parsed_address_street_name', 'parsed_address_street_name', method='jarowinkler', label="parsed_address_street_name")
    compare.string('phonetic_parsed_address_street_name', 'phonetic_parsed_address_street_name', method='jarowinkler', label="phonetic_parsed_address_street_name")
    compare.string('parsed_address_state', 'parsed_address_state', method='jarowinkler', label="parsed_address_state")
    compare.string('parsed_address_city', 'parsed_address_city', method='jarowinkler', label="parsed_address_city")
    compare.string('parsed_address_country', 'parsed_address_country', method='jarowinkler', label="parsed_address_country")
    compare.string("party_iban", "party_iban", label="party_iban")

    features = compare.compute(candidate_links, cluster_df)

    # Filter matches based on a threshold
    threshold = 0.80
    filtered_matches = features[(features['parsed_name'] >= threshold) |
                                 (features['phonetic_parsed_name'] == 1) |
                                 (features['parsed_address_street_name'] >= threshold) |
                                 (features['phonetic_parsed_address_street_name'] >= threshold) |
                                 (features['parsed_address_state'] >= threshold) |
                                 (features['parsed_address_city'] >= threshold) |
                                 (features['parsed_address_country'] >= threshold) |
                                 (features['party_iban'] == 1)]

    # Store the matches
    matches.extend(filtered_matches.index.tolist())

# Format the matches correctly for the submission
formatted_matches = []
for match in matches:
    row1 = cluster_df.loc[match[0]]
    row2 = cluster_df.loc[match[1]]
    formatted_matches.append({
        'transaction_reference_id_1': row1['transaction_reference_id'],
        'transaction_reference_id_2': row2['transaction_reference_id'],
        'dataset_1': row1['dataset'] if 'dataset' in row1 else 'test',
        'dataset_2': row2['dataset'] if 'dataset' in row2 else 'test'
    })

# Save matches to a CSV file
matches_df = pd.DataFrame(formatted_matches)
matches_df.to_csv(result_file, index=False)

print(f"Matching completed. Results saved to {result_file}.")



KeyError: 1177