In [1]:
import ast
import os
import re
import time 
import warnings
from collections import defaultdict

import google.generativeai as genai
import numpy as np
import openpyxl
import pandas as pd
import pycountry
import spacy
from docx import Document
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim
from sklearn.cluster import DBSCAN

warnings.filterwarnings("ignore", category=FutureWarning)

In [10]:
# --- Input workbook ---------------------------------------------------------
# Alternative datasets, kept for reference. They still use the older
# `input_path` / `sheet_name` names and must be adapted to
# `raw_file_path` / `site_sheet_name` before being re-enabled.
#
# YEU
# input_path = "content/YEU_CUST_MSTR.xlsx"
# sheet_name = "Sheet"
#
# demo
# input_path = "content/customer_data.xlsx"
# sheet_name = "Sheet1"

# MITER (active dataset)
raw_file_path = "content/MILGARD DATA 26-07-25 DEV2 (1).xlsx"
party_sheet_name = "Party"
site_sheet_name = "Site and Site Use"

# --- Column discovery -------------------------------------------------------
# Topic handed to the (currently disabled) LLM column-selection prompt.
module = "absolute address"
# Zero-based row of the sheet that holds the column headers.
header_row_num = 0

# --- Clustering parameters --------------------------------------------------
# Number of refinement passes over the clusters.
iterations = 10
# Mean similarity to the cluster medoid (in percent) below which a cluster is
# re-clustered during refinement.
avg_threshold = 98.0
# DBSCAN neighbourhood radius; the notebook runs DBSCAN with metric='cosine'.
eps = 0.1

# --- Output -----------------------------------------------------------------
output_xlsx = "content/AddressClusterReport.xlsx"

In [11]:
# Load the site sheet; pandas uses the first row as the header by default.
df = pd.read_excel(raw_file_path, sheet_name=site_sheet_name)

# Raw header labels exactly as they appear in the sheet. Only the rows up to
# and including the header row are parsed, so the workbook is not fully read
# a second time just to obtain one row.
all_columns = (
    pd.read_excel(
        raw_file_path,
        sheet_name=site_sheet_name,
        header=None,
        nrows=header_row_num + 1,
    )
    .iloc[header_row_num]
    .tolist()
)

# ---------------------------------------------------------------------------
# Disabled: LLM-assisted selection of the address columns. `address_cols` is
# hard-coded below instead.
#
# SECURITY: an API key used to be pasted literally on the `genai.configure`
# line. It has been removed here; treat that key as compromised and revoke it.
# If this block is re-enabled, read the key from the environment.
# ---------------------------------------------------------------------------
# genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# model = genai.GenerativeModel("gemini-1.5-flash")

# prompt = f"""
# You are a data expert. Your task is to identify ONLY DISTINGUISHING columns related to module: {module} from a list of column names.

# Given the following list of column names:
# {all_columns}

# Only return a valid Python list of strings. Don't include IDs and Attributes and Timezone. Do not include any explanation or extra text.

# Example output:
# ["module related Column A", "module related Column B", "module related Column C", ...]

# Do not include any explanation or extra text.
# """

# max_retries = 5
# address_cols = []

# for attempt in range(max_retries):
#     response = model.generate_content(prompt)
#     text = response.text.strip()

#     try:
#         address_cols = ast.literal_eval(text)
#         if isinstance(address_cols, list) and all(isinstance(col, str) for col in address_cols):
#             break
#     except (SyntaxError, ValueError) as e:
#         print(f"Attempt {attempt+1} failed:", e)
#         print("Response was:", text)
#         time.sleep(1)

# address_cols

# Columns that make up an address; these are concatenated into the text that
# gets embedded and clustered.
address_cols = [
 'Address Line 1',
 'Address Line 2',
 'Mail Stop',
 'City',
 'State/Province',
 'Postal Code',
 'County',
 'Country'
 ]

In [None]:
# Columns carried through to the final report, in output order.
report_cols = [
    'Party ID', 'Site ID', 'Site Name', 'Site Purpose',
    'Address Line 1', 'Address Line 2', 'Mail Stop', 'City', 'State/Province',
    'Postal Code', 'County', 'Country'
]


def f(v):
    """Join the distinct, non-blank values of ``v`` into a sorted ', '-separated string."""
    distinct = {str(x) for x in v if pd.notna(x) and str(x).strip()}
    return ', '.join(sorted(distinct))


def _first_value(col):
    """Return the first value of a group's column, whatever it is (NaN included)."""
    return col.iloc[0]


# Collapse the sheet to one row per Site ID: every Site Purpose seen for the
# site is merged into a single string, all other columns keep the value from
# the site's first row.
df = df[report_cols]
site_aggregations = {
    name: (f if name == "Site Purpose" else _first_value)
    for name in report_cols
    if name != "Site ID"
}
df = df.groupby("Site ID", as_index=False).agg(site_aggregations)

In [13]:
# Sentence encoder used to embed the address strings.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Load the spaCy pipeline, downloading it only when it is not installed yet.
# spacy.load raises OSError for a missing model. spacy.cli.download installs
# into the interpreter running this kernel (a bare `python` shell command may
# resolve to a different one) and fails loudly instead of returning an
# ignored exit status.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def normalize_text(s):
    """Return ``s`` as lowercase text with punctuation removed and whitespace collapsed.

    The value is stringified first, so non-string inputs are accepted. Every
    character that is neither a word character nor whitespace becomes a space,
    runs of whitespace shrink to a single space, and the ends are trimmed.
    """
    without_punctuation = re.sub(r"[^\w\s]", " ", str(s).lower())
    collapsed = re.sub(r"\s+", " ", without_punctuation)
    return collapsed.strip()

def build_weighted_text(row):
    """Build the normalized, weighted address string for one record.

    The values of the columns listed in ``address_cols`` are concatenated in
    that order. "Address Line 1" and "Postal Code" are repeated three times so
    they carry more weight in the embedded text. For postal codes containing a
    hyphen only the part before the first hyphen is kept.

    Missing cells (NaN / None / NaT) contribute nothing. Without this guard
    ``str()`` would turn them into the literal tokens "nan" / "none", which
    would then be embedded as if they were part of the address.

    Args:
        row: mapping-like record supporting ``.get(column)``; a pandas row
            Series or a plain dict both work.

    Returns:
        The combined text after ``normalize_text``.
    """
    parts = []
    for col in address_cols:
        raw = row.get(col, '')
        val = '' if pd.isna(raw) else str(raw).strip()

        if col == "Postal Code" and "-" in val:
            val = val.split("-")[0].strip()

        if col in ("Address Line 1", "Postal Code"):
            # Repetition is the weighting mechanism; blanks collapse away
            # during normalization.
            parts.append((val + " ") * 3)
        else:
            parts.append(val)

    return normalize_text(" ".join(parts))

# One weighted, normalized address string per row, in the same positional
# order as `df`.
full_rows = df[address_cols].apply(build_weighted_text, axis=1).tolist()

# Embed the address strings as normalized vectors and keep a NumPy copy for
# scikit-learn.
embs = sbert.encode(
    full_rows,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
matrix = embs.cpu().numpy()

# First clustering pass over all rows using cosine distance.
cl = DBSCAN(eps, min_samples=2, metric='cosine').fit(matrix)

# Map each cluster label to the positional row indices assigned to it;
# label -1 (DBSCAN noise) is left out.
clusters = defaultdict(list)
for idx, label in enumerate(cl.labels_):
    if label == -1:
        continue
    clusters[label].append(idx)

In [14]:
# Keep only clusters in which at least one Party ID occurs more than once.
# Clusters whose members all belong to different parties are dropped.
final_clusters = {}

for label, indices in clusters.items():
    if len(indices) < 2:
        continue

    party_ids = df.iloc[indices]['Party ID'].tolist()

    # Every Party ID unique -> no repeated party in this cluster.
    if len(set(party_ids)) == len(party_ids):
        continue

    final_clusters[label] = indices

In [None]:
# Working copy of the cluster mapping that the refinement passes start from.
current = dict(final_clusters)


def find_medoid(indices):
    """Return the member of ``indices`` that is most similar to the whole group.

    Pairwise cosine similarities are computed between the embeddings (taken
    from the module-level ``embs``) of all members. The member whose row of
    similarities has the largest sum, its self-similarity included, is
    returned as a row index, not as a position within ``indices``.
    """
    member_embs = embs[indices]
    pairwise = util.cos_sim(member_embs, member_embs)
    row_totals = pairwise.sum(dim=1).cpu().numpy()
    best_position = int(np.argmax(row_totals))
    return indices[best_position]

# Iterative refinement: in each pass, a cluster whose members are on average
# less than `avg_threshold` percent similar to its medoid is re-clustered with
# DBSCAN; the others are carried over unchanged (with a string key).
for it in range(iterations):
    next_level = {}
    for cid, indices in current.items():
        if len(indices) < 2:
            continue

        med = find_medoid(indices)
        emb_m = embs[med]
        # Similarity (in percent) of every non-medoid member to the medoid.
        sims = [float(cos_sim(emb_m, embs[i])) * 100 for i in indices if i != med]
        avg_sim = sum(sims) / len(sims)

        if not (avg_sim < avg_threshold):
            next_level[str(cid)] = indices
            continue

        # Re-cluster only this cluster's embeddings. Sub-cluster keys get the
        # form "<parent>.<sub-label>"; points labelled -1 (noise) are dropped.
        X = embs[indices].cpu().numpy()
        labels = DBSCAN(eps, min_samples=2, metric='cosine').fit_predict(X)
        for sub in set(labels):
            if sub == -1:
                continue
            members = [indices[pos] for pos, lab in enumerate(labels) if lab == sub]
            next_level[f"{cid}.{sub}"] = members
    current = next_level

final_clusters = current

In [16]:
# Split every cluster by Party ID: each group of two or more rows that share
# a Party ID becomes its own cluster, numbered consecutively from 1.
split_clusters = {}
cluster_counter = 1

# Read the column once; looking up a single cell through `df.iloc[idx]` would
# build a full row Series for every index.
party_id_column = df['Party ID']

for indices in final_clusters.values():
    party_id_to_indices = defaultdict(list)
    for idx in indices:
        party_id_to_indices[party_id_column.iloc[idx]].append(idx)

    for group in party_id_to_indices.values():
        if len(group) > 1:
            split_clusters[cluster_counter] = group
            cluster_counter += 1

In [17]:
# Build the report: one output row per member of every retained cluster,
# tagged with a consecutive `classification` number.
cluster_records = []

cluster_id = 1
for _, indices in sorted(split_clusters.items(), key=lambda x: x[0]):
    if len(indices) < 2:
        continue

    party_ids = df.iloc[indices]['Party ID'].tolist()
    all_party_ids_distinct = len(set(party_ids)) == len(party_ids)

    # Mean similarity (in percent, rounded to 2 decimals) of the non-medoid
    # members to the cluster medoid.
    med = find_medoid(indices)
    emb_m = embs[med]
    sims = [float(cos_sim(emb_m, embs[i])) * 100 for i in indices if i != med]
    avg_sim = round(sum(sims) / len(sims), 2) if sims else 0

    # Clusters whose members are all (rounded) 100% similar to the medoid are
    # not reported, nor are clusters without a repeated Party ID.
    if avg_sim == 100.0 or all_party_ids_distinct:
        continue

    # The per-row similarity was never written to the report, so the rows are
    # iterated on their own rather than zipped with a similarity list that is
    # ordered differently from `sorted(indices)`.
    for idx in sorted(indices):
        row = df.iloc[idx][report_cols].to_dict()
        row["classification"] = cluster_id
        cluster_records.append(row)

    cluster_id += 1

# Passing the columns explicitly keeps the frame well-formed (including
# "Party ID") when no cluster qualified; otherwise the merge below would fail
# with a KeyError on an entirely column-less frame.
cluster_output_df = pd.DataFrame(
    cluster_records,
    columns=report_cols + ["classification"],
)

# Attach the party name from the Party sheet.
df_party = pd.read_excel(raw_file_path, party_sheet_name, usecols=["Party ID", "Party Name"])

merged_df = cluster_output_df.merge(df_party, on="Party ID", how="left")

merged_df.to_excel(output_xlsx, index=False)
output_xlsx

'content/AddressClusterReport.xlsx'