In [None]:
import ast
import os
import re
import time 
import warnings
from collections import defaultdict

import google.generativeai as genai
import numpy as np
import openpyxl
import pandas as pd
import pycountry
import spacy
from docx import Document
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim
from sklearn.cluster import DBSCAN

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Input selection: exactly one (input_path, sheet_name) pair is active; the
# other pairs are kept commented out as alternative datasets.

# YEU
# input_path = "content/YEU_CUST_MSTR.xlsx"
# sheet_name = "Sheet"

# MITER (currently active)
input_path = "content/Site and Site Uses.xlsx"
sheet_name = "Sheet1"

# demo
# input_path = "content/customer_data.xlsx"
# sheet_name = "Sheet1"

# Topic interpolated into the LLM prompt that picks the relevant columns.
module = "absolute address"
# Zero-based row of the sheet from which the column names are read.
header_row_num = 0

# Number of re-clustering passes run over the clusters.
iterations     = 3
# Clusters whose average similarity (in percent) of members to the medoid is
# below this value are re-clustered.
avg_threshold  = 95.0 

In [None]:
# The Gemini API key is read from the environment so that no credential is
# stored in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and replaced.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

# Load the data using the configured header row, so that the column names of
# `df` are the same ones that are shown to the model below.
df = pd.read_excel(input_path, sheet_name, header=header_row_num)
# Only the rows up to the header row are needed to obtain the raw column names.
all_columns = (
    pd.read_excel(input_path, sheet_name, header=None, nrows=header_row_num + 1)
    .iloc[header_row_num]
    .tolist()
)

prompt = f"""
You are a data expert. Your task is to identify ONLY DISTINGUIHING columns related to module: {module} from a list of column names.

Given the following list of column names:
{all_columns}

Only return a valid Python list of strings. Dont include IDs and Attributes and Timezone. Do not include any explanation or extra text.

Example output:
["module related Column A", "module related Column B", "module related Column C", ...]

Do not include any explanation or extra text.
"""


def _parse_column_list(text, known_columns):
    """Parse the model reply into a validated list of column names.

    Args:
        text: Raw reply text; may be wrapped in a markdown code fence.
        known_columns: Column names that exist in the sheet header.

    Returns:
        The column names from the reply that are present in ``known_columns``,
        in the order the model returned them.

    Raises:
        SyntaxError, ValueError: If the reply is not a Python list literal of
            strings, or contains no known column name.
    """
    # Strip an optional ```python ... ``` fence before parsing.
    cleaned = re.sub(r"^```[A-Za-z]*\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned)
    parsed = ast.literal_eval(cleaned)
    if not isinstance(parsed, list) or not all(isinstance(col, str) for col in parsed):
        raise ValueError("expected a Python list of strings")
    unknown = [col for col in parsed if col not in known_columns]
    if unknown:
        print("Ignoring columns that are not in the sheet header:", unknown)
    selected = [col for col in parsed if col in known_columns]
    if not selected:
        raise ValueError("no known column names in the response")
    return selected


max_retries = 5
address_cols = []

for attempt in range(max_retries):
    response = model.generate_content(prompt)
    text = ""
    try:
        # Accessing `.text` is kept inside the try so a ValueError raised there
        # is retried like a parse failure.
        text = response.text.strip()
        # Assigned only after validation, so a malformed reply never leaks out.
        address_cols = _parse_column_list(text, all_columns)
        break
    except (SyntaxError, ValueError) as e:
        print(f"Attempt {attempt+1} failed:", e)
        print("Response was:", text)
        time.sleep(1)
else:
    # Fail here with a clear message instead of continuing with an empty list.
    raise RuntimeError(f"No valid column list obtained after {max_retries} attempts.")

address_cols

['Address Line 1',
 'Address Line 2',
 'Mail Stop',
 'City',
 'State/Province',
 'Postal Code',
 'County',
 'Country']

In [4]:
# Columns carried into the final report, grouped by role.
report_cols = [
    # Unique identifiers
    'Site ID',
    'Site Name',
    'Site Purpose',
    # Address related columns
    'Address Line 1',
    'Address Line 2',
    'Mail Stop',
    'City',
    'State/Province',
    'Postal Code',
    'County',
    'Country',
    # Business use case related columns (Attributes)
    'Site Use Attribute 1',
    'Site Use Attribute 2',
    'Site Use Attribute 3',
    'Site Use Attribute 6',
    'Site Use Attribute 7',
    'Site Use Attribute 9',
    'Site Use Attribute 10',
]


def _join_distinct(values):
    """Return the distinct non-blank values as a sorted, comma-separated string."""
    distinct = {str(v) for v in values if pd.notna(v) and str(v).strip()}
    return ', '.join(sorted(distinct))


def _collapse_site_column(col):
    """Aggregate one column of one "Site ID" group.

    "Site Purpose" values are merged into a single string; every other column
    keeps the value of the group's first row.
    """
    if col.name == "Site Purpose":
        return _join_distinct(col)
    return col.iloc[0]


# Keep only the report columns, then collapse to one row per Site ID.
df = df[report_cols]
df = df.groupby("Site ID", as_index=False).agg(_collapse_site_column)

In [5]:
# Sentence-embedding model used to vectorise the address text.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Load the spaCy English pipeline. The download is attempted only when the
# model is not installed (spacy.load raises OSError in that case), so the
# cell no longer triggers a download on every run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def normalize_text(s):
    """Return ``s`` as lower-case text with punctuation replaced by spaces.

    The argument is converted with ``str()`` first; every character that is
    neither a word character nor whitespace becomes a space, runs of
    whitespace collapse to a single space, and the result is stripped.
    """
    lowered = str(s).lower()
    depunctuated = re.sub(r"[^\w\s]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", depunctuated)
    return collapsed.strip()

# full_rows = (
#     df[address_cols]
#     .fillna("")
#     .astype(str)
#     .agg(" ".join, axis=1)
#     .map(normalize_text)
#     .tolist()
# )

def build_weighted_text(row):
    """Build the normalised address text for one row.

    The values of the columns listed in ``address_cols`` are concatenated in
    order. "Address Line 1" and "Postal Code" are repeated three times so that
    they carry more weight in the resulting text.

    Args:
        row: Mapping-like object with a ``get`` method (e.g. a pandas Series)
            keyed by column name.

    Returns:
        The concatenated text after ``normalize_text``.
    """
    parts = []
    for col in address_cols:
        raw = row.get(col, '')
        # Missing cells (NaN/None) must contribute nothing; str() would turn
        # them into the literal words "nan"/"None" and pollute the text.
        val = '' if pd.isna(raw) else str(raw).strip()
        if col in ["Address Line 1", "Postal Code"]:
            parts.append((val + " ") * 3)
        else:
            parts.append(val)
    return normalize_text(" ".join(parts))

# One weighted, normalised address string per row of df.
full_rows = df[address_cols].apply(build_weighted_text, axis=1).tolist()

# Encode the address strings (normalize_embeddings=True) and keep a NumPy
# copy for scikit-learn.
embs = sbert.encode(full_rows, convert_to_tensor=True, normalize_embeddings=True)
matrix = embs.cpu().numpy()

# Density clustering on cosine distance.
# eps values tried so far: 0.05, 0.1 (current), 0.15, 0.20.
cl = DBSCAN(eps=0.1, min_samples=2, metric='cosine').fit(matrix)

# Map each cluster label to the row positions assigned to it; label -1 is skipped.
clusters = defaultdict(list)
for row_pos, cluster_label in enumerate(cl.labels_):
    if cluster_label == -1:
        continue
    clusters[cluster_label].append(row_pos)

In [6]:
attribute_cols = [
    'Site Use Attribute 1', 'Site Use Attribute 2',
    'Site Use Attribute 3', 'Site Use Attribute 6',
    'Site Use Attribute 7', 'Site Use Attribute 9',
    'Site Use Attribute 10'
]

# Clusters that survive the elimination rule below, keyed by cluster label.
final_clusters = {}

for label, indices in clusters.items():
    if len(indices) < 2:
        continue

    cluster_df = df.iloc[indices]
    site_ids = cluster_df['Site ID'].tolist()

    # --- Elimination Condition 1 ---
    # Eliminate if addresses are 100% similar AND all Site IDs are distinct
    # AND the combined attributes are all distinct.
    # 100% address similarity means all normalized address strings are identical.
    address_strings = {full_rows[i] for i in indices}
    if len(address_strings) == 1:
        are_site_ids_distinct = len(set(site_ids)) == len(site_ids)
        # Compare attribute rows column-wise. Joining the values with a space
        # would make distinct rows such as ("a b", "c") and ("a", "b c") look
        # identical.
        attribute_rows = cluster_df[attribute_cols].fillna('').astype(str)
        are_attributes_distinct = not attribute_rows.duplicated().any()

        if are_site_ids_distinct and are_attributes_distinct:
            continue

    final_clusters[label] = indices

In [None]:
# Working copy of the clusters; the refinement loop replaces `current` with a
# new dict on every pass, so final_clusters itself is not modified here.
current = dict(final_clusters)

def find_medoid(indices):
    """Return the element of ``indices`` that is most central in its group.

    The pairwise cosine similarities between the embeddings ``embs[indices]``
    are summed per row; the element whose row sum is largest is returned.
    """
    member_embs = embs[indices]
    row_totals = util.cos_sim(member_embs, member_embs).sum(dim=1)
    best_pos = int(np.argmax(row_totals.cpu().numpy()))
    return indices[best_pos]

# Refinement passes: a cluster whose members are, on average, less than
# avg_threshold percent similar to its medoid is re-clustered with a tighter
# eps; the resulting sub-clusters get keys of the form "<parent>.<sub>".
for _pass in range(iterations):
    refined = {}
    for cluster_key, members in current.items():
        if len(members) < 2:
            continue

        medoid = find_medoid(members)
        medoid_emb = embs[medoid]
        member_sims = [float(cos_sim(medoid_emb, embs[m])) * 100 for m in members]
        # The medoid itself is left out of the average.
        other_sims = [sim for m, sim in zip(members, member_sims) if m != medoid]
        mean_sim = sum(other_sims) / len(other_sims)

        if mean_sim < avg_threshold:
            # Re-cluster only this cluster's embeddings.
            member_matrix = embs[members].cpu().numpy()
            sub_labels = DBSCAN(eps=0.05, min_samples=2, metric='cosine').fit_predict(member_matrix)
            for sub_label in set(sub_labels):
                if sub_label == -1:
                    # Members labelled -1 by the re-clustering are not carried over.
                    continue
                refined[f"{cluster_key}.{sub_label}"] = [
                    members[pos] for pos, lab in enumerate(sub_labels) if lab == sub_label
                ]
        else:
            refined[str(cluster_key)] = members
    current = refined

final_clusters = current

In [8]:
# Write one block per cluster to an Excel sheet, numbering clusters sequentially.
output_xlsx = "content/ClusterReport.xlsx"


def _cluster_sort_key(item):
    """Sort key that orders cluster keys such as "10" or "3.0.1" numerically.

    A plain string sort would place "10" before "2".
    """
    return tuple(int(part) for part in str(item[0]).split("."))


wb = Workbook()
ws = wb.active
bold = Font(bold=True)
rp = 1  # next worksheet row to write (1-based)

# Filter before numbering so that skipped clusters leave no gaps in the sequence.
reportable = [
    indices
    for _, indices in sorted(final_clusters.items(), key=_cluster_sort_key)
    if len(indices) >= 2
]

for seq, indices in enumerate(reportable, start=1):
    med = find_medoid(indices)
    emb_m = embs[med]
    # Similarity (in percent) of every member to the medoid, in `indices` order.
    sims_all = [float(cos_sim(emb_m, embs[i])) * 100 for i in indices]
    # The medoid itself is excluded from the average.
    sims = [s for idx, s in zip(indices, sims_all) if idx != med]
    avg_sim = round(sum(sims) / len(sims), 2)

    # Cluster title row followed by the column header row.
    ws.cell(rp, 1, f"Cluster {seq} (Avg {avg_sim}%):").font = bold
    rp += 1
    ws.cell(rp, 1, "Similarity %").font = bold
    for ci, col in enumerate(report_cols, start=2):
        ws.cell(rp, ci, col).font = bold
    rp += 1

    # Sort the (index, similarity) pairs together so that each row is always
    # written next to its own similarity value.
    for idx, sim in sorted(zip(indices, sims_all)):
        row = df.iloc[idx]  # fetched once per row instead of once per cell
        ws.cell(rp, 1, f"{round(sim, 2)}%")
        for ci, col in enumerate(report_cols, start=2):
            ws.cell(rp, ci, row[col])
        rp += 1
    rp += 1  # blank separator row between clusters

wb.save(output_xlsx)
print(f"\nExcel report saved to: {output_xlsx}")


Excel report saved to: content/ClusterReport.xlsx
