In [3]:
import ast
import os
import re
import time 
import warnings
from collections import defaultdict

import google.generativeai as genai
import numpy as np
import openpyxl
import pandas as pd
import pycountry
import spacy
from docx import Document
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim
from sklearn.cluster import DBSCAN

warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
# --- Run configuration -------------------------------------------------------
# Workbook and sheet that are loaded with pd.read_excel.
input_path = "content/YEU_CUST_MSTR.xlsx"
sheet_name = "Unique Customer IDs"

# `module` is interpolated into the LLM prompt to describe which columns to pick.
module = "Customer Name"
# Row (0-based, in a header-less read of the sheet) whose cells are used as the
# list of column names sent to the LLM.
header_row_num = 0

# Number of passes of the medoid-based cluster refinement loop.
iterations = 10
# A cluster is re-clustered when the average cosine similarity (scaled by 100)
# of its members to the medoid is below this value.
avg_threshold = 85.0
# Neighbourhood radius handed to DBSCAN (used with metric='cosine').
eps=0.125


In [None]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key stored in the file is visible to every reader of the notebook
# and should be considered compromised (rotate it).
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")

# `df` is the working table; `all_columns` is the raw header row taken from a
# header-less read so the LLM sees the column names exactly as typed in the sheet.
df = pd.read_excel(input_path, sheet_name)
all_columns = pd.read_excel(input_path, sheet_name, header=None).iloc[header_row_num].tolist()

prompt = f"""
You are a data expert. Your task is to identify ONLY DISTINGUISHING columns related to module: {module} from a list of column names.

Given the following list of column names:
{all_columns}

Focus on columns that help identify and distinguish party names, such as:
- Party Name variations
- Company names
- Legal entity names
- Business names
- Organization names

Only return a valid Python list of strings. Don't include IDs, generic attributes, and timezone columns. Do not include any explanation or extra text.

Example output:
["Party Name", "Legal Name", "Business Name", ...]

Do not include any explanation or extra text.
"""

max_retries = 5
party_cols = []

for attempt in range(max_retries):
    response = model.generate_content(prompt)
    text = ""

    try:
        # Reading `.text` happens inside the try so that a ValueError raised
        # while extracting the reply counts as a failed attempt.
        text = response.text.strip()
        # Models often wrap the answer in a Markdown code fence; drop it so the
        # remaining text can be parsed as a Python literal.
        text = re.sub(r"^```[\w-]*\s*|\s*```$", "", text).strip()
        candidate = ast.literal_eval(text)
    except (SyntaxError, ValueError) as e:
        print(f"Attempt {attempt+1} failed:", e)
        print("Response was:", text)
    else:
        # Only a validated list of strings is ever stored in `party_cols`; any
        # other literal (dict, tuple, number, ...) is treated as a failure.
        if isinstance(candidate, list) and all(isinstance(col, str) for col in candidate):
            party_cols = candidate
            break
        print(f"Attempt {attempt+1} failed: expected a list of strings")
        print("Response was:", text)
    time.sleep(1)
else:
    print(f"No valid column list after {max_retries} attempts; party_cols left empty.")

party_cols


['Customer Name']

In [None]:
# Columns that are carried through to the final Excel report.
report_cols = ['Customer ID', 'Customer Name']

df = df[report_cols]


def f(v):
    """Join the distinct, non-blank values of ``v`` into a sorted ', '-separated string."""
    distinct = {str(x) for x in v if pd.notna(x) and str(x).strip()}
    return ', '.join(sorted(distinct))


def _collapse_column(col):
    """Aggregate one grouped column: merge names, keep the first value otherwise."""
    if col.name == "Customer Name":
        return f(col)
    return col.iloc[0]


# One row per Customer ID; all name variants of an ID end up in a single cell.
df = df.groupby("Customer ID", as_index=False).agg(_collapse_column)


In [None]:
# Install the spaCy pipeline only when it is missing, and do it through spaCy's
# own downloader so the package lands in the environment of the running kernel
# (a bare `python -m ...` shell call may resolve to a different interpreter and
# its failure would go unnoticed).
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

# Sentence-embedding model used to vectorise the party texts.
sbert = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): `nlp` is loaded here but no use of it is visible in this
# notebook section — confirm whether it is still needed.
nlp = spacy.load("en_core_web_sm")

def normalize_party_name(s):
    """Normalize a party name for comparison.

    Steps, in order:
      1. remove common legal-form words (inc, corp, llc, ltd, ...) as whole
         words, regardless of letter case;
      2. replace every character that is neither a word character nor
         whitespace with a space;
      3. collapse runs of whitespace and trim both ends.

    The letter case of the remaining text is left unchanged.

    Args:
        s: the raw name text (a ``str``).

    Returns:
        The cleaned name as a ``str`` (possibly empty).
    """
    # IGNORECASE matters: the word list is lowercase, while real names are
    # usually written as "Inc", "LLC", "CORP", ... and would otherwise be kept.
    s = re.sub(
        r'\b(inc|corp|corporation|llc|ltd|limited|co|company|enterprises|group|holdings)\b',
        '',
        s,
        flags=re.IGNORECASE,
    )
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_weighted_party_text(row):
    """Build the text that represents one row in the similarity comparison.

    For every column listed in the module-level ``party_cols`` the cell value
    is converted to a stripped string. The "Customer Name" value is repeated
    three times so it carries more weight in the resulting text; other columns
    appear once. The pieces are joined with spaces and passed through
    ``normalize_party_name``.

    Args:
        row: a mapping-like row (e.g. a pandas Series) supporting ``.get``.

    Returns:
        The normalized, weighted text for the row as a ``str``.
    """
    parts = []
    for col in party_cols:
        raw = row.get(col, '')
        # A missing cell (None / NaN / NaT) must contribute nothing; str() would
        # otherwise inject the literal token "nan" or "None" into the text.
        if pd.api.types.is_scalar(raw) and pd.isna(raw):
            val = ''
        else:
            val = str(raw).strip()
        if col in ["Customer Name"]:
            parts.append((val + " ") * 3)
        else:
            parts.append(val)
    return normalize_party_name(" ".join(parts))

# One weighted, normalized text per row of `df`, in row order.
full_rows = df[party_cols].apply(build_weighted_party_text, axis=1).tolist()

# Embeddings are kept twice: as a tensor (`embs`) and as a numpy array (`matrix`)
# for scikit-learn.
embs = sbert.encode(full_rows, normalize_embeddings=True, convert_to_tensor=True)
matrix = embs.cpu().numpy()

cl = DBSCAN(eps=eps, min_samples=2, metric='cosine')
cl.fit(matrix)

# Map each DBSCAN label to the row positions it contains; label -1 is skipped.
clusters = defaultdict(list)
for row_pos, cluster_label in enumerate(cl.labels_):
    if cluster_label == -1:
        continue
    clusters[cluster_label].append(row_pos)


In [18]:

final_clusters = {}

for label, indices in clusters.items():
    # Only groups with at least two rows are carried forward.
    if len(indices) >= 2:
        cluster_df = df.iloc[indices]
        party_ids = cluster_df['Customer ID'].tolist()

        # --- Elimination Condition 1 ---
        # Intended rule: drop the cluster when all party texts are identical,
        # all Party IDs are distinct and the combined attributes are distinct.
        # NOTE(review): the flag below is computed but never acted upon, so no
        # cluster is eliminated by this check at present.
        party_strings = [full_rows[i] for i in indices]
        if len(set(party_strings)) == 1:
            are_party_ids_distinct = len(set(party_ids)) == len(party_ids)

        final_clusters[label] = indices


In [None]:
# Work on a shallow copy so the refinement starts from, but does not modify,
# the `final_clusters` mapping.
current = dict(final_clusters)

def find_medoid(indices):
    """Return the entry of ``indices`` whose embedding has the largest summed
    cosine similarity to the embeddings of all entries in ``indices``.

    Embeddings are looked up in the module-level ``embs``.
    """
    members = embs[indices]
    pairwise = util.cos_sim(members, members)
    row_totals = pairwise.sum(dim=1).cpu().numpy()
    best_position = int(np.argmax(row_totals))
    return indices[best_position]

# Repeatedly inspect every cluster: keep it when its members are, on average,
# close enough to the medoid; otherwise re-run DBSCAN on its members and replace
# it with the resulting sub-clusters (keys become "<parent>.<sub>").
# NOTE(review): the re-clustering uses the same `eps` as the initial pass —
# confirm that this is expected to split a cluster further.
for it in range(iterations):
    next_level = {}
    for cid, indices in current.items():
        if len(indices) >= 2:
            med = find_medoid(indices)
            emb_m = embs[med]
            # Similarity (in percent) of every non-medoid member to the medoid.
            sims = [float(cos_sim(emb_m, embs[i])) * 100 for i in indices if i != med]
            avg_sim = sum(sims) / len(sims)

            if not (avg_sim < avg_threshold):
                next_level[str(cid)] = indices
                continue

            X = embs[indices].cpu().numpy()
            labels = DBSCAN(eps=eps, min_samples=2, metric='cosine').fit_predict(X)
            for sub in set(labels):
                if sub != -1:
                    key = f"{cid}.{sub}"
                    next_level[key] = [
                        member for member, sub_label in zip(indices, labels) if sub_label == sub
                    ]
    current = next_level

final_clusters = current


In [None]:
output_xlsx = "content/YEU_CustomerClusterReport.xlsx"

wb = Workbook()
ws = wb.active
bold = Font(bold=True)

# Row 1 holds the bold header; `rp` always points at the next row to write.
header_cols = ['classification'] + report_cols
rp = 1
for ci, col_name in enumerate(header_cols, start=1):
    header_cell = ws.cell(rp, ci, col_name)
    header_cell.font = bold
rp += 1

# Clusters are numbered by their position in key order.
items = sorted(final_clusters.items(), key=lambda item: item[0])

for seq, (_, indices) in enumerate(items, start=1):
    if len(indices) >= 2:
        for idx in indices:
            # Column 1: cluster number; following columns: the report fields.
            ws.cell(rp, 1, seq)
            record = df.iloc[idx]
            for offset, col in enumerate(report_cols):
                ws.cell(rp, offset + 2, record[col])
            rp += 1

wb.save(output_xlsx)
print(f"\nParty duplicate report saved to: {output_xlsx}")



Party duplicate report saved to: content/YEU_CustomerClusterReport.xlsx
