In [None]:
%pip install pandas rapidfuzz python-docx recordlinkage google-generativeai sentence-transformers scikit-learn numpy pycountry spacy openpyxl

In [2]:
import pandas as pd
import re
from rapidfuzz import fuzz
from docx import Document
from collections import defaultdict
import google.generativeai as genai
import ast
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
import numpy as np
import pycountry
import spacy
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import openpyxl
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import Workbook
from sentence_transformers.util import cos_sim
import numpy as np

### Input parameters

In [3]:
# input_path = "content/YEU_CUST_MSTR.xlsx"
# module = "absolute address"
# sheet_name = "Sheet"
# header_row_num = 0
input_path = "content/Site and Site Uses.xlsx"
module = "absolute address"
sheet_name = "Sheet1"
header_row_num = 0

### Identify Module related columns

In [4]:
genai.configure(api_key="AIzaSyA5NWnp7qoQMoTQZMi8cWlS0Ei8-_SDvLs")
model = genai.GenerativeModel("gemini-1.5-flash")

df = pd.read_excel(input_path, sheet_name)
all_columns = pd.read_excel(input_path, sheet_name, header=None).iloc[header_row_num].tolist()

prompt = f"""
You are a data expert. Your task is to identify ONLY DISTINGUIHING columns related to module: {module} from a list of column names.

Given the following list of column names:
{all_columns}

Only return a valid Python list of strings. Dont include IDs and Attributes and Timezone. Do not include any explanation or extra text.

Example output:
["module related Column A", "module related Column B", "module related Column C", ...]

Do not include any explanation or extra text.
"""

response = model.generate_content(prompt)

try:
    text = response.text.strip()
    address_cols = ast.literal_eval(text)
except (SyntaxError, ValueError) as e:
    print(text)
    print("\n\nError parsing response to list:", e)
    address_cols = []

address_cols

['Address Line 1',
 'Address Line 2',
 'Mail Stop',
 'City',
 'State/Province',
 'Postal Code',
 'County',
 'Country']

### Vectorization and Clustering

In [5]:
os.system("python -m spacy download en_core_web_sm")
sbert = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("en_core_web_sm")

def normalize_text(s):
    s = str(s).lower()
    s = re.sub(r"[^\w\s]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

full_rows = (
    df[address_cols]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .map(normalize_text)
    .tolist()
)

embs = sbert.encode(full_rows, convert_to_tensor=True, normalize_embeddings=True)
matrix = embs.cpu().numpy()

# cl = DBSCAN(eps=0.05, min_samples=2, metric='cosine').fit(matrix)
cl = DBSCAN(eps=0.15, min_samples=2, metric='cosine').fit(matrix)
cl = DBSCAN(eps=0.15, min_samples=2, metric='cosine').fit(matrix)
clusters = defaultdict(list)
for idx, label in enumerate(cl.labels_):
    if label != -1:
        clusters[label].append(idx)

### Filter valid redundancy

In [6]:
attribute_cols = [
    'Site Use Attribute 1', 'Site Use Attribute 2', 'Site Use Attribute 3',
    'Site Use Attribute 6', 'Site Use Attribute 7', 'Site Use Attribute 9',
    'Site Use Attribute 10', 'Site Use Attribute 12'
]
 
final_clusters = {}
 
for label, indices in clusters.items():
    if len(indices) < 2:
        continue
 
    cluster_df = df.iloc[indices]
    site_ids = cluster_df['Site ID'].tolist()
    site_use_ids = cluster_df['Site Use ID'].tolist()
    # --- Elimination Condition 1 ---
    # Eliminate if all Site IDs are the same BUT all Site Use IDs are different.
    if len(set(site_ids)) == 1 and len(set(site_use_ids)) == len(site_use_ids):
        continue

    # --- Elimination Condition 2 ---
    # Eliminate if addresses are 100% similar AND all Site IDs are distinct 
    # AND the combined attributes are all distinct.
    # Check for 100% address similarity (i.e., all normalized addresses are identical)
    address_strings = [full_rows[i] for i in indices]
    if len(set(address_strings)) == 1:
        are_site_ids_distinct = len(set(site_ids)) == len(site_ids)
        attribute_values = cluster_df[attribute_cols].fillna('').astype(str).agg(' '.join, axis=1).tolist()
        are_attributes_distinct = len(set(attribute_values)) == len(attribute_values)
 
        if are_site_ids_distinct and are_attributes_distinct:
            continue
 
    final_clusters[label] = indices

### Excel Repot

#### With mediod Similarity

In [20]:
wb = Workbook()
ws = wb.active
ws.title = "Cluster Report"

bold_font = Font(bold=True)
row_pointer = 1

def find_medoid(indices):
    sub_embs = embs[indices]
    sim_matrix = cos_sim(sub_embs, sub_embs)
    avg_similarities = sim_matrix.mean(dim=1)
    medoid_local_idx = int(avg_similarities.argmax()) 
    return indices[medoid_local_idx]

report_cols = [
    # Unique Identifiers
    'Site ID', 'Site Use ID', 'Site Name', 'Site Purpose',
    # Address related columns
    'Address Line 1', 'Address Line 2', 'Mail Stop', 'City', 'State/Province',
    'Postal Code', 'County', 'Country', 
    # Business use case related columns (Attributes)
    'Site Use Attribute 1', 'Site Use Attribute 2', 'Site Use Attribute 3',
    'Site Use Attribute 6', 'Site Use Attribute 7', 'Site Use Attribute 9',
    'Site Use Attribute 10', 'Site Use Attribute 12'
]

for new_cluster_id, (_, indices) in enumerate(sorted(final_clusters.items()), start=1):
    if len(indices) < 2:
        continue

    ws.cell(row=row_pointer, column=1, value=f"Cluster {new_cluster_id}:").font = bold_font
    row_pointer += 1

    ws.cell(row=row_pointer, column=1, value="Similarity %").font = bold_font
    for col_index, col_name in enumerate(report_cols, start=2):
        ws.cell(row=row_pointer, column=col_index, value=col_name).font = bold_font
    row_pointer += 1

    medoid_idx = find_medoid(indices)
    medoid_emb = embs[medoid_idx]

    for idx in sorted(indices):
        similarity = float(cos_sim(medoid_emb, embs[idx])) * 100
        similarity = round(similarity, 2)
        ws.cell(row=row_pointer, column=1, value=f"{similarity}%")
        for col_index, col_name in enumerate(report_cols, start=2):
            ws.cell(row=row_pointer, column=col_index, value=df.iloc[idx][col_name])
        row_pointer += 1

    row_pointer += 1

output_xlsx = "content/Cluster_Report.xlsx"
wb.save(output_xlsx)
print(f"\nExcel report saved to: {output_xlsx}")


Excel report saved to: content/Cluster_Report.xlsx


#### With Mean Similarity

In [None]:
wb = Workbook()
ws = wb.active
ws.title = "Cluster Report"

bold_font = Font(bold=True)
row_pointer = 1

def compute_mean_similarity_to_others(indices):
    sub_embs = embs[indices]
    sim_matrix = cos_sim(sub_embs, sub_embs).cpu().numpy()
    mean_sims = []
    for i in range(len(indices)):
        others = [sim_matrix[i][j] for j in range(len(indices)) if j != i]
        mean_sim = round(np.mean(others) * 100, 2)  
        mean_sims.append(mean_sim)
    return dict(zip(indices, mean_sims))

report_cols = [
    # Unique Identifiers
    'Site ID', 'Site Use ID', 'Site Name', 'Site Purpose',
    # Address related columns
    'Address Line 1', 'Address Line 2', 'Mail Stop', 'City', 'State/Province',
    'Postal Code', 'County', 'Country', 
    # Business use case related columns (Attributes)
    'Site Use Attribute 1', 'Site Use Attribute 2', 'Site Use Attribute 3',
    'Site Use Attribute 6', 'Site Use Attribute 7', 'Site Use Attribute 9',
    'Site Use Attribute 10', 'Site Use Attribute 12'
]

for new_cluster_id, (_, indices) in enumerate(sorted(final_clusters.items()), start=1):
    if len(indices) < 2:
        continue

    ws.cell(row=row_pointer, column=1, value=f"Cluster {new_cluster_id}:").font = bold_font
    row_pointer += 1

    ws.cell(row=row_pointer, column=1, value="Avg Similarity %").font = bold_font
    for col_index, col_name in enumerate(report_cols, start=2):
        ws.cell(row=row_pointer, column=col_index, value=col_name).font = bold_font
    row_pointer += 1

    row_similarities = compute_mean_similarity_to_others(indices)

    for idx in sorted(indices):
        sim_value = row_similarities[idx]
        ws.cell(row=row_pointer, column=1, value=f"{sim_value}%")
        for col_index, col_name in enumerate(report_cols, start=2):
            value = df.iloc[idx].get(col_name, '')
            ws.cell(row=row_pointer, column=col_index, value=value)
        row_pointer += 1

    row_pointer += 1  

output_xlsx = "content/Cluster_Report.xlsx"
wb.save(output_xlsx)
print(f"\nExcel report saved to: {output_xlsx}")


Excel report saved to: content/Cluster_Report.xlsx


In [11]:
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist, squareform
from openpyxl import Workbook
from openpyxl.styles import Font
from sentence_transformers.util import cos_sim

# --- Step 1: Create refined sub-clusters ---
sub_clusters = {}
distance_threshold = 0.05  # ~95% similarity

for cluster_id, indices in final_clusters.items():
    if len(indices) < 2:
        continue

    sub_embs = embs[indices].cpu().numpy()
    dist_matrix = squareform(pdist(sub_embs, metric="cosine"))

    model = AgglomerativeClustering(
        n_clusters=None,
        linkage="average",
        distance_threshold=distance_threshold
    )
    sub_labels = model.fit_predict(dist_matrix)

    for sub_label in set(sub_labels):
        sub_indices = [indices[i] for i in range(len(indices)) if sub_labels[i] == sub_label]
        if len(sub_indices) >= 2:
            sub_clusters[f"{cluster_id}.{sub_label + 1}"] = sub_indices

# --- Step 2: Function to compute average similarity ---
def compute_mean_similarity_to_others(indices):
    sub_embs = embs[indices]
    sim_matrix = cos_sim(sub_embs, sub_embs).cpu().numpy()
    mean_sims = []
    for i in range(len(indices)):
        others = [sim_matrix[i][j] for j in range(len(indices)) if j != i]
        mean_sim = round(np.mean(others) * 100, 2)
        mean_sims.append(mean_sim)
    return dict(zip(indices, mean_sims))

report_cols = [
    'Site ID', 'Site Use ID', 'Site Name', 'Site Purpose',
    'Address Line 1', 'Address Line 2', 'Mail Stop', 'City', 'State/Province',
    'Postal Code', 'County', 'Country',
    'Site Use Attribute 1', 'Site Use Attribute 2', 'Site Use Attribute 3',
    'Site Use Attribute 6', 'Site Use Attribute 7', 'Site Use Attribute 9',
    'Site Use Attribute 10', 'Site Use Attribute 12'
]

# --- Step 3: Create Excel Workbook with both sheets ---
wb = Workbook()
ws1 = wb.active
ws1.title = "Refined Cluster Report"
bold_font = Font(bold=True)
row_pointer = 1

cluster_list = []

for cluster_label, indices in sorted(sub_clusters.items()):
    row_similarities = compute_mean_similarity_to_others(indices)

    # --- Write to Sheet 1 ---
    ws1.cell(row=row_pointer, column=1, value=f"Cluster {cluster_label}:").font = bold_font
    row_pointer += 1

    ws1.cell(row=row_pointer, column=1, value="Avg Similarity %").font = bold_font
    for col_index, col_name in enumerate(report_cols, start=2):
        ws1.cell(row=row_pointer, column=col_index, value=col_name).font = bold_font
    row_pointer += 1

    for idx in sorted(indices):
        sim_value = row_similarities[idx]
        ws1.cell(row=row_pointer, column=1, value=f"{sim_value}%")
        for col_index, col_name in enumerate(report_cols, start=2):
            value = df.iloc[idx].get(col_name, '')
            ws1.cell(row=row_pointer, column=col_index, value=value)
        row_pointer += 1

        # For Sheet 2 (cluster list)
        row_entry = {
            "Original Index": idx,
            "Cluster ID": cluster_label,
            "Avg Similarity %": sim_value
        }
        for col in report_cols:
            row_entry[col] = df.iloc[idx].get(col, '')
        cluster_list.append(row_entry)

    row_pointer += 1  # Blank row between clusters

# --- Step 4: Add Final Cluster List Sheet ---
ws2 = wb.create_sheet(title="Final Cluster List")

# Write headers
headers = ["Original Index", "Cluster ID", "Avg Similarity %"] + report_cols
for col_index, header in enumerate(headers, start=1):
    ws2.cell(row=1, column=col_index, value=header).font = bold_font

# Write data
for row_idx, row_data in enumerate(cluster_list, start=2):
    for col_index, header in enumerate(headers, start=1):
        ws2.cell(row=row_idx, column=col_index, value=row_data[header])

# --- Save Workbook ---
wb.save("Refined_Cluster_Report.xlsx")
print("Excel report saved: Refined_Cluster_Report.xlsx with 2 sheets")


  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = 

Excel report saved: Refined_Cluster_Report.xlsx with 2 sheets


### Detailed word report

In [58]:
def format_address_row(i):
    return ", ".join(
        f"{col}='{df.at[i, col]}'" for col in address_cols if str(df.at[i, col]).strip()
    )

def build_prompt(grp):
    prompt = """
        Compare the fields in the following rows and return each and every and clear differences in the format like:
        country in row 17 is \"USA\" and \"United States\" in row 18.\n And Address 1 is \"xyz\" in row 17 and \"x.y.z\" in row 19.\n And... so on
        or
        in row 17 \"USA\" is in column state along with the \"<state name>\". And... so on.
        I there are multiple rows in a group then adjust accordingly like country in row 2 is \"something\" in row 3 and 4 is \"something else\" in row 5 is empty
        Respond with only such lines and nothing else. You can list each and every the differences.
    """
    for i in grp:
        prompt += f"Row {i+1}: {format_address_row(i)}\n"
    return prompt

doc = Document()
doc.add_heading("Redundancy Report", level=1)

# for grp in filtered:
#     prompt = build_prompt(grp)
#     response = model.generate_content(prompt)
#     explanation = response.text.strip()
#     if explanation:
#         doc.add_paragraph(explanation)

prompt = build_prompt(filtered[0])
response = model.generate_content(prompt)
explanation = response.text.strip()
if explanation:
    doc.add_paragraph(explanation)

output_docx = "content/redundancy_report.docx"
doc.save(output_docx)
print(f"\nDOCX report saved to: {output_docx}")


DOCX report saved to: content/redundancy_report.docx
