In [None]:
%pip install pandas rapidfuzz python-docx recordlinkage google-generativeai sentence-transformers scikit-learn numpy pycountry spacy openpyxl

In [None]:
import pandas as pd
import re
from rapidfuzz import fuzz
from docx import Document
from collections import defaultdict
import google.generativeai as genai
import ast
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
import numpy as np
import pycountry
import spacy
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import openpyxl
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import Workbook

### Input parameters

In [None]:
input_path = "content/Site and Site Uses.xlsx"
module = "absolute address"
sheet_name = "Sheet1"
header_row_num = 0

### Identify Module related columns

In [None]:
genai.configure(api_key="AIzaSyA5NWnp7qoQMoTQZMi8cWlS0Ei8-_SDvLs")
model = genai.GenerativeModel("gemini-1.5-flash")

df = pd.read_excel(input_path, sheet_name)
all_columns = pd.read_excel(input_path, sheet_name, header=None).iloc[header_row_num].tolist()

prompt = f"""
You are a data expert. Your task is to identify ONLY DISTINGUIHING columns related to module: {module} from a list of column names.

Given the following list of column names:
{all_columns}

Only return a valid Python list of strings. Dont include IDs and Attributes and Timezone. Do not include any explanation or extra text.

Example output:
["DISTINGUIHING module related Column A", "module related Column B", "module related Column C", ...]
"""

response = model.generate_content(prompt)

try:
    text = response.text.strip()
    address_cols = ast.literal_eval(text)
except (SyntaxError, ValueError) as e:
    print("Error parsing response to list:", e)
    address_cols = []

address_cols

['Address Line 1',
 'Address Line 2',
 'Mail Stop',
 'City',
 'State/Province',
 'Postal Code',
 'County',
 'Country']

### Vectorization and Clustering

In [1]:
os.system("python -m spacy download en_core_web_sm")
sbert = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("en_core_web_sm")

def normalize_text(s):
    s = str(s).lower()
    s = re.sub(r"[^\w\s]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

full_rows = (
    df[address_cols]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
    .map(normalize_text)
    .tolist()
)

embs = sbert.encode(full_rows, convert_to_tensor=True, normalize_embeddings=True)
matrix = embs.cpu().numpy()

cl = DBSCAN(eps=0.05, min_samples=2, metric='cosine').fit(matrix)
clusters = defaultdict(list)
for idx, label in enumerate(cl.labels_):
    if label != -1:
        clusters[label].append(idx)

def extract_canonical_gpe(text):
    doc = nlp(text)
    gpes = [ent.text.strip().lower() for ent in doc.ents if ent.label_ == "GPE"]
    canonicals = []
    for gpe in gpes:
        try:
            country = pycountry.countries.lookup(gpe)
            canonicals.append(country.name.lower())
        except:
            canonicals.append(gpe)
    if canonicals:
        return max(set(canonicals), key=canonicals.count)
    return ""

canonical_gpes = [extract_canonical_gpe(row) for row in full_rows]

NameError: name 'os' is not defined

In [None]:
attribute_cols = [
    'Site Use Attribute 1', 'Site Use Attribute 2', 'Site Use Attribute 3',
    'Site Use Attribute 6', 'Site Use Attribute 7', 'Site Use Attribute 9',
    'Site Use Attribute 10', 'Site Use Attribute 12'
]
 
final_clusters = {}
 
for label, indices in clusters.items():
    if len(indices) < 2:
        continue
 
    cluster_df = df.iloc[indices]
    site_ids = cluster_df['Site ID'].tolist()
    site_use_ids = cluster_df['Site Use ID'].tolist()
    # --- Elimination Condition 1 ---
    # Eliminate if all Site IDs are the same BUT all Site Use IDs are different.
    if len(set(site_ids)) == 1 and len(set(site_use_ids)) == len(site_use_ids):
        continue

    # --- Elimination Condition 2 ---
    # Eliminate if addresses are 100% similar AND all Site IDs are distinct 
    # AND the combined attributes are all distinct.
    # Check for 100% address similarity (i.e., all normalized addresses are identical)
    address_strings = [full_rows[i] for i in indices]
    if len(set(address_strings)) == 1:
        are_site_ids_distinct = len(set(site_ids)) == len(site_ids)
        attribute_values = cluster_df[attribute_cols].fillna('').astype(str).agg(' '.join, axis=1).tolist()
        are_attributes_distinct = len(set(attribute_values)) == len(attribute_values)
 
        if are_site_ids_distinct and are_attributes_distinct:
            continue
 
    final_clusters[label] = indices

Applying business rules...


### Excel Repot

In [None]:
wb = Workbook()
ws = wb.active
ws.title = "Cluster Report"

bold_font = Font(bold=True)
row_pointer = 1

# Sequentially number the clusters regardless of original label
for new_cluster_id, (_, indices) in enumerate(sorted(final_clusters.items()), start=1):
    if len(indices) < 2:
        continue

    ws.cell(row=row_pointer, column=1, value=f"Cluster {new_cluster_id}:").font = bold_font
    row_pointer += 1

    for col_index, col_name in enumerate(df.columns, start=1):
        ws.cell(row=row_pointer, column=col_index, value=col_name).font = bold_font
    row_pointer += 1

    for idx in sorted(indices):
        for col_index, col_name in enumerate(df.columns, start=1):
            ws.cell(row=row_pointer, column=col_index, value=df.iloc[idx][col_name])
        row_pointer += 1

    row_pointer += 1

output_xlsx = "content/Cluster_Report.xlsx"
wb.save(output_xlsx)
print(f"\nExcel report saved to: {output_xlsx}")


Excel report saved to: content/Cluster_Report.xlsx


### Detailed word report

In [None]:
def format_address_row(i):
    return ", ".join(
        f"{col}='{df.at[i, col]}'" for col in address_cols if str(df.at[i, col]).strip()
    )

def build_prompt(grp):
    prompt = """
        Compare the fields in the following rows and return each and every and clear differences in the format like:
        country in row 17 is \"USA\" and \"United States\" in row 18.\n And Address 1 is \"xyz\" in row 17 and \"x.y.z\" in row 19.\n And... so on
        or
        in row 17 \"USA\" is in column state along with the \"<state name>\". And... so on.
        I there are multiple rows in a group then adjust accordingly like country in row 2 is \"something\" in row 3 and 4 is \"something else\" in row 5 is empty
        Respond with only such lines and nothing else. You can list each and every the differences.
    """
    for i in grp:
        prompt += f"Row {i+1}: {format_address_row(i)}\n"
    return prompt

doc = Document()
doc.add_heading("Redundancy Report", level=1)

# for grp in filtered:
#     prompt = build_prompt(grp)
#     response = model.generate_content(prompt)
#     explanation = response.text.strip()
#     if explanation:
#         doc.add_paragraph(explanation)

prompt = build_prompt(filtered[0])
response = model.generate_content(prompt)
explanation = response.text.strip()
if explanation:
    doc.add_paragraph(explanation)

output_docx = "content/redundancy_report.docx"
doc.save(output_docx)
print(f"\nDOCX report saved to: {output_docx}")


DOCX report saved to: content/redundancy_report.docx
