In [14]:
import pandas as pd
import re
import json
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import xlsxwriter
import numpy as np

# Ignore every FutureWarning raised from here on; the filter is process-wide.
warnings.filterwarnings("ignore", category=FutureWarning)

In [15]:
# --- Tunable parameters -----------------------------------------------------
# Similarity cut-off in percent; values are compared with a strict ">".
address_similarity_threshold = 75

# --- Workbook locations -----------------------------------------------------
classification_path = "content/YEU_CustomerClusterReport.xlsx"
site_path = "content/YEU_Preprocessed.xlsx"
site_sheet_name = "Sheet1"
output_path = "content/YEU_FinalReport.xlsx"
address_cluster_path = "content/YEU_AddressClusterReport.xlsx"

# --- Column groups ----------------------------------------------------------
# Columns that together describe a site's postal address.
address_cols = ['Address', 'City', 'Postal Code', 'State', 'Country']

# Columns taken from the site workbook.
report_cols = ['Name', 'Purpose', 'Customer ID', *address_cols]

# Columns kept in the final report (a leading "type" column is added later).
columns_to_keep = ['classification', 'Customer Name', 'Customer ID', 'Name', *address_cols]

# --- Load inputs ------------------------------------------------------------
classification_df = pd.read_excel(classification_path)
df_sites = pd.read_excel(site_path, sheet_name=site_sheet_name)

In [21]:
# Restrict the site table to the report columns, attach every site to its
# customer's classification row (customers without a match on either side are
# dropped by the inner join), and replace missing values with empty strings.
df_sites = df_sites.loc[:, report_cols]
merged_df = classification_df.merge(df_sites, how='inner', on='Customer ID')
merged_df = merged_df.fillna("")

def normalize_text(s):
    """Return a lower-cased, punctuation-free, single-spaced version of ``s``.

    ``s`` is converted with ``str()`` first, so non-string values are accepted.
    Every character that is neither a word character nor whitespace becomes a
    space, runs of whitespace collapse to one space, and the ends are trimmed.
    """
    lowered = str(s).lower()
    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

def build_weighted_text(row):
    """Build the normalised address string for one row.

    Each column listed in ``address_cols`` is read with ``row.get`` (missing
    columns count as empty), converted to ``str`` and stripped. A postal code
    keeps only the text before its first "-". The "Address" and "Postal Code"
    values are repeated three times so they carry more weight than the other
    columns. The pieces are joined with spaces and passed through
    ``normalize_text``.
    """
    emphasised_cols = ("Address", "Postal Code")

    def _piece(col):
        text = str(row.get(col, '')).strip()
        if col == "Postal Code":
            # partition() returns the whole text unchanged when there is no "-".
            text = text.partition("-")[0].strip()
        if col in emphasised_cols:
            return (text + " ") * 3
        return text

    return normalize_text(" ".join(_piece(col) for col in address_cols))

# Compute the weighted address text for every row, then remove exact duplicate
# rows and renumber the index from 0 so positions and labels coincide.
weighted_addresses = merged_df.apply(build_weighted_text, axis=1)
merged_df["full_address"] = weighted_addresses
merged_df = merged_df.drop_duplicates().reset_index(drop=True)

In [None]:
# Encode every weighted address string and compute all pairwise cosine
# similarities between the resulting vectors.
sbert = SentenceTransformer('all-MiniLM-L6-v2')
embs = np.array(
    sbert.encode(
        merged_df["full_address"].tolist(),
        convert_to_tensor=False,
        normalize_embeddings=True
    )
)

sim_matrix = cosine_similarity(embs)

# IDs and site names are compared and serialised as strings.
customer_ids = merged_df["Customer ID"].astype(str).values
site_names = merged_df["Name"].astype(str).values
n_rows = len(merged_df)

# For each row, one JSON list describing its similarity to every row that
# belongs to a different customer.
similarity_column = []
for i in range(n_rows):
    other_party_rows = np.where(customer_ids != customer_ids[i])[0]
    sims = [
        {
            "To Party": customer_ids[j],
            "To Site": site_names[j],
            "Similarity": f"{sim_matrix[i, j] * 100:.2f}%"
        }
        for j in other_party_rows
    ]
    similarity_column.append(json.dumps(sims, ensure_ascii=False))

merged_df["Similarity to Other Parties"] = similarity_column
# Keep a parsed (list-of-dicts) copy next to the JSON text.
merged_df["Similarity Parsed"] = merged_df["Similarity to Other Parties"].apply(json.loads)
merged_df.to_excel("content/YEU_merged_Cluster.xlsx", index=False)


In [23]:
# Step 1: within each classification keep only the rows whose address is more
# than `address_similarity_threshold` percent similar to a site of another
# customer in the same classification. A classification survives only if at
# least two rows qualify.
final_clusters = []

for class_id, group in merged_df.groupby("classification"):
    # "To Party" values are strings, so the membership set must hold strings as
    # well; with the raw (e.g. integer) IDs the lookup below would never match.
    pids_in_class = set(group["Customer ID"].astype(str))

    qualified_rows = [
        row
        for _, row in group.iterrows()
        if any(
            float(entry["Similarity"].replace('%', '')) > address_similarity_threshold
            and str(entry["To Party"]) in pids_in_class
            for entry in row["Similarity Parsed"]
        )
    ]

    if len(qualified_rows) >= 2:
        final_clusters.append(pd.DataFrame(qualified_rows))

# Step 2: stack the surviving clusters and renumber classifications 1..k in
# ascending order of their original ids.
if final_clusters:
    result_df = pd.concat(final_clusters).sort_values(by=["classification", "Customer ID"])
    old_to_new_class = {
        old: new
        for new, old in enumerate(sorted(result_df["classification"].unique()), start=1)
    }
    result_df["classification"] = result_df["classification"].map(old_to_new_class)
else:
    result_df = pd.DataFrame(columns=merged_df.columns)

# Step 3: label each cluster. It is "Customer Duplication" only when every row
# has at least one similarity entry and all of its entries are exactly 100%.
# NOTE(review): the entries cover every other party in the data set, not just
# the members of this cluster, so the stricter label is rarely reachable.
# Confirm whether the check should be limited to in-cluster parties.
type_labels_map = {}

for class_id, group in result_df.groupby("classification"):
    is_pure_customer_duplication = True

    for entries in group["Similarity Parsed"]:
        sims_floats = [float(entry["Similarity"].replace('%', '')) for entry in entries]
        if not sims_floats or any(sim != 100.0 for sim in sims_floats):
            is_pure_customer_duplication = False
            break

    label = "Customer Duplication" if is_pure_customer_duplication else "Customer + Address Duplication"
    type_labels_map[class_id] = label

result_df["type"] = result_df["classification"].map(type_labels_map)
# Selecting the report columns also discards the helper columns
# (similarity JSON, parsed similarities, full_address).
result_df = result_df[["type"] + columns_to_keep]

# Step 4: append the address-only duplicates from the address cluster report.
# NOTE(review): classification ids were renumbered from 1 above and are not
# reconciled with the ids in the address report, so the same id may appear
# under different types; group by "type" as well when clustering final_df.
addr_dup_df = pd.read_excel(address_cluster_path)
addr_dup_df["type"] = "Address Duplication"
final_df = pd.concat([result_df, addr_dup_df], ignore_index=True)

# Infinite and missing values become empty strings in the report.
final_df = final_df.replace([np.inf, -np.inf], np.nan).fillna("")
final_df = final_df[["type"] + columns_to_keep]

In [None]:
# Write the whole report to one formatted sheet named 'Filtered'.
# NOTE(review): another cell in this notebook also opens `output_path` with
# ExcelWriter in its default write mode, which starts a new workbook; if it runs
# after this cell, the 'Filtered' sheet produced here is replaced.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    final_df.to_excel(writer, index=False, sheet_name='Filtered')

    workbook = writer.book
    worksheet = writer.sheets['Filtered']

    header_format = workbook.add_format({
        'bold': True,
        'bg_color': '#D3D3D3',
        'border': 1
    })
    border_format = workbook.add_format({'border': 1})
    highlight_format = workbook.add_format({'bg_color': '#FFFACD', 'border': 1})

    for col_num, column_name in enumerate(final_df.columns):
        worksheet.write(0, col_num, column_name, header_format)

    max_row, max_col = final_df.shape
    # The filter range spans the header row and every data row below it.
    worksheet.autofilter(0, 0, max_row, max_col - 1)

    # Columns whose values are compared inside each cluster.
    address_col_indices = [
        final_df.columns.get_loc(col)
        for col in address_cols + ["Customer ID", "Customer Name"]
        if col in final_df.columns
    ]

    # Positional copy: index label == sheet row - 1, whatever final_df's index is.
    sheet_df = final_df.reset_index(drop=True)

    # Group by type as well as classification so clusters from different
    # duplicate types that share a classification id are not mixed together.
    for _, group in sheet_df.groupby(['type', 'classification'], sort=False):
        # A compared column is highlighted for the whole cluster as soon as any
        # of its values differs from the cluster's most frequent value.
        highlighted_cols = set()
        for col_idx in address_col_indices:
            col_values = group.iloc[:, col_idx]
            modes = col_values.mode()
            if not modes.empty and any(cell_val != modes.iloc[0] for cell_val in col_values):
                highlighted_cols.add(col_idx)

        # Every data cell is written exactly once, highlighted or bordered.
        for row_idx in group.index:
            excel_row = row_idx + 1  # row 0 holds the header
            for col_idx in range(max_col):
                cell_format = highlight_format if col_idx in highlighted_cols else border_format
                worksheet.write(excel_row, col_idx, sheet_df.iloc[row_idx, col_idx], cell_format)

    # Size each column to its longest rendered value (or its header) plus padding.
    for idx, col in enumerate(final_df.columns):
        col_data = final_df[col].astype(str)
        max_len = max([len(str(col))] + col_data.map(len).tolist())
        worksheet.set_column(idx, idx, max_len + 2)

output_path

In [None]:
# Write one formatted sheet per duplicate type.
# NOTE(review): an earlier cell writes a 'Filtered' sheet to the same
# `output_path`; ExcelWriter's default write mode starts a new workbook, so that
# sheet is not part of the file produced here.
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    workbook = writer.book

    # Formats belong to the workbook, so create them once for all sheets.
    header_format = workbook.add_format({
        'bold': True,
        'bg_color': '#D3D3D3',
        'border': 1
    })
    border_format = workbook.add_format({'border': 1})
    address_highlight_format = workbook.add_format({'bg_color': '#FFFACD', 'border': 1})  # light yellow
    customer_highlight_format = workbook.add_format({'bg_color': '#ADD8E6', 'border': 1})  # light blue

    for dup_type, type_group in final_df.groupby("type"):
        sheet_name = dup_type[:31]  # Excel limits sheet names to 31 characters
        # Renumber rows so index label == sheet row - 1 for this sheet.
        df_to_write = type_group.drop(columns=["type"]).reset_index(drop=True)

        df_to_write.to_excel(writer, index=False, sheet_name=sheet_name)
        worksheet = writer.sheets[sheet_name]

        for col_num, column_name in enumerate(df_to_write.columns):
            worksheet.write(0, col_num, column_name, header_format)

        n_rows, max_col = df_to_write.shape
        # The filter range spans the header row and every data row below it.
        worksheet.autofilter(0, 0, n_rows, max_col - 1)

        address_col_indices = [df_to_write.columns.get_loc(col) for col in address_cols if col in df_to_write.columns]
        customer_col_indices = [df_to_write.columns.get_loc(col) for col in ["Customer ID", "Customer Name"] if col in df_to_write.columns]

        # Which columns are checked against the cluster mode on this sheet, and
        # the format used for cells that deviate from it.
        highlight_by_col = {}
        if dup_type in ["Address Duplication", "Customer + Address Duplication"]:
            for col_idx in address_col_indices:
                highlight_by_col[col_idx] = address_highlight_format
        if dup_type in ["Customer Duplication", "Customer + Address Duplication"]:
            for col_idx in customer_col_indices:
                highlight_by_col[col_idx] = customer_highlight_format

        for class_id, group in df_to_write.groupby('classification'):
            # Most frequent value per checked column within this cluster.
            mode_by_col = {}
            for col_idx in highlight_by_col:
                modes = group.iloc[:, col_idx].mode()
                if not modes.empty:
                    mode_by_col[col_idx] = modes.iloc[0]

            # Write every cell exactly once: deviating cells get the highlight,
            # all others the plain border. This includes address/customer
            # columns on sheets where they are not checked, which previously
            # received no border at all.
            for pos in group.index:
                excel_row = pos + 1  # row 0 holds the header
                for col_idx in range(max_col):
                    cell_val = df_to_write.iloc[pos, col_idx]
                    if col_idx in mode_by_col and cell_val != mode_by_col[col_idx]:
                        cell_format = highlight_by_col[col_idx]
                    else:
                        cell_format = border_format
                    worksheet.write(excel_row, col_idx, cell_val, cell_format)

        # Size each column to its longest rendered value (or its header) plus padding.
        for idx, col in enumerate(df_to_write.columns):
            col_data = df_to_write[col].astype(str)
            max_len = max([len(str(col))] + col_data.map(len).tolist())
            worksheet.set_column(idx, idx, max_len + 2)
