In [4]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import display

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
class TaxonomyLabeler:
    """Assign the most similar taxonomy label to each input text.

    The taxonomy labels are embedded once at construction time; every call to
    ``label_errors`` embeds the incoming texts with the same model and picks,
    per text, the label with the highest cosine similarity.
    """

    def __init__(self, taxonomy_labels, model_name='all-mpnet-base-v2'):
        """
        Initialize the labeler with taxonomy labels and load the embedding model.

        Args:
            taxonomy_labels: iterable of label strings; stored as a list so it
                can be indexed by position.
            model_name: model identifier handed to ``SentenceTransformer``.

        Raises:
            ValueError: if ``taxonomy_labels`` is empty (there would be nothing
                to match against).
        """
        self.taxonomy_labels = list(taxonomy_labels)
        if not self.taxonomy_labels:
            raise ValueError("taxonomy_labels must contain at least one label")
        self.model = SentenceTransformer(model_name)
        self.label_embeddings = self.model.encode(
            self.taxonomy_labels, convert_to_tensor=True, show_progress_bar=True
        )

    def label_errors(self, error_texts, threshold=0.6):
        """
        Assign taxonomy labels to a list of texts.

        Args:
            error_texts: list of strings to label.
            threshold: minimum cosine similarity required to keep the best
                label; texts scoring below it are labeled 'Other'. Pass None
                to always keep the best-matching label.

        Returns:
            labels: list of label strings, one per input text
            scores: numpy array with the highest similarity score per text
        """
        # An empty batch has nothing to embed or compare; return empty results
        # instead of running the model and reducing over an empty matrix.
        if not isinstance(error_texts, str) and len(error_texts) == 0:
            return [], torch.empty(0).numpy()

        error_embeddings = self.model.encode(error_texts, convert_to_tensor=True, show_progress_bar=True)
        cosine_scores = util.cos_sim(error_embeddings, self.label_embeddings)
        max_scores, best_indices = torch.max(cosine_scores, dim=1)

        # Convert once to plain Python floats/ints so the per-row decision does
        # not create and compare a 0-dim tensor for every text.
        labels = [
            "Other" if threshold is not None and score < threshold else self.taxonomy_labels[idx]
            for score, idx in zip(max_scores.tolist(), best_indices.tolist())
        ]

        return labels, max_scores.cpu().numpy()

In [7]:
def label_categories_with_taxonomy(
    df,
    new_csv="Agent_Assist_Final_Labeled_Data_3.csv",
    text_column="Knowledge_Answer",
    threshold=0.25,
    taxonomy_labels=None,
):
    """Label each row of ``df`` with its closest sub-category topic.

    The text in ``text_column`` is matched against the taxonomy with
    ``TaxonomyLabeler``. Two columns are added to ``df`` **in place**:
    "SubCategory Topic" (the chosen label, or "Other" when the best score is
    below ``threshold``) and "SubCategory Similarity_Score" (the best score).

    Args:
        df: DataFrame containing ``text_column``. It is modified in place.
        new_csv: path the labeled frame is written to (without the index).
            Pass None or "" to skip writing.
        text_column: name of the column holding the text to classify.
        threshold: similarity cut-off forwarded to ``TaxonomyLabeler.label_errors``.
        taxonomy_labels: optional list of label strings; when None the
            built-in topic list below is used.

    Returns:
        The same ``df`` object, with the two new columns.

    Raises:
        KeyError: if ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise KeyError(f"Column {text_column!r} not found in DataFrame")

    # Alternative, coarser taxonomy kept for reference (not used):
    # taxonomy_labels = [
    #     'Payroll / Compensation',
    #     'Leave Management / FMLA',
    #     'Enrollment & Benefits',
    #     'Access & Technical Issues',
    #     'Retirement',
    #     'Other / Miscellaneous',
    #     'HR General / Operations',
    #     'Taxes & Withholding',
    #     'Disability & State Claims',
    #     'Verification & Documentation',
    #     'Timekeeping & Scheduling',
    #     'Job Changes & Terminations']
    if taxonomy_labels is None:
        taxonomy_labels = [
            "Employee Leave Management (California, FMLA, Salesforce Portal)",  # topic 0
            "Form Assistance & Status Check",  # topic 1
            "Maternity Leave & FMLA Eligibility Clarification",  # topic 2
            "Benefits Enrollment & COBRA Coverage",  # topic 3
            "Job Transfer & Position Change Updates",  # topic 4
            "Employment Verification Document Submission",  # topic 5
            "HR Service Center Case Resolution",  # topic 6
            "Disability Claims & Salary Continuance (EDD, MetLife, SDI)",  # topic 7
            "Benefits Plan & Severance Enrollment Issues",  # topic 8
            "Paycheck Discrepancies & Disability Coordination",  # topic 9
            "Retirement & Pension Contributions (Vanguard, Fidelity)",  # topic 10
            "Adding Spouse or Dependents (Certificates & Documentation)",  # topic 11
            "Bank Account Updates & Payment Recalls",  # topic 12
            "HR Systems & Employee Portals (MyHR, HRConnect)",  # topic 13
            "State-Specific Leave Policies (Colorado, Washington)",  # topic 14
            "Department Transfer & Payroll Verification",  # topic 15
            "CRM System & Call Logging Issues",  # topic 16
            "Retirement Procedures & Kaiser Permanente Retirement Center",  # topic 17
            "Tax Withholding & Payroll Deductions",  # topic 18
            "Overpayment & Repayment Processes",  # topic 19
            "Wage Increase & Record Verification",  # topic 20
            "Document Receipt & Processing Timeframes",  # topic 21
            "HR System Delegation & User Access",  # topic 22
            "Flexible Spending Accounts (FSA/DCSA) Issues",  # topic 23
            "Wage Loss Verification (WLV) Letters",  # topic 24
            "Paycheck Deductions & Tax Coordination",  # topic 25
            "Mercer Insurance & Benefit Deductions",  # topic 26
            "Communication & Call Connection Issues",  # topic 27
            "Delta Dental Coverage Questions",  # topic 28
            "MetLife Disability Insurance & Claim Processing",  # topic 29
            "Tuition Reimbursement & Education Benefits",  # topic 30
            "Employee Personal Information Updates (Name, Address)",  # topic 31
            "Spousal Insurance Surcharges & Billing",  # topic 32
            "Training Application (TRA) Status & Withdrawals",  # topic 33
            "Missing or Insufficient Information Provided",  # topic 34
            "Dental Procedure & Coverage Queries",  # topic 35
            "Supervisor Escalation & Callback Requests",  # topic 36
            "ADP Paystub Retrieval & Documentation",  # topic 37
            "Licensing, Certification & Registration (LCR Compliance)",  # topic 38
            "Workers' Compensation & Injury Reporting",  # topic 39
            "Family Leave (CFRA) Expansion Queries",  # topic 40
            "Bereavement Leave Policies",  # topic 41
            "Spousal Surcharge & Letter Clarification",  # topic 42
            "IT Account & Password Issues",  # topic 43
            "Bereavement Policy Clarification",  # topic 44
            "Workers' Compensation Claim Submission (Sedgwick)",  # topic 45
            "IRS Tax Withholding & Garnishment Issues",  # topic 46
            "Garnishment Processes & Debt Collection",  # topic 47
            "Vacation Conversion & Absence Management",  # topic 48
            "FMLA Leave Approval & Notifications",  # topic 49
            "HR Delegation Permissions & Submission Issues",  # topic 50
            "Grandchild Dependent Addition Issues",  # topic 51
            "Per Diem Contract & Worker Classification",  # topic 52
            "Death Reporting & Survivor Benefits",  # topic 53
            "OSHA Training & Case Escalations",  # topic 54
            "Extended Sick Leave (ESL) Policy Clarification",  # topic 55
            "Callback Verification & Contractor Security Protocols",  # topic 56
            "Unclear or Incomplete Employee Queries",  # topic 57
            "Performance Improvement Plans (PIP) & Feedback Processes",  # topic 58
        ]

    labeler = TaxonomyLabeler(taxonomy_labels)
    # Missing values become empty strings so every row yields a string to embed.
    summary_texts = df[text_column].fillna("").astype(str).tolist()
    labels, scores = labeler.label_errors(summary_texts, threshold=threshold)
    df["SubCategory Topic"] = labels
    df["SubCategory Similarity_Score"] = scores
    if new_csv:
        df.to_csv(new_csv, index=False)
    return df

In [8]:
# Load the previously labeled dataset from the parent directory.
# A forward slash is used because "..\A" contains the invalid escape sequence
# "\A" (a warning in current Python, slated to become an error) and a backslash
# separator only resolves on Windows.
df = pd.read_csv("../Agent_Assist_Final_Labeled_Data.csv")
display(df.head(5))

# Adds the "SubCategory Topic" / "SubCategory Similarity_Score" columns to `df`
# in place and writes the result; the output name carries the ".csv" extension
# to match the function's default.
new_df = label_categories_with_taxonomy(df=df, new_csv="Agent_Assist_Final_Labeled_Data_3.csv")

# Alternative dataset run (kept for reference):
# df = pd.read_csv("final_dataset_july_7.csv")
# label_categories_with_taxonomy(df=df, new_csv="final_dataset_july_7_2.csv")

  Query_Type  Feedback      Conversation_Topic  Conversation_Subtopic  \
0          -  positive  Accurate Summarization                    NaN   
1          -  positive  Accurate Summarization                    NaN   
2          -  positive  Accurate Summarization                    NaN   
3          -  positive  Accurate Summarization                    NaN   
4          -  positive  Accurate Summarization                    NaN   

                                    Knowledge_Answer Knowledge Agent_ID  \
0  status salary advance payment ment hour approv...         -  S522948   
1  status salary advance payment ment hour approv...         -  S522948   
2  status modify payment arrangement document sen...         -  S522948   
3  status modify payment arrangement document sen...         -  S522948   
4  add guardian benefit emergency basis health ca...         -  S160879   

             Timestamp Summary_Reason  \
0  2025-05-17 00:58:20              -   
1  2025-05-17 00:58:07      

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.51it/s]
Batches: 100%|██████████| 547/547 [39:20<00:00,  4.31s/it]  


In [None]:
# Settings for this cell. `new_csv` is only a parameter of
# label_categories_with_taxonomy, so it must be defined here to exist at
# notebook scope.
new_csv = "Agent_Assist_Final_Labeled_Data_3.csv"
# Score column written by label_categories_with_taxonomy.
# NOTE(review): the original cell read "Similarity_Score"; confirm that no such
# column from the input CSV was the intended target.
score_column = "SubCategory Similarity_Score"
threshold = 0.25

df.to_csv(new_csv, index=False)
print("Saved to CSV")

scores = df[score_column]

# Count rows below / above the threshold
num_total = len(df)
num_below_threshold = int((scores < threshold).sum())
num_above_threshold = num_total - num_below_threshold

# Guard against an empty frame so the percentages never divide by zero.
if num_total:
    percent_below = (num_below_threshold / num_total) * 100
    percent_above = 100 - percent_below
else:
    percent_below = percent_above = 0.0

# Print counts and percentages
print("\n=== Similarity Score Summary ===")
print(f"Total records: {num_total}")
print(f"Records below threshold ({threshold}): {num_below_threshold} ({percent_below:.2f}%)")
print(f"Records above threshold: {num_above_threshold} ({percent_above:.2f}%)")

# Create a simple ASCII bar chart
print("\n[ASCII Bar Chart]")
bar_length = 50

# Calculate proportional bar lengths (both bars are empty when there is no data)
below_bar = int((num_below_threshold / num_total) * bar_length) if num_total else 0
above_bar = bar_length - below_bar if num_total else 0

print(f"Below Threshold  : {'#' * below_bar}{' ' * above_bar} ({percent_below:.2f}%)")
print(f"Above Threshold  : {'#' * above_bar}{' ' * below_bar} ({percent_above:.2f}%)")

# Plot histogram of the scores with the threshold marked
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(scores, bins=20, color="steelblue", edgecolor="black")
ax.axvline(threshold, color="red", linestyle="--", linewidth=1.5, label=f"Threshold = {threshold}")

# Add titles and labels
ax.set_title("Distribution of Similarity Scores")
ax.set_xlabel("Similarity Score")
ax.set_ylabel("Number of Records")
ax.legend()

# Show the plot
fig.tight_layout()
plt.show()