In [2]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import display

In [19]:
class TaxonomyLabeler:
    def __init__(self, taxonomy_labels, model_name='all-mpnet-base-v2'):
        """
        Initialize the labeler with taxonomy labels and load the embedding model.
        """
        self.taxonomy_labels = taxonomy_labels
        self.model = SentenceTransformer(model_name)
        self.label_embeddings = self.model.encode(taxonomy_labels, convert_to_tensor=True, show_progress_bar=True)

    def label_errors(self, error_texts, threshold=0.6):
        """
        Assign taxonomy labels to a list of error texts.
        If threshold is provided, assign 'Other' if similarity below threshold.
        Returns:
            labels: list of label strings
            scores: list of highest similarity scores
        """
        error_embeddings = self.model.encode(error_texts, convert_to_tensor=True, show_progress_bar=True)
        cosine_scores = util.cos_sim(error_embeddings, self.label_embeddings)
        max_scores, best_indices = torch.max(cosine_scores, dim=1)

        labels = []
        for score, idx in zip(max_scores, best_indices):
            if threshold is not None and score < threshold:
                labels.append("Other")
            else:
                labels.append(self.taxonomy_labels[idx])

        return labels, max_scores.cpu().numpy()

In [20]:
def label_errors_with_taxonomy(df, new_csv="final_labeled_errors.csv"):
    taxonomy_labels = [
        "Confusion About Leave Request Submission or Approval",
        "Unclear FMLA or Bonding Eligibility Criteria",
        "Knowledge Base Articles Missing or Not Relevant",
        "Paycheck Errors, Deductions, or Overpayment Disputes",
        "Inability to Access HR Systems or Forms",
        "Employees Unaware of Leave Types or Benefit Interactions",
        "Difficulty Providing Documentation or Verifying Identity",
        "Lack of Clear Rules for Leave Accrual and Usage",
        "Complex or Confusing Enrollment Processes",
        "Unclear Disability Insurance Procedures",
        "Delays in Processing or Approving Requests",
        "Region-Specific Policy or Escalation Confusion",
        "Inadequate or Vague Communication to Employees",
        "Generic or Unclassifiable Issues"
    ]
    labeler = TaxonomyLabeler(taxonomy_labels)
    error_texts = df["Knowledge_Answer"].fillna("").astype(str).tolist()
    labels, scores = labeler.label_errors(error_texts, threshold=0.3)
    df["Parent Error Topic"] = labels
    df["Parent Error Similarity Score"] = scores
    return df


In [21]:
send_df = pd.read_csv("final_dataset_july_7.csv")
print(send_df.head(5))
my_df = label_errors_with_taxonomy(df=send_df, new_csv="final_dataset_july_7_2.csv")
my_df.to_csv("final_dataset_july_7_3.csv", index=False)
print("final_dataset_july_7_2.csv")
# df = pd.read_csv("final_dataset_july_7.csv")
# label_errors_with_taxonomy(df=df, new_csv="final_dataset_july_7_2.csv")

  Query_Type  Feedback      Conversation_Topic  Conversation_Subtopic  \
0          -  positive  Accurate Summarization                    NaN   
1          -  positive  Accurate Summarization                    NaN   
2          -  positive  Accurate Summarization                    NaN   
3          -  positive  Accurate Summarization                    NaN   
4          -  positive  Accurate Summarization                    NaN   

                                    Knowledge_Answer Knowledge Agent_ID  \
0  status salary advance payment ment hour approv...         -  S522948   
1  status salary advance payment ment hour approv...         -  S522948   
2  status modify payment arrangement document sen...         -  S522948   
3  status modify payment arrangement document sen...         -  S522948   
4  add guardian benefit emergency basis health ca...         -  S160879   

             Timestamp Summary_Reason  \
0  2025-05-17 00:58:20              -   
1  2025-05-17 00:58:07      

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
Batches: 100%|██████████| 547/547 [49:08<00:00,  5.39s/it]  


final_dataset_july_7_2.csv


In [None]:
my_df.head()



AttributeError: 'NoneType' object has no attribute 'to_csv'

In [None]:
    # Count rows below threshold
    threshold = 0.3
    num_below_threshold = (df["Similarity_Score"] < threshold).sum()
    num_total = len(df)
    percent_below = (num_below_threshold / num_total) * 100

      # Count rows above threshold
    num_above_threshold = num_total - num_below_threshold
    percent_above = 100 - percent_below

    # Print counts and percentages
    print("\n=== Similarity Score Summary ===")
    print(f"Total records: {num_total}")
    print(f"Records below threshold ({threshold}): {num_below_threshold} ({percent_below:.2f}%)")
    print(f"Records above threshold: {num_above_threshold} ({percent_above:.2f}%)")

    # Create a simple ASCII bar chart
    print("\n[ASCII Bar Chart]")
    bar_length = 50

    # Calculate proportional bar lengths
    below_bar = int((num_below_threshold / num_total) * bar_length)
    above_bar = bar_length - below_bar

    print(f"Below Threshold  : {'#' * below_bar}{' ' * above_bar} ({percent_below:.2f}%)")
    print(f"Above Threshold  : {'#' * above_bar}{' ' * below_bar} ({percent_above:.2f}%)")

    # Plot histogram
    plt.figure(figsize=(8, 6))
    plt.hist(df["Similarity_Score"], bins=20, color="steelblue", edgecolor="black")

    # Add threshold line
    threshold = 0.3
    plt.axvline(threshold, color="red", linestyle="--", linewidth=1.5, label=f"Threshold = {threshold}")

    # Add titles and labels
    plt.title("Distribution of Similarity Scores")
    plt.xlabel("Similarity Score")
    plt.ylabel("Number of Records")
    plt.legend()

    # Show the plot
    plt.tight_layout()
    plt.show()