# Create Cluster-Category Mapping and Working Hour Mapping

In [1]:
Cluster_Info = {
    -1: {
        "description": "No matching category found - the item is outside known clusters",
        "working_hours": None
    },
    0: {
        "description": "Package Jackets by Size and Gender",
        "working_hours": None
    },
    1: {
        "description": "Print Bi-lingual Instructions on Polybags",
        "working_hours": None
    },
    2: {
        "description": "Insert Bi-lingual Instruction Cards",
        "working_hours": None
    },
    3: {
        "description": "Special Fragile Packaging Required",
        "working_hours": None
    },
    4: {
        "description": "Print Individual Name",
        "working_hours": None
    },
    5: {
        "description": "Use Canadian Ingredient Labels",
        "working_hours": None
    },
    6: {
        "description": "Apply Canadian Ingredient QR Code Labels",
        "working_hours": None
    },
    7: {
        "description": "Inspect and Ship Lids Separately",
        "working_hours": None
    },
    8: {
        "description": "Package and Ship Straws Separately",
        "working_hours": None
    },
    9: {
        "description": "Package Rings by City and Quantity",
        "working_hours": None
    },
    10: {
        "description": "Pack with Tissue Paper (Specified Color)",
        "working_hours": None
    },
    11: {
        "description": "Machine Pack Specific Items",
        "working_hours": None
    },
    12: {
        "description": "Reship and Correct Split Shipments",
        "working_hours": None
    },
    13: {
        "description": "Insert Customer-Supplied Cards",
        "working_hours": None
    },
    14: {
        "description": "Add Cards to Power Banks",
        "working_hours": None
    },
    15: {
        "description": "Insert Items into Pouches",
        "working_hours": None
    },
    16: {
        "description": "Assort Products by Color",
        "working_hours": None
    },
    17: {
        "description": "Insert FCA Cards per Notebook",
        "working_hours": None
    },
    18: {
        "description": "Handle Returns and Accessories Separately",
        "working_hours": None
    },
    19: {
        "description": "Mark Special Shipping Notes on Boxes",
        "working_hours": None
    },
    20: {
        "description": "Pack Specific Quantity per Skid",
        "working_hours": None
    },
    21: {
        "description": "Apply Barcode Stickers to Items or Bags",
        "working_hours": None
    },
    22: {
        "description": "Inspect and Manage Imprints",
        "working_hours": None
    },
    23: {
        "description": "Pack Different Logos Separately",
        "working_hours": None
    },
    24: {
        "description": "Place SKU Labels on Gift Boxes",
        "working_hours": None
    },
    25: {
        "description": "Label Cartons with Specific Destination",
        "working_hours": None
    },
    26: {
        "description": "Manage Delayed Drop Shipments",
        "working_hours": None
    },
    27: {
        "description": "Apply Customer-Supplied Stickers on Polybags",
        "working_hours": None
    },
    28: {
        "description": "Inspect Items for Quality Issues",
        "working_hours": None
    },
    29: {
        "description": "Apply UPC Labels to Bottle Bottoms",
        "working_hours": None
    },
    30: {
        "description": "Attach NFL Hologram Stickers",
        "working_hours": None
    },
    31: {
        "description": "Apply Hologram Stickers to Polybags",
        "working_hours": None
    },
    32: {
        "description": "Bubble Wrap Items for Protection",
        "working_hours": None
    },
    33: {
        "description": "Label Bags with Specific Codes",
        "working_hours": None
    },
    34: {
        "description": "Apply Amazon SKU Labels",
        "working_hours": None
    },
    35: {
        "description": "Label Boxes with Pricing or Customer Info",
        "working_hours": None
    },
}


# Load Model and Start Prediction

In [2]:
import joblib
from sentence_transformers import SentenceTransformer
from hdbscan import approximate_predict
import numpy as np

reducer = joblib.load('my_umap_reducer_fianl.pkl')
clusterer = joblib.load('my_hdbscan_clusterer_final.pkl')
model = SentenceTransformer('all-MiniLM-L6-v2')  


# Predict

In [4]:
import re


def clean_data(text):
    # apply number masking to reduce sensitivity to specific quantity values
    text = re.sub(r'\b\d+(\.\d+)?\b', '<NUM>', text)#\b means word boundary. \d+ means Match 1 or more digits
    # remove non-informative puncutuation
    text = re.sub(r'[-:*"\'.,]', '', text)
    # remove non-informative words
    text = re.sub(r'\b(please|kindly)\b', '', text) 
    # remove repeated spaces
    text = re.sub(r'\s+', ' ', text) # \s+ = one or more whitespace character
    # remove "special package"
    text = re.sub(r'\b(special package|special packaging)\b', '', text)
    # remove space at the beginning and end of the string
    text = text.strip()
    # remove "special package"
    text = re.sub(r'\b(special package|special packaging)\b', '', text)
    
    return text



def predict_cluster(new_sentence):
    new_sentence = clean_data(new_sentence)
    print(f"Cleaned Sentence: {new_sentence}")
    embedding = model.encode([new_sentence])
    reduced = reducer.transform(embedding)
    label, strength = approximate_predict(clusterer, reduced)
    return int(label[0]), float(strength[0])

label, confidence = predict_cluster("requested 27 per box")
info = Cluster_Info.get(label, {})
print(f"{info.get('description', 'None')}\n"
      f"Working Hours: {info.get('working_hours', 'None')}\n"
      f"Cluster: {label}\n"
      f"Confidence: {confidence:.2f}"
      )

Cleaned Sentence: requested <NUM> per box
None
Working Hours: None
Cluster: 36
Confidence: 0.85


# Downside
## 5/15 (issue fixed)
### The model is currently very sensitive to quantity values, which affects its classification accuracy.

For example:

- 150 per box requested 

    Box Items in Specified Quantities
    Working Hours: 8
    Cluster: 73
    Confidence: 0.77
- 50 per box requested

    No matching category found - the item is outside known clusters
    Working Hours: None
    Cluster: -1
    Confidence: 0.00

**Proposed Solution:**

To address this issue, we can apply number masking during preprocessing to reduce sensitivity to specific quantity values. This may improve model generalization across similar tasks with varying quantities.
