# Create Cluster-Category Mapping and Working Hour Mapping

In [8]:
# Human-readable description for every cluster label that predict_cluster can
# return. Label -1 is the "no matching category" entry.
_CLUSTER_DESCRIPTIONS = {
    -1: "No matching category found - the item is outside known clusters",
    0: "Print bilingual instructions on polybags",
    1: "Insert bilingual or French instruction cards",
    2: "Assign to artist or performer (names)",
    3: "Use Canadian ingredient and QR code labels",
    4: "Pack or replace lids and straws",
    5: "Label each carton with destination or customer info",
    6: "Pack with tissue paper (by color or style)",
    7: "Use fragile packaging for sensitive items",
    8: "Follow polybagging rules (bulk, sets, or skip)",
    9: "Pack specific quantity per polybag",
    10: "Wrap items in bubble wrap before packing",
    11: "Pack <NUM> pieces per bag (generic)",
    12: "Pack <NUM> pieces by color mix in bags",
    13: "Insert or switch ring colors (for bottles/items)",
    14: "Pack assorted colors with exact counts (multi-color)",
    15: "Pack each color in separate boxes",
    16: "Fold and insert cloth/items into pouch",
    17: "Apply SKU or quantity labels to polybags",
    18: "Apply barcode labels to items or packaging",
    19: "Apply hologram stickers to packaging or item",
    20: "Keep logos separated and visible",
    21: "Label each box (gift, shipping, booth, etc.)",
    22: "Apply custom stickers, UPCs, or labels",
    23: "Insert customer-supplied cards or notes",
    24: "Check, apply, or remove imprints",
    25: "Inspect items for damage or return to stock",
    26: "Ship or handle chargers, cords, and power items",
    27: "Manage order splits and partial shipments",
    28: "Prepare for scheduled customer pickups",
    29: "Use import or factory-standard packaging",
    30: "Insert or ship gift boxes and contents",
    31: "Pack exact quantities per box",
    32: "Attach packing slips to boxes (inside or outside)",
    33: "Pack all items together in one box",
    34: "Polybag fixed quantities per set",
    35: "Bundle or pack items in fixed sets",
}

# Cluster label -> {"description", "working_hours"}. Every entry gets its own
# dict; "working_hours" is None for all clusters until estimates are filled in.
Cluster_Info = {
    cluster_id: {"description": summary, "working_hours": None}
    for cluster_id, summary in _CLUSTER_DESCRIPTIONS.items()
}

# Load Model and Start Prediction

In [9]:
import joblib
from sentence_transformers import SentenceTransformer
from hdbscan import approximate_predict
import numpy as np

# Restore the fitted artifacts from disk with joblib. Judging by the file names
# these are the UMAP reducer and the HDBSCAN clusterer; they are used later via
# reducer.transform(...) and approximate_predict(clusterer, ...).
# NOTE(review): 'fianl' in the reducer file name looks like a typo of 'final' —
# confirm against the file on disk before renaming anything.
# NOTE(security): joblib.load unpickles arbitrary objects; only load files from
# a trusted source.
reducer = joblib.load('my_umap_reducer_fianl.pkl')
clusterer = joblib.load('my_hdbscan_clusterer_final.pkl')
# Sentence-embedding model used to encode instructions before dimensionality
# reduction.
model = SentenceTransformer('all-MiniLM-L6-v2')  


# Predict

In [16]:
import re


def clean_data(text: str) -> str:
    """Normalize a free-text instruction before it is embedded.

    Steps, in order:
      1. Replace every integer or decimal number with the token ``<NUM>`` so
         the result does not depend on the specific quantity.
      2. Delete the punctuation characters ``- : * " ' . ,``.
      3. Delete the filler words "please" and "kindly".
      4. Delete the phrases "special package" / "special packaging".
      5. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: Raw instruction text.

    Returns:
        The cleaned text, with no leading, trailing or repeated whitespace.

    NOTE(review): word and phrase matching is case-sensitive ("Please" is
    kept). Confirm whether callers lower-case the text first.
    """
    # \b = word boundary, \d+ = one or more digits, optional decimal part.
    text = re.sub(r'\b\d+(\.\d+)?\b', '<NUM>', text)
    # Remove non-informative punctuation.
    text = re.sub(r'[-:*"\'.,]', '', text)
    # Remove non-informative words.
    text = re.sub(r'\b(please|kindly)\b', '', text)
    # Collapse whitespace first so the two-word phrase below still matches
    # when its words were separated by several spaces or a removed filler word.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(special package|special packaging)\b', '', text)
    # Removing the phrase can leave a double space behind, so normalize the
    # whitespace again and strip last.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()



def predict_cluster(new_sentence):
    """Assign a free-text instruction to one of the fitted clusters.

    The sentence is cleaned with ``clean_data`` (the cleaned text is printed),
    embedded with ``model``, projected with ``reducer`` and passed to
    ``approximate_predict`` together with ``clusterer``.

    Returns:
        A ``(label, strength)`` tuple: the predicted cluster label as ``int``
        and the associated membership strength as ``float``.
    """
    cleaned = clean_data(new_sentence)
    print(f"Cleaned Sentence: {cleaned}")
    # Embed the single sentence, then project it into the reduced space.
    low_dim = reducer.transform(model.encode([cleaned]))
    labels, strengths = approximate_predict(clusterer, low_dim)
    # One sentence in, so only the first entry of each array is relevant.
    return int(labels[0]), float(strengths[0])

# Demo: classify one sample instruction and print a four-line report.
label, confidence = predict_cluster("iport packaging")
# Fall back to an empty mapping when the label has no entry in Cluster_Info.
info = Cluster_Info.get(label, {})
report_lines = [
    f"{info.get('description', 'None')}",
    f"Working Hours: {info.get('working_hours', 'None')}",
    f"Cluster: {label}",
    f"Confidence: {confidence:.2f}",
]
print("\n".join(report_lines))

Cleaned Sentence: iport packaging
Attach packing slips to boxes (inside or outside)
Working Hours: None
Cluster: 32
Confidence: 0.35


In [5]:
# Count the distinct labels of the fitted clusterer, leaving out -1 if present.
distinct_labels = set(clusterer.labels_)
distinct_labels.discard(-1)
print("Number of clusters:", len(distinct_labels))


Number of clusters: 36


In [6]:
# Show the distinct labels stored on the fitted clusterer. This is a bare
# expression, so the notebook displays its value as the cell output.
set(clusterer.labels_)

{-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35}

# Downside
## 5/15 (issue fixed)
### The model is currently very sensitive to quantity values, which affects its classification accuracy.

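For example (outputs recorded before the fix; the cluster IDs and working hours come from an earlier clustering and do not match the mapping above):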

- 150 per box requested 

    Box Items in Specified Quantities
    Working Hours: 8
    Cluster: 73
    Confidence: 0.77
- 50 per box requested

    No matching category found - the item is outside known clusters
    Working Hours: None
    Cluster: -1
    Confidence: 0.00

**Proposed Solution:**

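To address this issue, we can apply number masking during preprocessing to reduce sensitivity to specific quantity values. This may improve model generalization across similar tasks with varying quantities. This is now implemented in `clean_data`, which replaces every number with the token `<NUM>` before the text is embedded.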
