In [None]:
import polars as pl

# Location of the UNSPSC reference table, relative to the notebook's working
# directory. A relative path keeps the notebook runnable on other machines and
# matches the path handed to UNSPSCMapper further down.
file_path = "./unspsc_finder/data/unspsc_data.csv"

# One row per commodity, carrying the code/title/definition of every level
# (segment, family, class, commodity).
df = pl.read_csv(file_path)

# Show one random row as a list of dicts to illustrate the schema.
df.sample(1).to_dicts()

# [{'segment_code': 50000000,
#   'segment_title': 'Food and Beverage Products',
#   'segment_definition': 'This segment includes human food and beverages as well as condiments, colorings, flavorings and preservatives used in the preparation of food or beverages. This segment also includes plant extracts, mineral supplements and vitamins for human consumption.',
#   'family_code': 50380000,
#   'family_title': 'Fresh fruit purees',
#   'family_definition': 'Fruits which have been pureed by being ground, pressed, blended, and/or sieved to the consistency of a soft creamy paste or thick liquid.',
#   'class_code': 50384300,
#   'class_title': 'Kiwi fruit purees',
#   'class_definition': 'The type of egg-shaped kiwi fruit have a furry brownish green skin and firm, translucent green flesh with edible purple-black seeds at the centre. Also called chinese gooseberry, edible fruit of the vine actinidia chinensis (family actinidiaceae). The plant is native to china and taiwan and is now grown commercially in new zealand and california. which have been pureed.',
#   'commodity_code': 50384301,
#   'commodity_title': 'Ananasnaja kiwi fruit purees',
#   'commodity_definition': 'The variety of kiwi fruit known as ananasnaja kiwi fruit which have been pureed.'}]

In [2]:
# Normalise case: lower-case the title and definition column of every
# UNSPSC level (segment, family, class, commodity), in that order.
df = df.with_columns(
    [
        pl.col(f"{level}_{field}").str.to_lowercase()
        for level in ("segment", "family", "class", "commodity")
        for field in ("title", "definition")
    ]
)


In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re


In [9]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
from typing import Dict, List, Tuple
import logging

class UNSPSCMapper:
    """Map free-text item descriptions onto UNSPSC codes with sentence embeddings.

    Classification walks the hierarchy top-down: the best segment is chosen
    first, then the best family inside that segment, then the best commodity
    inside that family (the class level is not scored). Every step ranks the
    candidates by cosine similarity between the embedded description and the
    embedded candidate text.

    Attributes:
        model: SentenceTransformer used for every embedding.
        unspsc_df: pandas DataFrame read from the CSV, extended in place with
            ``segment_text`` and ``commodity_text`` columns.
        segment_embeddings: One embedding per distinct segment code.
        segment_codes: Segment codes, row-aligned with ``segment_embeddings``.
    """

    _MATERIAL_RE = re.compile(r'(steel|rubber|plastic|metal|aluminum)')
    # Units are listed longest-first and closed with \b so that "10 mm" is not
    # read as 10 m and words such as "2 metal" are not read as a length.
    _SIZE_RE = re.compile(r'(\d+(?:\.\d+)?)\s*(mm|in|ft|m)\b')

    def __init__(self, unspsc_data_path: str):
        """Load the embedding model and the UNSPSC table, then build the indices.

        Args:
            unspsc_data_path: Path of the UNSPSC CSV file, passed to
                ``pandas.read_csv``.
        """
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.unspsc_df = pd.read_csv(unspsc_data_path)
        self.setup_indices()

    @staticmethod
    def _join_text(title, definition):
        """Concatenate two text columns, treating missing values as empty."""
        return (title.fillna('') + ' ' + definition.fillna('')).str.strip()

    def setup_indices(self):
        """Initialize UNSPSC embeddings and indices.

        Adds the combined ``segment_text`` / ``commodity_text`` columns to
        ``unspsc_df``, embeds one text per distinct segment code and resets the
        cache of family/commodity candidate embeddings.
        """
        frame = self.unspsc_df
        frame['segment_text'] = self._join_text(
            frame['segment_title'], frame['segment_definition']
        )
        frame['commodity_text'] = self._join_text(
            frame['commodity_title'], frame['commodity_definition']
        )

        # Candidate embeddings depend on the table, so drop anything cached.
        self._candidate_cache = {}

        # Generate embeddings for the segment level
        print("Generating embeddings...")
        # Codes and texts are taken from the SAME de-duplicated rows so that
        # position i in the embeddings always belongs to segment_codes[i].
        segments = frame.dropna(subset=['segment_code']).drop_duplicates(
            subset='segment_code'
        )
        self.segment_embeddings = self.model.encode(segments['segment_text'].tolist())
        self.segment_codes = segments['segment_code'].to_numpy()

    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text.

        Lower-cases ``text``, replaces every character that is neither a word
        character nor whitespace with a space, and collapses whitespace runs.
        """
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def _attribute_text(description: str) -> str:
        """Lower-case ``description`` and blank out punctuation, keeping decimals.

        A dot is kept only when it has a digit on both sides, so "1.5in"
        survives intact while separators such as "," or ";" become spaces.
        """
        return re.sub(r'(?<!\d)\.|\.(?!\d)|[^\w\s.]', ' ', description.lower())

    def extract_attributes(self, text: str) -> Dict[str, object]:
        """Extract key product attributes from description.

        Args:
            text: Description text; the patterns are lower-case only, so the
                caller is expected to lower-case it first.

        Returns:
            A dict with up to two keys: ``'material'`` (first material keyword
            found, a str) and ``'size'`` (first measurement found, a
            ``(number, unit)`` tuple of strings). Keys are omitted when nothing
            matches.
        """
        attributes = {}

        # Extract material types
        materials = self._MATERIAL_RE.findall(text)
        if materials:
            attributes['material'] = materials[0]

        # Extract measurements
        measurements = self._SIZE_RE.findall(text)
        if measurements:
            attributes['size'] = measurements[0]

        return attributes

    def _children(self, parent_col: str, parent_code, code_col: str, text_col: str):
        """Return row-aligned ``(codes, embeddings)`` for the children of a node.

        Rows without a child code or child text are skipped and each child code
        is kept once, so ``embeddings[i]`` always describes ``codes[i]``.
        Results are cached per ``(code_col, parent_code)`` because embedding the
        candidates is the expensive part of a lookup; ``embeddings`` is None
        when there are no candidates.
        """
        cache = self.__dict__.setdefault('_candidate_cache', {})
        key = (code_col, parent_code)
        if key not in cache:
            frame = self.unspsc_df
            children = (
                frame[frame[parent_col] == parent_code]
                .dropna(subset=[code_col, text_col])
                .drop_duplicates(subset=code_col)
            )
            embeddings = (
                self.model.encode(children[text_col].tolist())
                if len(children) else None
            )
            cache[key] = (children[code_col].to_numpy(), embeddings)
        return cache[key]

    def _best_match(self, text: str, codes, embeddings) -> Tuple[int, float]:
        """Return the code whose embedding is most similar to ``text``."""
        query = self.model.encode([text])[0]
        scores = cosine_similarity([query], embeddings)[0]
        best = int(np.argmax(scores))
        # Code columns containing NaN are read as float; hand back plain ints.
        return int(codes[best]), float(scores[best])

    def find_best_segment(self, text: str) -> Tuple[int, float]:
        """Find best matching segment for item description.

        Returns:
            ``(segment_code, cosine_similarity)`` of the closest segment.
        """
        return self._best_match(text, self.segment_codes, self.segment_embeddings)

    def find_best_family(self, text: str, segment_code: int) -> Tuple[int, float]:
        """Find best matching family within segment.

        Families are compared by their title only.

        Returns:
            ``(family_code, cosine_similarity)``, or ``(None, 0.0)`` when the
            segment has no family rows.
        """
        codes, embeddings = self._children(
            'segment_code', segment_code, 'family_code', 'family_title'
        )
        if len(codes) == 0:
            return None, 0.0
        return self._best_match(text, codes, embeddings)

    def find_best_commodity(self, text: str, family_code: int) -> Tuple[int, float]:
        """Find best matching commodity within family.

        Commodities are compared by their combined title and definition.

        Returns:
            ``(commodity_code, cosine_similarity)``, or ``(None, 0.0)`` when the
            family has no commodity rows.
        """
        codes, embeddings = self._children(
            'family_code', family_code, 'commodity_code', 'commodity_text'
        )
        if len(codes) == 0:
            return None, 0.0
        return self._best_match(text, codes, embeddings)

    def _title_for(self, code_col: str, title_col: str, code):
        """Return the title stored on the first row carrying ``code``, else None."""
        if code is None:
            return None
        titles = self.unspsc_df.loc[self.unspsc_df[code_col] == code, title_col]
        return titles.iloc[0] if len(titles) else None

    def classify_item(self, description: str) -> Dict:
        """Main classification method.

        Args:
            description: Raw item description.

        Returns:
            A dict with the original description, the extracted attributes and,
            for each of segment / family / commodity, its code, title and
            similarity score. Family or commodity entries are None (score 0.0)
            when no candidate exists at that level.
        """
        # Preprocess
        clean_text = self.preprocess_text(description)
        # Attributes are read from a variant that keeps decimal points;
        # clean_text has them stripped, which would turn "1.5in" into "5in".
        attributes = self.extract_attributes(self._attribute_text(description))

        # Find best segment
        segment_code, segment_confidence = self.find_best_segment(clean_text)

        # Find best family
        family_code, family_confidence = self.find_best_family(clean_text, segment_code)

        # Find best commodity; compare against None explicitly because a code
        # is a number and must not be judged by truthiness.
        if family_code is not None:
            commodity_code, commodity_confidence = self.find_best_commodity(
                clean_text, family_code
            )
        else:
            commodity_code, commodity_confidence = None, 0.0

        return {
            'item_description': description,
            'attributes': attributes,
            'segment_code': segment_code,
            'segment_title': self._title_for('segment_code', 'segment_title', segment_code),
            'segment_confidence': segment_confidence,
            'family_code': family_code,
            'family_title': self._title_for('family_code', 'family_title', family_code),
            'family_confidence': family_confidence,
            'commodity_code': commodity_code,
            'commodity_title': self._title_for('commodity_code', 'commodity_title', commodity_code),
            'commodity_confidence': commodity_confidence
        }


In [10]:

# Build the classifier: this loads the embedding model, reads the UNSPSC CSV
# and embeds the segment texts (it prints "Generating embeddings..." meanwhile).
mapper = UNSPSCMapper(unspsc_data_path="./unspsc_finder/data/unspsc_data.csv")


Generating embeddings...


In [6]:

# Sample item descriptions used to exercise the mapper. They are written the
# way the strings below show: upper case, with "," / ";" / ":" as separators.
test_items = [
    "SPRING,RETAINING;PETERBIL;2258H1230",
    "GASKET,SPIRAL WOUND: 34IN PIPE, CLASS 300, 316L STAINLESS STEEL",
    "HOSE,WATER;REINFORCED RUBBER;3/4IN;50FT",
]



In [8]:
# Classify the selected slice of the sample items (currently only the second
# one) and print the chosen segment and family. The "Confidence" line reports
# the segment-level similarity score.
for item in test_items[1:2]:
    result = mapper.classify_item(item)
    print(
        f"\nItem: {item}",
        f"Segment: {result['segment_code']} - {result['segment_title']}",
        f"Family: {result['family_code']} - {result['family_title']}",
        f"Confidence: {result['segment_confidence']:.2f}",
        sep="\n",
    )


Item: GASKET,SPIRAL WOUND: 34IN PIPE, CLASS 300, 316L STAINLESS STEEL
Segment: 15000000 - Fuels and Fuel Additives and Lubricants and Anti corrosive Materials
Family: 15120000.0 - Lubricants and oils and greases and anti corrosives
Confidence: 0.20
