In [16]:
import joblib
import re
import numpy as np

In [17]:
# Load the persisted artifacts used by the cells below: the classifier whose
# predict / predict_proba methods are called, the encoder applied to group ids,
# and the encoder used to translate entity names to/from encoded class labels.
# Paths are relative to the notebook's working directory.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code — only load files from a trusted source.
loaded_model = joblib.load('../best_model.pkl')
label_encoder_group = joblib.load('../label_encoder_group.pkl')
label_encoder_entity = joblib.load('../label_encoder_entity.pkl')

In [18]:
# Function to convert entity_value to a common unit (e.g., meters)

# Accepted unit spellings mapped to a (multiplier, divisor) pair such that
# meters = number * multiplier / divisor. Keeping an explicit divisor (rather
# than a fractional factor such as 0.001) reproduces the exact floating-point
# results of dividing by 1000 / 100.
_UNIT_TO_METERS = {
    **dict.fromkeys(
        ('mm', 'millimeter', 'millimeters', 'millimetre', 'millimetres'),
        (1, 1000),
    ),
    **dict.fromkeys(
        ('cm', 'centimeter', 'centimeters', 'centimetre', 'centimetres'),
        (1, 100),
    ),
    **dict.fromkeys(
        ('m', 'meter', 'meters', 'metre', 'metres'),
        (1, 1),
    ),
    **dict.fromkeys(
        ('km', 'kilometer', 'kilometers', 'kilometre', 'kilometres'),
        (1000, 1),
    ),
}


def convert_to_meters(value):
    """Convert a '<number> <unit>' length string to a float number of meters.

    Parameters
    ----------
    value : str
        Text such as '22.0 millimetre' or '5cm'. Surrounding whitespace is
        ignored and the unit is matched case-insensitively. Anything after
        the unit word is ignored.

    Returns
    -------
    float or None
        The length in meters, or None when the value is not a string, does
        not start with a number followed by a unit, the number is malformed
        (e.g. '1.2.3'), or the unit is not a supported metric length unit.
    """
    if not isinstance(value, str):
        return None

    match = re.match(r"([0-9.]+)\s*(\w+)", value.strip())
    if not match:
        return None

    number_text, unit = match.groups()
    try:
        # The character class also admits strings like '1.2.3' or '.', which
        # float() rejects; treat those as an invalid format instead of raising.
        number = float(number_text)
    except ValueError:
        return None

    scale = _UNIT_TO_METERS.get(unit.lower())
    if scale is None:
        return None

    multiplier, divisor = scale
    return number * multiplier / divisor

In [19]:
# Function to predict dimension type
def predict_dimension_type(model, group_id, entity_value):
    """Predict the dimension type for one (group_id, entity_value) pair.

    The group id is encoded with the notebook-level ``label_encoder_group``,
    the value is converted to meters with ``convert_to_meters``, and the pair
    is passed to ``model`` as a single-row feature array. The per-class
    confidence scores are printed as a side effect.

    Parameters
    ----------
    model : object
        Classifier exposing ``predict`` and ``predict_proba``.
    group_id : object
        Group identifier accepted by ``label_encoder_group.transform``.
    entity_value : str
        Length string such as '22.0 millimetre'.

    Returns
    -------
    The predicted label decoded through ``label_encoder_entity``.

    Raises
    ------
    ValueError
        If ``entity_value`` cannot be converted to meters.
    """
    group_id_encoded = label_encoder_group.transform([group_id])[0]
    entity_value_meters = convert_to_meters(entity_value)
    if entity_value_meters is None:
        raise ValueError("Invalid entity_value format")
    input_data = np.array([[group_id_encoded, entity_value_meters]])
    prediction_encoded = model.predict(input_data)[0]
    prediction = label_encoder_entity.inverse_transform([prediction_encoded])[0]

    # Get confidence scores for all classes
    confidence_scores = model.predict_proba(input_data)[0]

    # predict_proba columns follow model.classes_, which need not be the
    # contiguous range 0..n-1 (e.g. when a label never appeared in training),
    # so decode the labels the model actually reports. Fall back to positional
    # labels only for models that do not expose classes_.
    class_labels = getattr(model, 'classes_', None)
    if class_labels is None:
        class_labels = np.arange(len(confidence_scores))
    entity_types = label_encoder_entity.inverse_transform(class_labels)

    # Print confidence scores
    for entity_type, score in zip(entity_types, confidence_scores):
        print(f'{entity_type}: {score:.4f}')

    return prediction

# Example usage: classify a single measurement for one group id. The call
# prints one confidence score per class before the final prediction is printed.
group_id = 675317
entity_value = '22.0 millimetre'
predicted_dimension_type = predict_dimension_type(loaded_model, group_id, entity_value)
print(f'Predicted Dimension Type: {predicted_dimension_type}')

depth: 0.0694
height: 0.0572
width: 0.8734
Predicted Dimension Type: width


In [20]:
def get_dimension(model, group_id, entity_name, entity_values):
    """Pick the candidate value the model most strongly associates with a dimension.

    Each candidate is converted to meters with ``convert_to_meters``; values
    that cannot be converted are skipped. The remaining candidates are scored
    with ``model.predict_proba`` and the one with the highest probability for
    ``entity_name`` is returned. On ties the earliest candidate wins.

    Parameters
    ----------
    model : object
        Classifier exposing ``predict_proba``.
    group_id : object
        Group identifier accepted by ``label_encoder_group.transform``.
    entity_name : object
        Dimension label accepted by ``label_encoder_entity.transform``.
    entity_values : iterable of str
        Candidate length strings such as '9.0 centimeter'.

    Returns
    -------
    str
        The element of ``entity_values`` with the highest score, unchanged.

    Raises
    ------
    ValueError
        If no candidate can be converted to meters, or if the model exposes
        ``classes_`` and ``entity_name`` is not among them.
    """
    group_id_encoded = label_encoder_group.transform([group_id])[0]
    entity_name_encoded = label_encoder_entity.transform([entity_name])[0]

    # predict_proba columns are ordered like model.classes_, so look up the
    # column position of the requested class instead of assuming the encoded
    # label equals the column index. Models without classes_ keep the
    # positional behaviour.
    class_labels = getattr(model, 'classes_', None)
    if class_labels is None:
        class_column = entity_name_encoded
    else:
        positions = np.flatnonzero(np.asarray(class_labels) == entity_name_encoded)
        if positions.size == 0:
            raise ValueError(
                f"Model has no class for entity_name {entity_name!r}"
            )
        class_column = positions[0]

    # Keep only the candidates that parse, remembering the original text.
    candidates = []
    for entity_value in entity_values:
        entity_value_meters = convert_to_meters(entity_value)
        if entity_value_meters is not None:
            candidates.append((entity_value, entity_value_meters))

    if not candidates:
        raise ValueError("No valid entity_value found")

    # Score every candidate in a single predict_proba call.
    input_data = np.array(
        [[group_id_encoded, meters] for _, meters in candidates]
    )
    scores = np.asarray(model.predict_proba(input_data))[:, class_column]

    best_entity_value = None
    highest_score = -1
    for (entity_value, _), score in zip(candidates, scores):
        # Strict comparison keeps the first candidate on ties.
        if score > highest_score:
            highest_score = score
            best_entity_value = entity_value

    if best_entity_value is None:
        raise ValueError("No valid entity_value found")

    return best_entity_value

In [27]:
# Example usage: among several candidate measurements for one group id, pick
# the one the model scores highest for the requested dimension name.
group_id = 916768
entity_name = 'width'
entity_values = ['9.0 centimeter', '5.0 centimeter']
best_entity_value = get_dimension(loaded_model, group_id, entity_name, entity_values)
print(f'Best Entity Value for {entity_name}: {best_entity_value}')

Best Entity Value for width: 9.0 centimeter
