# Categorização de Produtos

In [54]:
from liqfit.pipeline import ZeroShotClassificationPipeline
from liqfit.models import T5ForZeroShotClassification
from transformers import T5Tokenizer
import polars as pl
import plotly.express as px

In [2]:
model = T5ForZeroShotClassification.from_pretrained('knowledgator/comprehend_it-multilingual-t5-base')
tokenizer = T5Tokenizer.from_pretrained('knowledgator/comprehend_it-multilingual-t5-base')
classifier = ZeroShotClassificationPipeline(model=model, tokenizer=tokenizer, hypothesis_template = '{}', encoder_decoder = True)

You are using a model of type T5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


tokenizer_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


## Teste de classificação de produtos

In [48]:
categorias = ['eletrodomésticos', 'eletrônicos', 'beleza', 'brinquedos']

In [49]:
df = pl.read_csv('../dados/descricoes_produtos.csv')

In [50]:
def categorizar(descricao):
    resultado = classifier(descricao, categorias, multi_label=False)
    cat_max = max(zip(resultado['labels'], resultado['scores']), key=lambda x: x[1])
    return cat_max

In [51]:
classificacao = df["descricao"].map_elements(categorizar, return_dtype=pl.Object)

In [52]:
df = df.with_columns([
    pl.Series("label", classificacao.map_elements(lambda v: v[0], return_dtype=pl.Utf8), dtype=pl.Utf8),
    pl.Series("score", classificacao.map_elements(lambda v: v[1], return_dtype=pl.Float32), dtype=pl.Float32)
])

In [53]:
df

descricao,label,score
str,str,f32
"""Liquidificador de alta potênci…","""eletrodomésticos""",0.346486
"""Forno Micro-ondas de 20 litros…","""eletrodomésticos""",0.50596
"""Máquina de café espresso com r…","""eletrodomésticos""",0.302402
"""Torradeira com capacidade para…","""eletrodomésticos""",0.437097
"""Panela elétrica multifuncional…","""eletrodomésticos""",0.46119
…,…,…
"""Sérum facial anti-idade com vi…","""beleza""",0.701239
"""Máscara facial de argila purif…","""beleza""",0.592232
"""Quebra-cabeça de 1000 peças co…","""brinquedos""",0.561517
"""Kit de ciências para crianças …","""brinquedos""",0.481192


In [55]:
px.histogram(df.to_pandas(), x='label', color='label', title='Distribuição de produtos por categoria')