In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from utils.paths import DATA_RAW_DIR, DATA_PROCESSED_DIR, MODELS_DIR

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

import joblib

import re
import string

import nltk
from nltk.corpus import stopwords
import spacy

from transformers import AutoTokenizer
from transformers.pipelines import pipeline

In [3]:
reviews_path = DATA_RAW_DIR / "reviews.csv"
print(Path(reviews_path).exists())

True


In [4]:
df_reviews = pd.read_csv(reviews_path, sep=",", encoding="utf-8")
df_reviews.head()

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,es_0491108,product_es_0296024,reviewer_es_0999081,1,Nada bueno se me fue ka pantalla en menos de 8...,television Nevir,es,electronics
1,es_0869872,product_es_0922286,reviewer_es_0216771,1,"Horrible, nos tuvimos que comprar otro porque ...",Dinero tirado a la basura con esta compra,es,electronics
2,es_0811721,product_es_0474543,reviewer_es_0929213,1,Te obligan a comprar dos unidades y te llega s...,solo llega una unidad cuando te obligan a comp...,es,drugstore
3,es_0359921,product_es_0656090,reviewer_es_0224702,1,"No entro en descalificar al vendedor, solo pue...",PRODUCTO NO RECIBIDO.,es,wireless
4,es_0068940,product_es_0662544,reviewer_es_0224827,1,Llega tarde y co la talla equivocada,Devuelto,es,shoes


# Tokenizer with Transformers

In [5]:
# load tokenizer in spanish
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [6]:
df_reviews.columns

Index(['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body',
       'review_title', 'language', 'product_category'],
      dtype='object')

In [7]:
# sample text

sample_text = df_reviews['review_body'].iloc[0]
sample_text

'Nada bueno se me fue ka pantalla en menos de 8 meses y no he recibido respuesta del fabricante'

In [8]:
tokens = tokenizer.tokenize(sample_text)
print(tokens)

['Nada', 'bueno', 'se', 'me', 'fue', 'k', '##a', 'pantalla', 'en', 'menos', 'de', '8', 'meses', 'y', 'no', 'he', 'recibido', 'respuesta', 'del', 'fabricante']


# Classification with Pre-trined models

In [9]:
# sentimental pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


In [10]:
# new review

new_review = "Me encanta este producto, es increíble y funciona muy bien."
# predict sentiment
result = sentiment_pipeline(new_review)
print(result)

  return forward_call(*args, **kwargs)


[{'label': '5 stars', 'score': 0.8591135144233704}]


In [11]:
new_review = "Lo odio ;)!"
result = sentiment_pipeline(new_review)

print("Texto de ejemplo:", new_review)
print("Resultado del análisis de sentimiento:", result)

Texto de ejemplo: Lo odio ;)!
Resultado del análisis de sentimiento: [{'label': '5 stars', 'score': 0.6157907247543335}]


In [12]:
new_review = "me encanto?"
result = sentiment_pipeline(new_review)

print("Texto de ejemplo:", new_review)
print("Resultado del análisis de sentimiento:", result)

Texto de ejemplo: me encanto?
Resultado del análisis de sentimiento: [{'label': '5 stars', 'score': 0.4976471960544586}]


In [13]:
new_review = "me encanto!!!!!!!"
result = sentiment_pipeline(new_review)

print("Texto de ejemplo:", new_review)
print("Resultado del análisis de sentimiento:", result)

Texto de ejemplo: me encanto!!!!!!!
Resultado del análisis de sentimiento: [{'label': '5 stars', 'score': 0.8658758401870728}]


# NER

In [14]:
# NER
ner_pipeline = pipeline("token-classification", model="mrm8488/bert-spanish-cased-finetuned-ner")

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at mrm8488/bert-spanish-cased-finetuned-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


In [15]:
review = "El producto de Samsung Galaxy S21 llegó el 12 de marzo y superó mis expectativas."
ner_result = ner_pipeline(review)

for result in ner_result:
    print(result)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  return forward_call(*args, **kwargs)


{'entity': 'B-ORG', 'score': np.float32(0.99355763), 'index': 4, 'word': 'Sam', 'start': 15, 'end': 18}
{'entity': 'B-ORG', 'score': np.float32(0.5250632), 'index': 5, 'word': '##su', 'start': 18, 'end': 20}
{'entity': 'I-ORG', 'score': np.float32(0.9311909), 'index': 6, 'word': '##ng', 'start': 20, 'end': 22}
{'entity': 'I-ORG', 'score': np.float32(0.98734134), 'index': 7, 'word': 'Gala', 'start': 23, 'end': 27}
{'entity': 'I-ORG', 'score': np.float32(0.98206973), 'index': 8, 'word': '##xy', 'start': 27, 'end': 29}
{'entity': 'I-ORG', 'score': np.float32(0.8131862), 'index': 9, 'word': 'S', 'start': 30, 'end': 31}
{'entity': 'I-ORG', 'score': np.float32(0.78243333), 'index': 10, 'word': '##21', 'start': 31, 'end': 33}


In [16]:
def reconstruct_entity(ner_tokens):
    entity = ""
    for token in ner_tokens:
        word = token['word']
        if word.startswith("##"):
            entity += word[2:]
        else:
            if entity:
                entity += " " + word
            else:
                entity += word
    return entity

In [17]:
entity_name = reconstruct_entity(ner_result)
print("entity:", entity_name)

entity: Samsung Galaxy S21


In [18]:
review = "Compré el portátil HP en Madrid, y el servicio de atención al cliente fue excelente."
ner_result = ner_pipeline(review)

for result in ner_result:
    print(result)

entity_name = reconstruct_entity(ner_result)
print("entity:", entity_name)

{'entity': 'B-MISC', 'score': np.float32(0.9984775), 'index': 5, 'word': 'H', 'start': 19, 'end': 20}
{'entity': 'I-MISC', 'score': np.float32(0.91564894), 'index': 6, 'word': '##P', 'start': 20, 'end': 21}
{'entity': 'B-LOC', 'score': np.float32(0.9998977), 'index': 8, 'word': 'Madrid', 'start': 25, 'end': 31}
entity: HP Madrid


In [19]:
review = "La cámara Canon EOS Rebel tiene una calidad de imagen impresionante, ideal para profesionales."
ner_result = ner_pipeline(review)

for result in ner_result:
    print(result)

entity_name = reconstruct_entity(ner_result)
print("entity:", entity_name)

{'entity': 'B-MISC', 'score': np.float32(0.9957456), 'index': 3, 'word': 'Can', 'start': 10, 'end': 13}
{'entity': 'I-MISC', 'score': np.float32(0.99319434), 'index': 4, 'word': '##on', 'start': 13, 'end': 15}
{'entity': 'I-MISC', 'score': np.float32(0.9966273), 'index': 5, 'word': 'E', 'start': 16, 'end': 17}
{'entity': 'I-MISC', 'score': np.float32(0.995978), 'index': 6, 'word': '##OS', 'start': 17, 'end': 19}
{'entity': 'I-MISC', 'score': np.float32(0.95100576), 'index': 7, 'word': 'Re', 'start': 20, 'end': 22}
{'entity': 'I-MISC', 'score': np.float32(0.93761325), 'index': 8, 'word': '##bel', 'start': 22, 'end': 25}
entity: Canon EOS Rebel


In [20]:
review = "Recibí el reloj Casio a tiempo, pero el embalaje estaba dañado."
ner_result = ner_pipeline(review)

for result in ner_result:
    print(result)

entity_name = reconstruct_entity(ner_result)
print("entity:", entity_name)

{'entity': 'B-MISC', 'score': np.float32(0.9961661), 'index': 4, 'word': 'Casi', 'start': 16, 'end': 20}
{'entity': 'I-MISC', 'score': np.float32(0.8894443), 'index': 5, 'word': '##o', 'start': 20, 'end': 21}
entity: Casio


In [21]:
review = "La experiencia con Apple fue innovadora, aunque el precio es bastante elevado."
ner_result = ner_pipeline(review)

for result in ner_result:
    print(result)

entity_name = reconstruct_entity(ner_result)
print("entity:", entity_name)

{'entity': 'B-ORG', 'score': np.float32(0.9954259), 'index': 4, 'word': 'Apple', 'start': 19, 'end': 24}
entity: Apple


In [22]:
review = "Me cuesta entender cómo crearon DeepSeek."
ner_result = ner_pipeline(review)

for result in ner_result:
    print(result)

entity_name = reconstruct_entity(ner_result)
print("entity:", entity_name)

{'entity': 'B-MISC', 'score': np.float32(0.9944628), 'index': 6, 'word': 'DeepSeek', 'start': 32, 'end': 40}
entity: DeepSeek
