In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

import joblib

import re
import string

import nltk
from nltk.corpus import stopwords
import spacy

from transformers.pipelines import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.training_args import TrainingArguments

from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer

from datasets import Dataset, DatasetDict

import evaluate

In [2]:
from utils.paths import DATA_RAW_DIR, DATA_PROCESSED_DIR, MODELS_DIR, CACHE_DIR

In [3]:
# Locate the raw reviews CSV and print whether the file is present on disk.
path_reviews = DATA_RAW_DIR / "reviews.csv"
print(path_reviews.exists())

True


In [4]:
# Read the reviews file (comma-separated, UTF-8) and preview its first rows.
df_reviews = pd.read_csv(
    path_reviews,
    encoding="utf-8",
    sep=",",
)
df_reviews.head()

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,es_0491108,product_es_0296024,reviewer_es_0999081,1,Nada bueno se me fue ka pantalla en menos de 8...,television Nevir,es,electronics
1,es_0869872,product_es_0922286,reviewer_es_0216771,1,"Horrible, nos tuvimos que comprar otro porque ...",Dinero tirado a la basura con esta compra,es,electronics
2,es_0811721,product_es_0474543,reviewer_es_0929213,1,Te obligan a comprar dos unidades y te llega s...,solo llega una unidad cuando te obligan a comp...,es,drugstore
3,es_0359921,product_es_0656090,reviewer_es_0224702,1,"No entro en descalificar al vendedor, solo pue...",PRODUCTO NO RECIBIDO.,es,wireless
4,es_0068940,product_es_0662544,reviewer_es_0224827,1,Llega tarde y co la talla equivocada,Devuelto,es,shoes


In [5]:
# Stratified split on the star rating: 70% train / 20% test / 10% validation.

# Step 1: hold out 30% of the rows; the remaining 70% is the training set.
df_train, df_temp = train_test_split(
    df_reviews,
    stratify=df_reviews['stars'],  # preserve the star distribution
    test_size=0.30,                # 30% is held out for val + test
    random_state=42,               # fixed seed for reproducibility
)

# Step 2: divide the held-out 30% into validation and test.
#    1/3 of it stays as validation (0.3 * 1/3 = 0.10 of all rows) and
#    2/3 of it becomes test        (0.3 * 2/3 = 0.20 of all rows).
df_val, df_test = train_test_split(
    df_temp,
    stratify=df_temp['stars'],
    test_size=2/3,
    random_state=42,
)

# Report each split's row count and its share of the full dataset.
print(
    f"Train:      {len(df_train)} rows ({len(df_train)/len(df_reviews):.0%})",
    f"Test:       {len(df_test)} rows ({len(df_test)/len(df_reviews):.0%})",
    f"Validation: {len(df_val)} rows ({len(df_val)/len(df_reviews):.0%})",
    sep="\n",
)


Train:      147000 rows (70%)
Test:       42000 rows (20%)
Validation: 21000 rows (10%)


In [6]:
# Binary sentiment target: 1 when the review has 3 or more stars, otherwise 0.
# The comparison is vectorised and attached with `assign`, which returns a new
# frame instead of writing a column into the frames returned by
# train_test_split (in-place writes there can trigger pandas'
# SettingWithCopyWarning). The explicit int64 cast keeps the dtype the original
# per-row lambda produced.
df_train = df_train.assign(labels=(df_train['stars'] >= 3).astype('int64'))
df_test = df_test.assign(labels=(df_test['stars'] >= 3).astype('int64'))
df_val = df_val.assign(labels=(df_val['stars'] >= 3).astype('int64'))

In [7]:
# (rows, columns) of each split, in train / test / validation order.
tuple(frame.shape for frame in (df_train, df_test, df_val))

((147000, 9), (42000, 9), (21000, 9))

In [8]:
# Wrap each pandas split in a Hugging Face `Dataset` (train, test, validation).
dataset_train, dataset_test, dataset_val = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_val)
)

In [9]:
# Inspect the column schema of the training Dataset. Besides the DataFrame
# columns it lists `__index_level_0__`, the pandas index carried over by
# `from_pandas`.
dataset_train.features

{'review_id': Value('string'),
 'product_id': Value('string'),
 'reviewer_id': Value('string'),
 'stars': Value('int64'),
 'review_body': Value('string'),
 'review_title': Value('string'),
 'language': Value('string'),
 'product_category': Value('string'),
 'labels': Value('int64'),
 '__index_level_0__': Value('int64')}

In [10]:
# Dataset dictionary: group the three splits under the keys
# "train", "test" and "val" so they can be processed together.
dataset_dict = DatasetDict(
    train=dataset_train,
    test=dataset_test,
    val=dataset_val,
)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category', 'labels', '__index_level_0__'],
        num_rows: 147000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category', 'labels', '__index_level_0__'],
        num_rows: 42000
    })
    val: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category', 'labels', '__index_level_0__'],
        num_rows: 21000
    })
})

In [11]:
# Directory passed as `cache_dir` when loading the tokenizer and model below;
# it is created (with any missing parents) if it does not exist yet.
path_cache = CACHE_DIR.joinpath("huggingface")
path_cache.mkdir(exist_ok=True, parents=True)

In [13]:
# Load the fast tokenizer that belongs to the pretrained checkpoint, storing
# downloaded files in the local cache directory.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(
    model_name, cache_dir=str(path_cache), use_fast=True
)


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
def tokenize_reviews(examples):
    """Tokenize the ``review_body`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"review_body"`` entry holding the text(s)
            to tokenize.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts
        with ``truncation=True``.
    """
    review_texts = examples["review_body"]
    encoded = tokenizer(review_texts, truncation=True)
    return encoded

In [15]:
# Column names of the training split; this list is later trimmed and passed as
# `remove_columns` when tokenizing.
columns = dataset_dict["train"].column_names
columns

['review_id',
 'product_id',
 'reviewer_id',
 'stars',
 'review_body',
 'review_title',
 'language',
 'product_category',
 'labels',
 '__index_level_0__']

In [20]:
# Keep "labels" out of the list of columns to drop after tokenization.
# The list is rebuilt by filtering rather than mutated with list.remove, so the
# cell can be re-run safely: list.remove raises ValueError once "labels" is
# already gone.
columns = [name for name in columns if name != "labels"]

In [21]:
# Show the columns that will be removed after tokenization ("labels" excluded).
columns

['review_id',
 'product_id',
 'reviewer_id',
 'stars',
 'review_body',
 'review_title',
 'language',
 'product_category',
 '__index_level_0__']

In [22]:
# Tokenize every split in batches and drop the columns listed in `columns`,
# leaving `labels` plus the fields produced by the tokenizer.
encoded_dataset = dataset_dict.map(
    function=tokenize_reviews,
    remove_columns=columns,
    batched=True,
)
encoded_dataset

Map:   0%|          | 0/147000 [00:00<?, ? examples/s]

Map:   0%|          | 0/42000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 147000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 42000
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 21000
    })
})

In [23]:
# Plain-text summary of the tokenized splits (features and row counts).
print(encoded_dataset)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 147000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 42000
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 21000
    })
})


In [25]:
# Load the pretrained sequence-classification model for `model_name` with a
# 5-class head. Judging by the checkpoint name this is a BERT-based
# (transformer) model, not a convolutional neural network.
# NOTE(review): the `labels` column built earlier is binary (0/1) while the
# head has 5 outputs -- confirm which target is intended before fine-tuning.
num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, cache_dir=str(path_cache), num_labels=num_labels
)

In [27]:
# Load the "accuracy" metric from the `evaluate` library (used by
# compute_metrics) and print its description.
metric = evaluate.load("accuracy")
print(metric)

Downloading builder script: 0.00B [00:00, ?B/s]

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value('int32'), 'references': Value('int32')}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

    Exa

In [28]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds one
            row of per-class scores per example; ``labels`` holds the
            reference class ids.

    Returns:
        The result of ``metric.compute`` for the highest-scoring class of each
        row against ``labels``.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class along axis 1 as the predicted class id.
    predicted_ids = np.argmax(scores, axis=1)
    result = metric.compute(predictions=predicted_ids, references=references)
    return result

In [30]:
# Build an inference pipeline around the loaded model and tokenizer.
# NOTE(review): no training call appears in the cells shown before this point,
# so `model` here is presumably the checkpoint as downloaded -- confirm.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

Device set to use mps:0


In [31]:
# Smoke test: a clearly positive English sentence.
sentiment_pipeline("I love using Hugging Face Transformers!")

  return forward_call(*args, **kwargs)


[{'label': '5 stars', 'score': 0.876865029335022}]

In [32]:
# Smoke test: a short Spanish fragment resembling a 1-star review in the data.
sentiment_pipeline("Te obligan a comprar dos unidades")

  return forward_call(*args, **kwargs)


[{'label': '1 star', 'score': 0.2124156355857849}]

In [33]:
# Smoke test: a strongly negative Spanish sentence.
sentiment_pipeline("la peor compra de mi vida!!! no recomiendo!")

  return forward_call(*args, **kwargs)


[{'label': '1 star', 'score': 0.967896044254303}]