In [1]:
!pip install datasets
!pip install transformers
!pip install git+https://github.com/huggingface/accelerate
!pip install nltk
!pip install tensorflow

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
import nltk
nltk.download('punkt')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from datasets import Dataset
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from datasets import load_dataset
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_metric
from transformers import TrainingArguments
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Vamos a probar con 3 modelos : regresión logítica, redes neuronales y por último un modelo preentrenado de HugginFace

In [4]:
############### Regresión Logística ###############

In [5]:
# Leemos CSV
df = pd.read_csv('df_vehicles_prepro.csv')
print(df.head(3))
print(df.shape)

# Divisón de datos
X_train, X_test, y_train, y_test = train_test_split(
    df['reviewText'],
    df['overall'],
    train_size=0.75,
    test_size=0.25,
    random_state=14,
    shuffle=True
)

# Vectorización de texto
cv = TfidfVectorizer(
    max_df=0.95,
    min_df=3,
    ngram_range=(1, 3)
)

X_train_tfidf = cv.fit_transform(X_train)
X_test_tfidf = cv.transform(X_test)

                       reviewText  overall
0  chevy truck bed rust junk body        0
1                            junk        0
2                   dont fit good        0
(4588, 2)


In [6]:
# Definir una lista vacía para almacenar los resultados para cada valor de C
results_list = []

c_params = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 1000]

for c in c_params:
    lr = LogisticRegression(C=c, solver='liblinear', max_iter=500)
    lr.fit(X_train_tfidf, y_train)

    # Predicción
    test_predict = lr.predict(X_test_tfidf)

    # Calcular las métricas
    test_accuracy = accuracy_score(y_test, test_predict)
    test_precision = precision_score(y_test, test_predict, average='macro')
    test_recall = recall_score(y_test, test_predict, average='macro')
    test_f1 = f1_score(y_test, test_predict, average='macro')

    # Agregar los resultados a la lista
    results_list.append({
        'Valores de regularización': c,
        'Accuracy': test_accuracy,
        'Precisión': test_precision,
        'Recall': test_recall,
        'F1-score': test_f1
    })

# Crear un DataFrame a partir de la lista
df_results = pd.DataFrame(results_list)
df_results

Unnamed: 0,Valores de regularización,Accuracy,Precisión,Recall,F1-score
0,0.01,0.781168,0.812359,0.785631,0.777257
1,0.05,0.817786,0.824128,0.819768,0.81741
2,0.25,0.8483,0.848944,0.848978,0.8483
3,0.5,0.850915,0.851124,0.851369,0.850904
4,1.0,0.852659,0.852584,0.852863,0.852614
5,10.0,0.846556,0.846453,0.846379,0.846413
6,100.0,0.835222,0.835095,0.835061,0.835077
7,1000.0,0.825632,0.825507,0.825437,0.82547


In [7]:
############### Redes Neuronales ###############

In [8]:
# Lee un archivo CSV
df = pd.read_csv('df_vehicles_prepro.csv')

# Dividimos los datos en conjuntos de entrenamiento, validación y test
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, valid = train_test_split(train, test_size=0.15, random_state=42)

# Imprimimos el shape de los conjuntos resultantes
print("Conjunto de datos:", df.shape)
print("Conjunto de Entrenamiento:", train.shape)
print("Conjunto de Validación:", valid.shape)
print("Conjunto de Prueba:", test.shape)

# Tokenización
df['descripcion_tokenizada'] = df['reviewText'].apply(word_tokenize)

# Entrenamiento de Word2Vec para transformarlo en vectores que capturan significado semántico y relaciones contextuales
model_w2v = Word2Vec(sentences=df['descripcion_tokenizada'], vector_size=10, window=5, min_count=1, sg=0)
df.head(3)

Conjunto de datos: (4588, 2)
Conjunto de Entrenamiento: (3119, 2)
Conjunto de Validación: (551, 2)
Conjunto de Prueba: (918, 2)


Unnamed: 0,reviewText,overall,descripcion_tokenizada
0,chevy truck bed rust junk body,0,"[chevy, truck, bed, rust, junk, body]"
1,junk,0,[junk]
2,dont fit good,0,"[dont, fit, good]"


In [9]:
# Convierte cada palabra en un vector utilizando Word2Vec
def obtener_vector_palabra(palabra, modelo):
    try:
        return modelo.wv[palabra]
    except KeyError:
        return None

df['descripcion_vector'] = df['descripcion_tokenizada'].apply(lambda tokens: [obtener_vector_palabra(palabra, model_w2v) for palabra in tokens])

# Rellena las secuencias de vectores resultantes con ceros para que tengan la misma longitud utilizando pad_sequences
df['descripcion_vector_padded'] = list(pad_sequences(df['descripcion_vector'], padding='post', dtype='float32'))
df.head(3)

Unnamed: 0,reviewText,overall,descripcion_tokenizada,descripcion_vector,descripcion_vector_padded
0,chevy truck bed rust junk body,0,"[chevy, truck, bed, rust, junk, body]","[[0.113782905, 0.1037049, 1.271349, 0.3657576,...","[[0.113782905, 0.1037049, 1.271349, 0.3657576,..."
1,junk,0,[junk],"[[0.19828564, -0.19564274, 1.0767508, 0.410233...","[[0.19828564, -0.19564274, 1.0767508, 0.410233..."
2,dont fit good,0,"[dont, fit, good]","[[1.0932955, -0.011799474, 1.4435383, 0.553836...","[[1.0932955, -0.011799474, 1.4435383, 0.553836..."


In [10]:
# Crear un array numpy de las secuencias
sequences = np.array(df['descripcion_vector_padded'].tolist())

MAX_SEQUENCE_LENGTH = 6330

# Crear el modelo LSTM
model = Sequential()
model.add(Embedding(input_dim=model_w2v.wv.vectors.shape[0], output_dim=model_w2v.wv.vectors.shape[1], weights=[model_w2v.wv.vectors], trainable=False, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(units=100))
model.add(Dense(units=1, activation='sigmoid'))

# Compilar y entrenar el modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Definir las máscaras de entrenamiento y validación
train_mask = df.index.isin(train.index)
valid_mask = df.index.isin(valid.index)
test_mask = df.index.isin(test.index)

# Obtener las secuencias correspondientes a las máscaras
train_sequences = sequences[train_mask]
valid_sequences = sequences[valid_mask]
test_sequences = sequences[test_mask]

# Aplastar las dimensiones de las secuencias de vectores
train_sequences_flat = train_sequences.reshape(train_sequences.shape[0], -1)
valid_sequences_flat = valid_sequences.reshape(valid_sequences.shape[0], -1)
test_sequences_flat = test_sequences.reshape(test_sequences.shape[0], -1)

# Entrenar el modelo
model.fit(train_sequences_flat, train['overall'], epochs=3, batch_size=32, validation_data=(valid_sequences_flat, valid['overall']))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7b16a4f86920>

In [11]:
# Realizar predicciones en los datos de prueba
test_predictions = model.predict(test_sequences_flat)

# Las predicciones son valores continuos entre 0 y 1
threshold = 0.5
binary_predictions = (test_predictions > threshold).astype(int)

# Comparar las etiquetas predichas con las etiquetas reales
accuracy = accuracy_score(test['overall'], binary_predictions)
recall = recall_score(test['overall'], binary_predictions)
f1 = f1_score(test['overall'], binary_predictions)
precision = precision_score(test['overall'], binary_predictions)

print("Accuracy:", accuracy)
print("Recall:", recall)
print("F1 Score:", f1)
print("Precision:", precision)

Accuracy: 0.49019607843137253
Recall: 1.0
F1 Score: 0.6578947368421052
Precision: 0.49019607843137253


In [12]:
############### HuggingFace ###############

In [13]:
# Lectura CSV
df = pd.read_csv('df_vehicles_prepro.csv')

# Crear un conjunto de datos a partir del DataFrame
dataset = Dataset.from_pandas(df)

# Cambiar el nombre de la columna "overall" a "label"
dataset = dataset.rename_column("overall", "label")

# Barajeado aleatorio
dataset = dataset.shuffle(seed=42)

# Dividir en train y test
train_size = int(0.7 * len(dataset))
train_dataset = dataset.select(range(train_size))# .select(range(100))
test_dataset = dataset.select(range(train_size, len(dataset)))# .select(range(100))

# Crear el objeto DatasetDict
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
print(dataset)

# Comprobamos que existan los distintos tipos de etiquetas en train y test
distinct_labels_train = set(dataset["train"]["label"])
distinct_labels_test = set(dataset["test"]["label"])

print("\n" "Valores distintos conjunto de entrenamiento:", distinct_labels_train)
print("Valores distintos conjunto de prueba:", distinct_labels_test)

DatasetDict({
    train: Dataset({
        features: ['reviewText', 'label'],
        num_rows: 3211
    })
    test: Dataset({
        features: ['reviewText', 'label'],
        num_rows: 1377
    })
})

Valores distintos conjunto de entrenamiento: {0, 1}
Valores distintos conjunto de prueba: {0, 1}


In [14]:
# Tokenizamos el dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["reviewText"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Cargamos el modelo Preentrenado
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/3211 [00:00<?, ? examples/s]

Map:   0%|          | 0/1377 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Crear función de métricas
def custom_metrics(eval_pred):
    metric1 = load_metric("precision")
    metric2 = load_metric("recall")
    metric3 = load_metric("f1")
    metric4 = load_metric("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calcular las métricas globales
    precision_global = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall_global = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1_global = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]

    # Calcular las métricas por etiqueta
    precision_por_label = metric1.compute(predictions=predictions, references=labels, average=None)["precision"].tolist()
    recall_por_label = metric2.compute(predictions=predictions, references=labels, average=None)["recall"].tolist()
    f1_por_label = metric3.compute(predictions=predictions, references=labels, average=None)["f1"].tolist()

    # Calcular la matriz de confusión manualmente
    cm = confusion_matrix(y_true=labels, y_pred=predictions).tolist()

    return {
        "precision_global": precision_global,
        "recall_global": recall_global,
        "f1_global": f1_global,
        "accuracy": accuracy,

        "precision_por_label": precision_por_label,
        "recall_por_label": recall_por_label,
        "f1_por_label": f1_por_label,

        "matriz_confusion": cm
}


In [16]:
# Entrenamiento y evaluación
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

from transformers import Trainer, TrainingArguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=custom_metrics
)

from transformers import Trainer

trainer.train()
resultados_evaluacion = trainer.evaluate()

Epoch,Training Loss,Validation Loss,Precision Global,Recall Global,F1 Global,Accuracy,Precision Por Label,Recall Por Label,F1 Por Label,Matriz Confusion
1,No log,0.461791,0.857057,0.851852,0.851239,0.851852,"[0.8146718146718147, 0.9]","[0.9134199134199135, 0.7894736842105263]","[0.8612244897959184, 0.8411214953271027]","[[633, 60], [144, 540]]"
2,0.462200,0.451907,0.865546,0.864198,0.864104,0.864198,"[0.8880368098159509, 0.8427586206896551]","[0.8354978354978355, 0.8932748538011696]","[0.8609665427509294, 0.8672817601135556]","[[579, 114], [73, 611]]"
3,0.292500,0.542165,0.865661,0.86565,0.865651,0.86565,"[0.8681159420289855, 0.8631732168850073]","[0.8643578643578643, 0.8669590643274854]","[0.866232827187274, 0.8650619985412108]","[[599, 94], [91, 593]]"


  metric1 = load_metric("precision")


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Trainer is attempting to log a value of "[0.8146718146718147, 0.9]" of type <class 'list'> for key "eval/precision_por_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9134199134199135, 0.7894736842105263]" of type <class 'list'> for key "eval/recall_por_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8612244897959184, 0.8411214953271027]" of type <class 'list'> for key "eval/f1_por_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[633, 60], [144, 540]]" of type <class 'list'> for key "eval/matriz_confusion" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8

Trainer is attempting to log a value of "[0.8681159420289855, 0.8631732168850073]" of type <class 'list'> for key "eval/precision_por_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8643578643578643, 0.8669590643274854]" of type <class 'list'> for key "eval/recall_por_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.866232827187274, 0.8650619985412108]" of type <class 'list'> for key "eval/f1_por_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[599, 94], [91, 593]]" of type <class 'list'> for key "eval/matriz_confusion" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [17]:
# Convertir el diccionario de resultados a un DataFrame
resultados_df = pd.DataFrame.from_dict(resultados_evaluacion, orient="index", columns=["valor"]).T

# Lista de nombres de columnas que deseas eliminar
columnas_a_eliminar = ['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch']
resultados_df = resultados_df.drop(columns=columnas_a_eliminar)

# Guardamos en un CSV los resultados obtenidos
resultados_df.to_csv('df_metricas.csv', index=False)

# Conclusiones
"""A la vista de los resultados obtenidos, el modelo preentrenado de HugginFace es el que obtiene mejores resultados.
   En estos casos considero que no hay que 'reinventar la rueda' y si existen modelos que funcionan y que han sido
   creados y preentrenados para estos propósitos han de ser utlizados.
   Guardaremos los resultados en un CSV para posteriomente generar en el Notebook4 un html con todas las métricas"""

"A la vista de los resultados obtenidos, el modelo preentrenado de HugginFace es el que obtiene mejores resultados.\n   En estos casos considero que no hay que 'reinventar la rueda' y si existen modelos que funcionan y que han sido\n   creados y preentrenados para estos propósitos han de ser utlizados.\n   Guardaremos los resultados en un CSV para posteriomente generar en el Notebook4 un html con todas las métricas"