In [None]:
import os

# Data layout of the project: one folder per stage of the data pipeline.
directories = ["data/raw", "data/processed", "data/external", "data/interim"]

# Create each folder; folders that already exist are left untouched.
for directory in directories:
    os.makedirs(directory, exist_ok=True)

print("Estructura de directorios y archivos creada con éxito.")


Estructura de directorios y archivos creada con éxito.


In [None]:
# %pip installs into the environment of the running kernel; the version is pinned for reproducibility.
%pip install fasttext==0.9.3

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313502 sha256=d1ec01b93a1f5e

In [None]:
# %pip installs into the environment of the running kernel.
%pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col

def create_spark_session():
    """Build (or fetch) the Spark session used by the balance check.

    Returns:
        The session produced by ``getOrCreate()`` on a builder whose
        application name is "ValidacionBalanceo".
    """
    builder = SparkSession.builder.appName("ValidacionBalanceo")
    return builder.getOrCreate()

def validate_balance(file_path, tolerance=0.05):
    """Print the number of records per class and whether the classes are balanced.

    The dataset counts as balanced when the gap between the largest and the
    smallest class is at most ``tolerance`` times the size of the first class
    (classes ordered by label). With exactly two classes this is
    ``abs(n0 - n1) <= tolerance * n0``.

    Args:
        file_path: Path of the Parquet dataset; it is grouped by its ``label`` column.
        tolerance: Maximum relative gap between class sizes (default 0.05).

    Returns:
        None. Counts and verdict are printed.
    """
    spark = create_spark_session()
    df = spark.read.parquet(file_path)

    # One row per class, ordered by label.
    class_counts = df.groupBy("label").count().orderBy("label").collect()

    for row in class_counts:
        print(f"🔹 Clase {row['label']}: {row['count']} registros")

    counts = [row["count"] for row in class_counts]

    # A dataset with fewer than two classes is reported as unbalanced instead of
    # failing with an IndexError; with more than two, every class is considered.
    balanced = len(counts) >= 2 and (max(counts) - min(counts)) <= tolerance * counts[0]

    if balanced:
        print("✅ El dataset está balanceado.")
    else:
        print("⚠️ Advertencia: Las clases no están completamente balanceadas.")

# Check the class balance of the processed dataset.
processed_data_path = "data/processed/Software_processed.parquet"
validate_balance(processed_data_path)


🔹 Clase 0: 809052 registros
🔹 Clase 1: 809802 registros
✅ El dataset está balanceado.


In [None]:
# -nc (no-clobber) skips the download when models/lid.176.bin is already present.
!wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P models/


--2025-03-20 00:27:32--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.34.53, 13.226.34.7, 13.226.34.83, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.34.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘models/lid.176.bin’


2025-03-20 00:27:33 (159 MB/s) - ‘models/lid.176.bin’ saved [131266198/131266198]



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, lower, regexp_replace, count, when
from pyspark.sql.window import Window
import fasttext
from pyspark.sql.types import StringType
import os
from pyspark.sql import functions as F

def create_spark_session():
    """Build (or fetch) the Spark session used by the preprocessing steps.

    Returns:
        The session produced by ``getOrCreate()`` on a builder whose
        application name is "PreprocesamientoAmazonReviews".
    """
    builder = SparkSession.builder.appName("PreprocesamientoAmazonReviews")
    return builder.getOrCreate()

def load_data(file_path, spark):
    """Read a Parquet dataset through the given Spark session.

    Args:
        file_path: Location of the Parquet data.
        spark: Session whose ``read.parquet`` is used to load the data.

    Returns:
        Whatever ``spark.read.parquet(file_path)`` returns.
    """
    print(f"📥 Cargando datos desde {file_path}...")
    loaded = spark.read.parquet(file_path)
    print("✅ Datos cargados correctamente.")
    return loaded

def filter_english_reviews(df):
    """Keep only the reviews whose ``language`` column equals "en".

    When the frame has no ``language`` column a warning is printed and the
    frame is returned unchanged. Otherwise the result is narrowed to the
    ``language``, ``label`` and ``text`` columns and its row count is printed.

    Args:
        df: Frame to filter.

    Returns:
        The filtered frame, or ``df`` itself when there is no ``language`` column.
    """
    if "language" not in df.columns:
        print("\n⚠️ Advertencia: La columna 'language' no existe en el DataFrame. No se aplica el filtro.")
        return df

    print("\n🗑️ Eliminando reseñas en idiomas distintos al inglés...")
    english_only = df.filter(col("language") == "en")
    df = english_only.select("language", "label", "text")

    print(f"✅ Total de registros después del filtro de idioma: {df.count()}")
    return df

def remove_neutral_reviews(df):
    """Drop 3-star reviews and add a binary ``label`` column.

    Rows whose ``rating`` equals 3 are removed. ``label`` is 0 when the rating
    is 2 or lower and 1 otherwise. The resulting row count is printed.

    Args:
        df: Frame with a ``rating`` column.

    Returns:
        The filtered frame with the added ``label`` column.
    """
    print("\n🗑️ Eliminando reseñas de 3 estrellas...")

    without_neutral = df.filter(col("rating") != 3)
    # 0 = negative (rating <= 2), 1 = positive (everything else that remains).
    sentiment = when(col("rating") <= 2, 0).otherwise(1)
    labelled = without_neutral.withColumn("label", sentiment)

    print(f"✅ Total de registros después de eliminación de neutrales: {labelled.count()}")
    return labelled

def clean_text(df):
    """Normalise the ``text`` column of the reviews.

    The text is lower-cased, every character other than ASCII letters, digits
    and whitespace is removed, and each run of whitespace is collapsed into a
    single space.

    Args:
        df: Frame with a ``text`` column.

    Returns:
        The frame with the cleaned ``text`` column.
    """
    print("\n🧹 Limpiando texto...")
    df = df.withColumn("text", lower(col("text")))
    # Raw strings: "\s" in a normal string literal is an invalid escape sequence,
    # which newer Python versions warn about. The pattern itself is unchanged.
    df = df.withColumn("text", regexp_replace(col("text"), r"[^a-zA-Z0-9\s]", ""))
    df = df.withColumn("text", regexp_replace(col("text"), r"\s+", " "))
    print("✅ Texto limpiado.")
    return df

def remove_duplicates_and_empty(df):
    """Drop reviews whose text is null or empty, then de-duplicate by text.

    Args:
        df: Frame with a ``text`` column.

    Returns:
        The frame with at most one row per distinct non-empty ``text``.
    """
    print("\n🗑️ Eliminando valores nulos y duplicados...")
    has_text = (col("text").isNotNull()) & (col("text") != "")
    return df.filter(has_text).dropDuplicates(["text"])

def undersampling(df):
    """Balance the classes by keeping the same number of rows for every label.

    The size of the smaller of the two classes (labels 0 and 1) is computed
    and, within each label, that many rows are kept in a seeded random order.

    Args:
        df: Frame with a ``label`` column.

    Returns:
        The balanced frame; the helper columns used for sampling are dropped.
    """
    print("\n⚖️ Aplicando Undersampling para balancear clases...")

    # The per-class counts are a tiny result, brought to the driver as a pandas Series.
    class_counts = df.groupBy("label").count().toPandas().set_index("label")["count"]
    # Cast to built-in int: the Series holds NumPy scalars, and the value is
    # later compared against a Spark column.
    # NOTE(review): older PySpark releases reportedly reject NumPy scalars as literals — confirm.
    positive_count = int(class_counts.get(1, 0))
    negative_count = int(class_counts.get(0, 0))
    min_class_count = min(positive_count, negative_count)

    print(f"🔹 Positivos: {positive_count}, Negativos: {negative_count}")
    print(f"✅ Reduciéndolos a: {min_class_count}")

    # Rank the rows of each label in random (seeded) order and keep the first
    # `min_class_count` of each.
    random_order = Window.partitionBy("label").orderBy("rand")
    df_balanced = (
        df.withColumn("rand", F.rand(seed=42))
        .withColumn("rank", F.row_number().over(random_order))
        .filter(col("rank") <= min_class_count)
        .drop("rand", "rank")
    )

    print(f"✅ Total de registros después del balanceo: {df_balanced.count()}")
    return df_balanced

def save_cleaned_data(df, output_path):
    """Write the frame as Parquet, replacing anything already at ``output_path``.

    Args:
        df: Frame to persist; its ``write`` interface is used.
        output_path: Destination of the Parquet output.
    """
    print(f"\n💾 Guardando datos procesados en {output_path}...")
    writer = df.write.mode("overwrite")
    writer.parquet(output_path)
    print("✅ Datos guardados correctamente.")

# Loader for the fastText language-identification model.
def get_fasttext_model():
    """Return the fastText model, loading it from disk at most once per process.

    The previous implementation called ``fasttext.load_model`` on every
    invocation, i.e. once per row when used through the language-detection UDF.

    Returns:
        The model loaded from "models/lid.176.bin".
    """
    model_path = "models/lid.176.bin"

    # The cache is attached to the imported `fasttext` module rather than kept
    # in a notebook-level variable.
    # NOTE(review): this assumes Spark serialises notebook globals referenced by
    # a UDF by value while modules are re-imported on the workers, and that a
    # loaded fastText model cannot be pickled — confirm.
    cache = getattr(fasttext, "_lid_model_cache", None)
    if cache is None:
        cache = {}
        fasttext._lid_model_cache = cache

    if model_path not in cache:
        cache[model_path] = fasttext.load_model(model_path)
    return cache[model_path]

def detect_language(text):
    """Return the language code fastText predicts for ``text``.

    Newlines are replaced by spaces before prediction and the "__label__"
    prefix is stripped from the predicted label.

    Args:
        text: Review text; falsy values (``None``, "") are not sent to the model.

    Returns:
        The predicted language code, or "unknown" when ``text`` is falsy or
        when anything raises — including a failure to load the model.
    """
    try:
        if text:
            model = get_fasttext_model()
            prediction = model.predict([text.replace("\n", " ")])
            return prediction[0][0][0].replace("__label__", "")
    except Exception:
        pass
    return "unknown"

def check_and_download_file(file_path, url):
    """Make sure ``file_path`` exists, downloading it from ``url`` when it does not.

    The download is written to ``file_path`` itself (previously it was stored
    under the URL's own file name inside the directory of ``file_path``), so
    the existence check succeeds on the next call. The parent directory is
    created when needed and the data goes to a temporary ``.part`` file first,
    so an interrupted download is never mistaken for a complete one.

    Args:
        file_path: Where the file is expected to live.
        url: Location to download the file from when it is missing.

    Raises:
        OSError: When the download or the write fails (``urllib.error.URLError``
            is a subclass). Previously failures were reported as success.
    """
    # Local imports keep this block self-contained.
    import shutil
    import urllib.request

    if os.path.exists(file_path):
        print(f"✅ El archivo ya existe: {file_path}")
        return

    print(f"⚠️ El archivo no existe. Descargando desde: {url}")

    target_dir = os.path.dirname(file_path)
    if target_dir:  # empty when file_path is a bare file name
        os.makedirs(target_dir, exist_ok=True)

    partial_path = f"{file_path}.part"
    try:
        with urllib.request.urlopen(url) as response, open(partial_path, "wb") as out:
            shutil.copyfileobj(response, out)
        os.replace(partial_path, file_path)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print("✅ Descarga completada.")

# Wrap the plain-Python detector as a Spark UDF declared to return a string.
detect_language_udf = udf(detect_language, StringType())

def add_language_column(df):
    """Add a ``language`` column computed from ``text`` with the detection UDF.

    Args:
        df: Frame with a ``text`` column.

    Returns:
        The frame with the added ``language`` column.
    """
    print("\n🌍 Detectando idioma de las reseñas...")
    with_language = df.withColumn("language", detect_language_udf(col("text")))
    print("✅ Detección de idioma completada.")
    return with_language





In [None]:
  # 📌 Input and output locations
  interim_data_path = "data/interim/Software_interim.parquet"
  processed_data_path = "data/processed/Software_processed.parquet"
  from pyspark import SparkContext

  # Spark session
  spark = create_spark_session()

  # Load the interim data, keep only the columns used below and binarise the ratings.
  df = load_data(interim_data_path, spark)
  df.printSchema()
  df = df.select(col("text"), col("rating"))
  df = remove_neutral_reviews(df)
  df.show(5, truncate=True)

  # Make sure the language-identification model is on disk.
  # NOTE(review): this path differs from the "models/" directory used by
  # get_fasttext_model and by the earlier wget cell — confirm which is intended.
  file_path = "data/models/lid.176.bin"
  url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
  check_and_download_file(file_path, url)

  # Driver-side smoke test of the plain-Python detector; `res` is not used below.
  res=detect_language("hola como estas soy de colomb")

  # Build a detector that loads the model lazily and keeps it for later calls.
  # The factory and the UDF get their own names: reusing `detect_language_udf`
  # and `detect_language` (as this cell used to) overwrote the module-level
  # definitions, which broke add_language_column and a re-run of this cell.
  def make_language_detector(model_path):
      model = None  # filled in on the first call

      def _detect(text):
          nonlocal model
          if model is None:
              model = fasttext.load_model(model_path)
          try:
              if not text:
                  return "unknown"
              label = model.predict([text.replace("\n", " ")])
              return label[0][0][0].replace("__label__", "")
          except Exception:
              return "unknown"

      return _detect

  # Wrap the detector as a UDF and add the detected language to every review.
  language_udf = udf(make_language_detector(file_path), StringType())
  df = df.withColumn("language", language_udf(col("text")))
  df.show(5, truncate=True)

  # Keep only the English reviews.
  df = filter_english_reviews(df)
  df.show(5, truncate=True)



📥 Cargando datos desde data/interim/Software_interim.parquet...
✅ Datos cargados correctamente.
root
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- attachment_type: string (nullable = true)
 |    |    |-- large_image_url: string (nullable = true)
 |    |    |-- medium_image_url: string (nullable = true)
 |    |    |-- small_image_url: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)


🗑️ Eliminando reseñas de 3 estrellas...
✅ Total de registros después de eliminación de neutrales: 4460825
+--------------------+------+-----+
|                text|rating|label|
+--------------------+------+-----+
|Great pr

In [None]:
  # Normalise the text, then drop empty and duplicated reviews.
  df = remove_duplicates_and_empty(clean_text(df))

  # Persist the cleaned dataset.
  save_cleaned_data(df, processed_data_path)

  print("\n🎯 Proceso de preprocesamiento completado.")


🧹 Limpiando texto...
✅ Texto limpiado.

🗑️ Eliminando valores nulos y duplicados...

💾 Guardando datos procesados en data/processed/Software_processed.parquet...
✅ Datos guardados correctamente.

🎯 Proceso de preprocesamiento completado.


In [None]:
  # Balance the classes by undersampling.
  df = undersampling(df)

  # Persist the balanced dataset; this overwrites the output of the previous save.
  save_cleaned_data(df, processed_data_path)

  print("\n🎯 Proceso de preprocesamiento completado.")


⚖️ Aplicando Undersampling para balancear clases...
🔹 Positivos: 2748554, Negativos: 803325
✅ Reduciéndolos a: 803325
✅ Total de registros después del balanceo: 1606650

💾 Guardando datos procesados en data/processed/Software_processed.parquet...
✅ Datos guardados correctamente.

🎯 Proceso de preprocesamiento completado.


Entrenamiento

In [None]:
import os

# Source, test and configuration folders of the project.
directories = [
    "notebooks",
    "src/data", "src/models", "src/deployment", "src/monitoring", "src/utils",
    "tests",
    "configs",
]

# Create each folder; folders that already exist are left untouched.
for directory in directories:
    os.makedirs(directory, exist_ok=True)


In [None]:
# %pip installs into the environment of the running kernel.
%pip install mlflow==2.1.0 requests



In [None]:
import mlflow
import os
import pandas as pd
from IPython.display import display

Adicionalmente, utilizaremos un servidor de `mlflow`:

In [None]:
# Shell command that starts an MLflow tracking server on port 5000, with a
# SQLite backend store (tracking.db) and artifacts under ./mlruns. The trailing
# "&" sends it to the background.
command = """
mlflow server \
        --backend-store-uri sqlite:///tracking.db \
        --default-artifact-root file:mlruns \
        -p 5000 &
"""
get_ipython().system_raw(command)  # run through IPython's raw system call

Utilizaremos `ngrok` para acceder al tablero de `mlflow`:

In [None]:
# %pip installs into the environment of the running kernel.
%pip install pyngrok



Ahora debe agregar su token de `ngrok`:

In [None]:
# Never store the ngrok token in the notebook: take it from the NGROK_TOKEN
# environment variable or, when that is unset, prompt for it without echoing.
# The token that used to be hardcoded here is exposed and should be revoked.
from getpass import getpass

token = os.environ.get("NGROK_TOKEN") or getpass("ngrok authtoken: ")
os.environ["NGROK_TOKEN"] = token

Nos autenticamos en ngrok:

In [None]:
# Register the token with the ngrok CLI; NGROK_TOKEN is the environment variable set in the previous cell.
!ngrok authtoken $NGROK_TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
# Open an HTTP tunnel to local port 5000 (the port given to `mlflow server`);
# the returned tunnel is the cell's displayed value.
ngrok.connect(5000, "http")

<NgrokTunnel: "https://f5de-35-227-84-85.ngrok-free.app" -> "http://localhost:5000">

In [None]:
# Talk to the MLflow server on its local port. The ngrok URL changes each time
# a tunnel is opened, so a hardcoded public URL goes stale (the one previously
# here did not match the tunnel opened above); the tunnel is only needed to
# view the dashboard from a browser.
mlflow.set_tracking_uri("http://localhost:5000")

In [None]:
# create_experiment raises when the name is already taken, so look the
# experiment up first to keep this cell re-runnable.
_existing_experiment = mlflow.get_experiment_by_name("airline_delay")
if _existing_experiment is None:
    exp_id = mlflow.create_experiment(name="airline_delay", artifact_location="mlruns/")
else:
    exp_id = _existing_experiment.experiment_id

In [None]:
import tensorflow as tf
import os
import mlflow
import mlflow.tensorflow
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers.schedules import PolynomialDecay
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split

# 📌 Configuration
DATA_PATH = "data/processed/Software_processed.parquet"
SAVE_PATH = "data/processed/splits/"
SAVE = True  # Flag meant to choose between saving and loading the data
BATCH_SIZE = 32
EPOCHS = 3
MLFLOW_TRACKING_URI = "http://localhost:5000"  # Adjust when a remote server is used

# 📌 Point MLflow at the tracking server and select the experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("bert_finetuning")

# 📌 Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(texts, labels, max_length=128):
    """Tokenize texts with the module-level tokenizer and convert labels to a tensor.

    Args:
        texts: Collection exposing ``tolist()`` that yields the raw strings.
        labels: Labels to convert into an int32 tensor.
        max_length: Maximum sequence length handed to the tokenizer.

    Returns:
        A ``(tokens, labels)`` pair: the tokenizer output requested as
        TensorFlow tensors, with padding and truncation enabled, and the
        int32 label tensor.
    """
    encoded = tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    label_tensor = tf.convert_to_tensor(labels, dtype=tf.int32)
    return encoded, label_tensor

# 📌 Build the train / validation / test splits once and reuse them on later runs.
# Previously nothing was written to SAVE_PATH and the "load" branch loaded
# nothing, so a second run left `df` and the splits undefined.
split_files = {
    name: os.path.join(SAVE_PATH, f"{name}.parquet") for name in ("train", "val", "test")
}

# Later cells use the full frame whichever branch runs below.
df = pd.read_parquet(DATA_PATH)

if not all(os.path.exists(path) for path in split_files.values()):
    print("📂 Creando conjuntos de datos de entrenamiento, validación y prueba...")
    os.makedirs(SAVE_PATH, exist_ok=True)

    # 70 % train; the remaining 30 % is halved into validation and test.
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(df["text"], df["label"], test_size=0.3, random_state=42)
    val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

    if SAVE:
        for name, texts, labels in (
            ("train", train_texts, train_labels),
            ("val", val_texts, val_labels),
            ("test", test_texts, test_labels),
        ):
            pd.DataFrame({"text": texts, "label": labels}).to_parquet(split_files[name])
        print("✅ Datos guardados en", SAVE_PATH)
else:
    print("📂 Cargando datasets preprocesados...")
    splits = {name: pd.read_parquet(path) for name, path in split_files.items()}
    train_texts, train_labels = splits["train"]["text"], splits["train"]["label"]
    val_texts, val_labels = splits["val"]["text"], splits["val"]["label"]
    test_texts, test_labels = splits["test"]["text"], splits["test"]["label"]


📂 Creando conjuntos de datos de entrenamiento, validación y prueba...
✅ Datos guardados en data/processed/splits/


In [None]:
# Summarise the frame, then export it as CSV (without the index) into SAVE_PATH.
df.info()
df.to_csv(os.path.join(SAVE_PATH, "Software_processed.csv"), index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1606650 entries, 0 to 1606649
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   language  1606650 non-null  object
 1   label     1606650 non-null  int32 
 2   text      1606650 non-null  object
dtypes: int32(1), object(2)
memory usage: 30.6+ MB


In [None]:
import pandas as pd
import numpy as np
# import flair
# from flair.data import Sentence
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import random as rn
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
from PIL import Image

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Remove pandas' cap on the number of rows shown when a frame is displayed.
pd.set_option("display.max_rows", None)
# Seed value kept for the stochastic steps of the analysis.
seed = 40

In [None]:
# Preview the first rows of the processed reviews.
df.head()


Unnamed: 0,language,label,text
0,en,1,no broken crayons with this game fun and easy ...
1,en,1,this game is a lot of fun it is one of the bes...
2,en,1,this games is awesome i wish there were easier...
3,en,1,enjoy the selection of music and good quality
4,en,1,very satisfied my s4 did not come with a light...


In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1606650 entries, 0 to 1606649
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   language  1606650 non-null  object
 1   label     1606650 non-null  int32 
 2   text      1606650 non-null  object
dtypes: int32(1), object(2)
memory usage: 30.6+ MB


In [None]:
# Show which container type holds the training texts.
print(type(train_texts))

<class 'pandas.core.series.Series'>


In [None]:
# Plain Python lists of the review texts and their labels.
reviews = df["text"].tolist()
labels = df["label"].tolist()

In [None]:
# Split the dataset into train, validation and holdout sets (60-20-20).
# `random_state=seed` makes the split reproducible across runs; `seed` is the
# value defined together with the imports above.
training_sentences, test_sentences, training_labels, test_labels = train_test_split(
    reviews, labels, test_size=.4, random_state=seed
)

validation_sentences, holdout_sentences, validation_labels, holdout_labels = train_test_split(
    test_sentences, test_labels, test_size=.5, random_state=seed
)

In [None]:
# Replace the tokenizer created earlier ("bert-base-uncased") with the cased checkpoint's tokenizer.
# NOTE(review): clean_text lower-cases every review, so a cased checkpoint may not be the intended choice — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Example of the tokenizer output for the first training sentence
# (truncation and padding enabled, at most 512 tokens).
tokenizer([training_sentences[0]], truncation=True,
                            padding=True, max_length=512)

{'input_ids': [[101, 5936, 1363, 3094, 102]], 'token_type_ids': [[0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}

In [None]:
# Wrap each split's encodings and labels in a TensorFlow Dataset.
# NOTE(review): train_encodings, validation_encodings and holdout_encodings are
# not defined anywhere in the visible notebook — confirm where they are produced.

def _to_dataset(encodings, split_labels):
    """Pair the encodings (as a plain dict) with their labels, one example per element."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), split_labels))

train_dataset = _to_dataset(train_encodings, training_labels)
validation_dataset = _to_dataset(validation_encodings, validation_labels)
holdout_dataset = _to_dataset(holdout_encodings, holdout_labels)

In [None]:
# Initialise the pre-trained BERT sequence classifier with two output labels.

model = TFBertForSequenceClassification.from_pretrained('bert-base-cased',num_labels=2)

In [None]:
# Adam optimizer with a 2e-5 learning rate; the model is compiled with its own
# `compute_loss` as the loss and accuracy as the tracked metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

In [None]:

# 📌 Training run tracked with MLflow
with mlflow.start_run():
    # Log the learning rate the Adam optimizer was built with (2e-5); the value
    # logged before (5e-5) did not match it.
    mlflow.log_params({"batch_size": BATCH_SIZE, "epochs": EPOCHS, "learning_rate": 2e-5})

    # The datasets are built with from_tensor_slices and never batched, so
    # BATCH_SIZE is applied here; until now it was logged but not used.
    history = model.fit(
        train_dataset.batch(BATCH_SIZE),
        validation_data=validation_dataset.batch(BATCH_SIZE),
        epochs=EPOCHS,
    )

    # One train/validation accuracy point per epoch.
    per_epoch = zip(history.history['accuracy'], history.history['val_accuracy'])
    for epoch, (train_accuracy, val_accuracy) in enumerate(per_epoch):
        mlflow.log_metric("train_accuracy", train_accuracy, step=epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)

    # Store the trained model as an artifact of the run.
    mlflow.tensorflow.log_model(model, "bert_finetuned")

# 📌 Keep a local copy of the fine-tuned model
model.save_pretrained("src/models/checkpoints/bert_finetuned")

print("✅ Entrenamiento finalizado y modelo guardado.")
