# Pipeline de preprocesamiento de texto (con Pandas + NLTK

In [20]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ===============================================
# ‚öôÔ∏è DESCARGAR RECURSOS NECESARIOS (solo 1 vez)
# ===============================================
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)  # mejora la lematizaci√≥n
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pzambonino\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [17]:
# ===============================================
# üìÇ CARGAR DATASET LIMPIO
# ===============================================
try:
    df = pd.read_csv("amazon_reviews_clean.csv")
    print("‚úÖ Dataset cargado correctamente.")
except FileNotFoundError:
    raise FileNotFoundError("‚ùå No se encontr√≥ el archivo 'amazon_reviews_clean.csv'. "
                            "Verifica la ruta o nombre del archivo.")

‚úÖ Dataset cargado correctamente.


In [18]:

# üßπ 1. UNIFICAR TEXTO (Resumen + Rese√±a)
# ===============================================
df["FullReview"] = df["Summary"].fillna("").astype(str) + " " + df["Text"].fillna("").astype(str)

# ===============================================
# üßΩ 2. LIMPIEZA DE TEXTO
# ===============================================
def clean_text(text: str) -> str:
    """Limpia el texto: min√∫sculas, sin URLs, sin signos, sin n√∫meros."""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)      # URLs
    text = re.sub(r"[^a-z\s]", " ", text)                    # solo letras y espacios
    text = re.sub(r"\s+", " ", text).strip()                 # espacios extra
    return text

df["CleanText"] = df["FullReview"].apply(clean_text)


In [21]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# üîΩ Descargar los recursos necesarios de NLTK (solo la primera vez)
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")  # <- importante

# ===============================================
# üö´ 3. ELIMINAR STOPWORDS (palabras vac√≠as)
# ===============================================
stop_words = set(stopwords.words("english"))

def remove_stopwords(text: str) -> str:
    tokens = nltk.word_tokenize(text)
    return " ".join([w for w in tokens if w not in stop_words])

# Aplica la funci√≥n a tu columna
df["CleanText"] = df["CleanText"].apply(remove_stopwords)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pzambonino\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pzambonino\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pzambonino\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [22]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:
    tokens = nltk.word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

df["CleanText"] = df["CleanText"].apply(lemmatize_text)

# ===============================================
# üíæ 5. GUARDAR DATASET PROCESADO
# ===============================================
cols_to_keep = ["ProductId", "Score", "Time", "CleanText", "HelpfulLabel"]
missing_cols = [col for col in cols_to_keep if col not in df.columns]

if missing_cols:
    print(f"‚ö†Ô∏è Advertencia: Faltan columnas en el dataset: {missing_cols}")
    df_prepared = df[[c for c in cols_to_keep if c in df.columns]]
else:
    df_prepared = df[cols_to_keep]

output_path = "amazon_reviews_prepared.csv"
df_prepared.to_csv(output_path, index=False)

print(f"\n‚úÖ Preprocesamiento completado. Archivo guardado como '{output_path}'.")
print("üìä Vista previa de los primeros registros:")
print(df_prepared.head())


‚úÖ Preprocesamiento completado. Archivo guardado como 'amazon_reviews_prepared.csv'.
üìä Vista previa de los primeros registros:
    ProductId  Score        Time  \
0  B001E4KFG0      5  2011-04-27   
1  B000LQOCH0      4  2008-08-18   
2  B000UA0QIQ      2  2011-06-13   
3  B000E7L2R4      5  2011-11-23   
4  B0001PB9FE      5  2005-02-08   

                                           CleanText  HelpfulLabel  
0  good quality dog food bought several vitality ...             1  
1  delight say confection around century light pi...             1  
2  cough medicine looking secret ingredient robit...             1  
3  yay barley right mostly sprouting cat eat gras...             1  
4  best hot sauce world know cactus tequila uniqu...             1  
