In [3]:
import pandas as pd
import re

def clean_text(text):
    """Return *text* with URLs, hashtags and mentions removed.

    Whitespace runs are collapsed to a single space and the result is
    stripped at both ends. A missing value (as reported by ``pd.isna``)
    yields an empty string.
    """
    if pd.isna(text):
        return ""

    # Applied in order: the three removals first, whitespace cleanup last so
    # the gaps left behind by the removals are collapsed too.
    substitutions = (
        (r'http\S+', ''),   # URLs
        (r'#\w+', ''),      # hashtags
        (r'@\w+', ''),      # mentions
        (r'\s+', ' '),      # normalize whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def chunk_text(text, chunk_size=400, overlap=80):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The string to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        An empty or whitespace-only *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A zero or negative step would crash range() or silently drop text.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This window already reaches the last word; any further window
        # would only repeat words contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# Load the posts (a CSV such as "post_celsia.csv" could be read with
# pd.read_csv instead) and add a cleaned copy of each post.
SOURCE_PATH = "../../data/source/post_celsia.json"
OUTPUT_PATH = "../../data/chunks/post_celsia_chunks.csv"

df = pd.read_json(SOURCE_PATH, orient='records')
df["clean_post"] = df["post"].apply(clean_text)

# Build one record per chunk, carrying the post's metadata along with it.
data_chunks = [
    {
        "user": record.get("user", "Celsia"),
        "tiempo": record.get("tiempo", ""),
        "chunk": piece,
        "source": "LinkedIn",
    }
    for _, record in df.iterrows()
    for piece in chunk_text(record["clean_post"])
]

# Persist the chunk table without the DataFrame index column.
chunks_df = pd.DataFrame(data_chunks)
chunks_df.to_csv(OUTPUT_PATH, index=False)
print("✅ Chunks generados y guardados correctamente.")


✅ Chunks generados y guardados correctamente.
