In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ahsenwaheed/youtube-comments-spam-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ahsenwaheed/youtube-comments-spam-dataset?dataset_version_number=1...


100%|██████████| 159k/159k [00:00<00:00, 664kB/s]

Extracting files...
Path to dataset files: /home/miauuu/.cache/kagglehub/datasets/ahsenwaheed/youtube-comments-spam-dataset/versions/1





In [4]:
import pandas as pd
import re
from datetime import datetime

# Cargar datos
df = pd.read_csv('/home/miauuu/.cache/kagglehub/datasets/ahsenwaheed/youtube-comments-spam-dataset/versions/1/Youtube-Spam-Dataset.csv')

# 1. Limpieza básica
def clean_text(text):
    text = str(text).lower()  # Convertir a minúsculas
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remover URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remover menciones y hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remover puntuación
    return text.strip()

df['clean_content'] = df['CONTENT'].apply(clean_text)

# 2. Manejo de fechas (si existe la columna DATE)
if 'DATE' in df.columns:
    # Especificar formato que incluye microsegundos
    df['date'] = pd.to_datetime(
        df['DATE'],
        format='%Y-%m-%dT%H:%M:%S.%f',  # Agregar .%f para microsegundos
        errors='coerce'  # Convertir errores a NaT
    )
    
    # Eliminar filas con fechas inválidas (opcional)
    df = df.dropna(subset=['date'])
    
    df['year_month'] = df['date'].dt.to_period('M')
# 3. Eliminar duplicados exactos
df = df.drop_duplicates(subset=['clean_content'], keep='first')

# 4. Eliminar comentarios vacíos o muy cortos
df = df[df['clean_content'].str.len() > 10]

In [5]:
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,VIDEO_NAME,CLASS,clean_content,date,year_month
700,z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k,Corey Wilson,2015-05-28T21:39:52.376000,"<a href=""http://www.youtube.com/watch?v=KQ6zr6...","LMFAO - Party Rock Anthem ft. Lauren Bennett, ...",0,a href best part,2015-05-28 21:39:52.376,2015-05
701,z124jvczaz3dxhnbc04cffk43oiugj25yzo0k,Epic Gaming,2015-05-28T20:07:20.610000,wierd but funny﻿,"LMFAO - Party Rock Anthem ft. Lauren Bennett, ...",0,wierd but funny,2015-05-28 20:07:20.610,2015-05
702,z13tczjy5xj0vjmu5231unho1ofey5zdk,LaS Music,2015-05-28T19:23:35.355000,"Hey guys, I&#39;m a human.<br /><br /><br />Bu...","LMFAO - Party Rock Anthem ft. Lauren Bennett, ...",1,hey guys i39m a humanbr br br but i don39t wan...,2015-05-28 19:23:35.355,2015-05
703,z13tzr0hdpnayhqqc04cd3zqqqjkf3ngckk0k,Cheryl Fox,2015-05-28T17:49:35.294000,Party Rock....lol...who wants to shuffle!!!﻿,"LMFAO - Party Rock Anthem ft. Lauren Bennett, ...",0,party rocklolwho wants to shuffle,2015-05-28 17:49:35.294,2015-05
707,z123izobdqqyszmsx231cfuahxfjwjmpk04,Alex DeFeo,2015-05-28T04:15:22.615000,This song is just really fun ﻿,"LMFAO - Party Rock Anthem ft. Lauren Bennett, ...",0,this song is just really fun,2015-05-28 04:15:22.615,2015-05


In [7]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# 1. Preparar embeddings
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# 2. Creación del modelo BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model,
    language="multilingual",
    calculate_probabilities=True,
    verbose=True
)

# 3. Ajustar el modelo
topics, probs = topic_model.fit_transform(df['clean_content'])

# 4. Visualización jerárquica
hierarchical_topics = topic_model.hierarchical_topics(df['clean_content'])
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-04-25 11:35:26,464 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2025-04-25 11:35:35,277 - BERTopic - Embedding - Completed ✓
2025-04-25 11:35:35,278 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-25 11:35:42,866 - BERTopic - Dimensionality - Completed ✓
2025-04-25 11:35:42,866 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-25 11:35:42,917 - BERTopic - Cluster - Completed ✓
2025-04-25 11:35:42,924 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-25 11:35:42,954 - BERTopic - Representation - Completed ✓
100%|██████████| 15/15 [00:00<00:00, 425.04it/s]
