In [None]:
import pandas as pd
from pymongo import MongoClient

# Conexión a MongoDB
mongo_client = MongoClient("mongodb://localhost:27017/")
db = mongo_client["reddit_db"]
collection = db["posts"]

# Cargar documentos en un DataFrame
cursor = collection.find()
df = pd.DataFrame(list(cursor))

# Convertir fecha si hace falta
df["fecha_utc"] = pd.to_datetime(df["fecha_utc"])


In [87]:
print(len(df))


3519


In [80]:
# print(df.head())  # Ver primeras filas
# print(df.describe())  # Estadísticas generales
# print(df.dtypes)  # Tipos de datos
df.head()

Unnamed: 0,_id,titulo,texto,score,comentarios,fecha_utc,url,fecha_dia,titulo_limpio,titulo_limpo2,sentimiento,ubicaciones
0,680cfb785f567a7d22646cad,Russia confirms North Korean troops are in Ukr...,,6,1,2025-04-26 15:17:45,https://www.nbcnews.com/world/ukraine/russia-c...,2025-04-26,russia confirm north korean troop be in ukrain...,russia confirm north korean troop be in ukrain...,0.0,"[russia, ukraine]"
1,680cfb785f567a7d22646cae,"Iran, Russia agree on 55 bcm of gas supplies, ...",,3,0,2025-04-26 15:14:01,https://www.reuters.com/world/iranian-oil-mini...,2025-04-26,iran russia agree on bcm of gas supply nuclear...,iran russia agree on bcm of gas supply nuclear...,0.0,"[iran, russia]"
2,680cfb785f567a7d22646caf,India lifts Jhelum River dams and floods river...,,17,1,2025-04-26 15:01:41,https://english.mathrubhumi.com/news/india/pok...,2025-04-26,india lifts jhelum river dam and flood riverbe...,india lifts jhelum river dam and flood riverbe...,0.1,[india]
3,680cfb785f567a7d22646cb0,1 in 3 women in S. Korea experienced violence ...,,22,8,2025-04-26 14:52:32,https://en.yna.co.kr/view/AEN20250424010900315,2025-04-26,in woman in korea experience violence at least...,in woman in korea experience violence at least...,-0.3,[korea]
4,680cfb785f567a7d22646cb1,U.S. and Iran conclude third round of nuclear ...,,11,2,2025-04-26 14:50:10,https://www.axios.com/2025/04/26/us-iran-nucle...,2025-04-26,and iran conclude third round of nuclear talk ...,and iran conclude third round of nuclear talk ...,-0.066667,[iran]


In [None]:
# df["titulo_limpo2"]=df["titulo_limpio"]
df.head()
# df.drop(columns=["titulo_limpo2"])


In [None]:
#Análisis por variables
#Puntaje promedio
print("Puntaje promedio:", df["score"].mean())
#Promedio de comentarios
print("Comentarios promedio:", df["comentarios"].mean())
#Cantidad de posts por día
df["fecha_dia"] = df["fecha_utc"].dt.date
posts_por_dia = df.groupby("fecha_dia").size()
print(posts_por_dia)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Gráfico de barras: posts por día
posts_por_dia.plot(kind="bar", figsize=(10,5), title="Posts por día")
plt.ylabel("Cantidad de posts")
plt.xlabel("Fecha")
plt.tight_layout()
plt.show()

# Distribución de scores
sns.histplot(df["score"], bins=20, kde=True)
plt.title("Distribución de puntajes")
plt.xlabel("Score")
plt.show()


In [None]:
# Rellenar campos vacíos con strings vacíos
df["texto"] = df["texto"].fillna("")
df["titulo"] = df["titulo"].fillna("")

import nltk
from nltk.corpus import stopwords
import spacy
import string

nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("spanish"))

def limpiar_texto(texto):
    doc = nlp(texto.lower())
    tokens_limpios = [
        token.lemma_ for token in doc 
        if token.is_alpha and token.lemma_ not in stop_words
    ]
    return " ".join(tokens_limpios)

# Aplicar limpieza al campo "titulo"
df["titulo_limpio"] = df["titulo"].apply(limpiar_texto)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

texto_total = " ".join(df["titulo_limpio"].tolist())
# texto_total = " ".join(df["titulo"].tolist())

wordcloud = WordCloud(width=2000, height=1200, background_color='white').generate(texto_total)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("🔤 Palabras más comunes en los títulos")
plt.show()


In [None]:
from textblob import TextBlob

def sentimiento(texto):
    return TextBlob(texto).sentiment.polarity  # De -1 a +1

df["sentimiento"] = df["titulo_limpio"].apply(sentimiento)

# Visualizamos
import seaborn as sns

sns.histplot(df["sentimiento"], bins=20, kde=True)
plt.title("Sentimiento de los títulos")
plt.xlabel("Polaridad (-1 = negativo, +1 = positivo)")
plt.show()


In [77]:
import spacy

# Cargar el modelo
nlp = spacy.load("en_core_web_sm")

def extraer_ubicaciones(texto):
    doc = nlp(texto)
    ubicaciones = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    return ubicaciones

# Ejemplo de uso
df["ubicaciones"] = df["titulo_limpio"].apply(extraer_ubicaciones)

# Ver qué ubicaciones se encontraron
print(df[["titulo", "ubicaciones"]].head())


                                              titulo        ubicaciones
0  Russia confirms North Korean troops are in Ukr...  [russia, ukraine]
1  Iran, Russia agree on 55 bcm of gas supplies, ...     [iran, russia]
2  India lifts Jhelum River dams and floods river...            [india]
3  1 in 3 women in S. Korea experienced violence ...            [korea]
4  U.S. and Iran conclude third round of nuclear ...             [iran]


In [82]:
df_exploded = df.explode("ubicaciones")
df_exploded.head(10)

Unnamed: 0,_id,titulo,texto,score,comentarios,fecha_utc,url,fecha_dia,titulo_limpio,titulo_limpo2,sentimiento,ubicaciones
0,680cfb785f567a7d22646cad,Russia confirms North Korean troops are in Ukr...,,6,1,2025-04-26 15:17:45,https://www.nbcnews.com/world/ukraine/russia-c...,2025-04-26,russia confirm north korean troop be in ukrain...,russia confirm north korean troop be in ukrain...,0.0,russia
0,680cfb785f567a7d22646cad,Russia confirms North Korean troops are in Ukr...,,6,1,2025-04-26 15:17:45,https://www.nbcnews.com/world/ukraine/russia-c...,2025-04-26,russia confirm north korean troop be in ukrain...,russia confirm north korean troop be in ukrain...,0.0,ukraine
1,680cfb785f567a7d22646cae,"Iran, Russia agree on 55 bcm of gas supplies, ...",,3,0,2025-04-26 15:14:01,https://www.reuters.com/world/iranian-oil-mini...,2025-04-26,iran russia agree on bcm of gas supply nuclear...,iran russia agree on bcm of gas supply nuclear...,0.0,iran
1,680cfb785f567a7d22646cae,"Iran, Russia agree on 55 bcm of gas supplies, ...",,3,0,2025-04-26 15:14:01,https://www.reuters.com/world/iranian-oil-mini...,2025-04-26,iran russia agree on bcm of gas supply nuclear...,iran russia agree on bcm of gas supply nuclear...,0.0,russia
2,680cfb785f567a7d22646caf,India lifts Jhelum River dams and floods river...,,17,1,2025-04-26 15:01:41,https://english.mathrubhumi.com/news/india/pok...,2025-04-26,india lifts jhelum river dam and flood riverbe...,india lifts jhelum river dam and flood riverbe...,0.1,india
3,680cfb785f567a7d22646cb0,1 in 3 women in S. Korea experienced violence ...,,22,8,2025-04-26 14:52:32,https://en.yna.co.kr/view/AEN20250424010900315,2025-04-26,in woman in korea experience violence at least...,in woman in korea experience violence at least...,-0.3,korea
4,680cfb785f567a7d22646cb1,U.S. and Iran conclude third round of nuclear ...,,11,2,2025-04-26 14:50:10,https://www.axios.com/2025/04/26/us-iran-nucle...,2025-04-26,and iran conclude third round of nuclear talk ...,and iran conclude third round of nuclear talk ...,-0.066667,iran
5,680cfb785f567a7d22646cb2,Iran explosion: Deaths reported in major blast...,,20,5,2025-04-26 14:42:28,https://bbc.com/news/articles/cx251yyvwr3o,2025-04-26,iran explosion death report in major blast at ...,iran explosion death report in major blast at ...,0.0625,iran
5,680cfb785f567a7d22646cb2,Iran explosion: Deaths reported in major blast...,,20,5,2025-04-26 14:42:28,https://bbc.com/news/articles/cx251yyvwr3o,2025-04-26,iran explosion death report in major blast at ...,iran explosion death report in major blast at ...,0.0625,rajee
6,680cfb785f567a7d22646cb3,General Staff of the Armed Forces of Ukraine r...,,20,1,2025-04-26 14:41:30,https://unn.ua/en/news/the-general-staff-of-th...,2025-04-26,general staff of the armed force of ukraine re...,general staff of the armed force of ukraine re...,0.025,


In [83]:
# Contar la cantidad de posts por ubicación
posts_por_ubicacion = df_exploded.groupby(["ubicaciones"]).size().reset_index(name="cantidad")

# Mostrar los resultados
print(posts_por_ubicacion)

          ubicaciones  cantidad
0           actividad         4
1             acuerdo         4
2         afghanistan         5
3          ahora baja         3
4             algeria         1
..                ...       ...
169  west acknowledge         1
170             yemen         8
171            zurich         3
172             álbum         3
173             único         3

[174 rows x 2 columns]


In [None]:
# from geopy.geocoders import Nominatim

# # Inicializar el geolocalizador
# geolocator = Nominatim(user_agent="geopyExample")

# # Función para obtener las coordenadas
# def obtener_coordenadas(ubicacion):
#     try:
#         location = geolocator.geocode(ubicacion)
#         if location:
#             return location.latitude, location.longitude
#         else:
#             return None, None
#     except Exception as e:
#         return None, None

# # Aplicar geolocalización al DataFrame
# df["coordenadas"] = df["ubicaciones"].apply(lambda x: obtener_coordenadas(x[0] if x else ""))
# df["latitud"] = df["coordenadas"].apply(lambda x: x[0] if x else None)
# df["longitud"] = df["coordenadas"].apply(lambda x: x[1] if x else None)

# print(df)


In [None]:
# Aplicar sólo a filas donde latitud o longitud son nulas
mask = df['latitud'].isnull() | df['longitud'].isnull()

df.loc[mask, 'coordenadas'] = df.loc[mask, 'ubicaciones'].apply(lambda x: obtener_coordenadas(x[0] if x else ""))
df['latitud'] = df['coordenadas'].apply(lambda x: x[0] if x else None)
df['longitud'] = df['coordenadas'].apply(lambda x: x[1] if x else None)


In [None]:
import pandas as pd
import time
from geopy.geocoders import Nominatim

# Inicializar geolocalizador
geolocator = Nominatim(user_agent="geopyExample")

# Función segura para obtener coordenadas
def obtener_coordenadas_seguro(ubicacion, lat, lon):
    if pd.notnull(lat) and pd.notnull(lon):
        return lat, lon
    try:
        location = geolocator.geocode(ubicacion)
        time.sleep(1)  # Pausa de 1 segundo
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except Exception as e:
        return None, None

# Aplicar sólo donde falta lat o lon
mask = df['latitud'].isnull() | df['longitud'].isnull()
df.loc[mask, 'coordenadas'] = df.loc[mask].apply(
    lambda row: obtener_coordenadas_seguro(
        row['ubicaciones'][0] if row['ubicaciones'] else "",
        row.get('latitud'),
        row.get('longitud')
    ), axis=1
)

# Actualizar latitud y longitud
df['latitud'] = df['coordenadas'].apply(lambda x: x[0] if x else None)
df['longitud'] = df['coordenadas'].apply(lambda x: x[1] if x else None)


In [None]:
# Contar la cantidad de posts por ubicación
posts_por_ubicacion = df.groupby(["latitud", "longitud"]).size().reset_index(name="cantidad")

# Mostrar los resultados
print(posts_por_ubicacion)


In [None]:
import folium
from folium.plugins import MarkerCluster

# Crear un mapa base
m = folium.Map(location=[20,0], zoom_start=2)  # Ubicación central para el mapa mundial

# Crear un MarkerCluster para agrupar marcadores
marker_cluster = MarkerCluster().add_to(m)

# Agregar los marcadores al mapa
for _, row in posts_por_ubicacion.iterrows():
    lat, lon, cantidad = row['latitud'], row['longitud'], row['cantidad']
    if lat and lon:  # Verificar que las coordenadas sean válidas
        folium.CircleMarker(
            location=[lat, lon],
            radius=cantidad * 2,  # Tamaño del marcador proporcional a la cantidad
            popup=f"{cantidad} posts",
            color="blue",
            fill=True,
            fill_color="blue"
        ).add_to(marker_cluster)

# Mostrar el mapa
m.save("reddit_mapa_ubicaciones.html")


In [None]:
import json

# Cargar palabras formales
with open('palabras_formales.json', 'r', encoding='utf-8') as f:
    palabras_formales = json.load(f)

# Cargar palabras informales
with open('palabras_informales.json', 'r', encoding='utf-8') as f:
    palabras_informales = json.load(f)

# Ejemplo de uso
texto = "hubo un tiroteo en la terminal"
if any(palabra in texto for lista in palabras_formales.values() for palabra in lista):
    print("⚠️ Alerta formal detectada")

if any(palabra in texto for lista in palabras_informales.values() for palabra in lista):
    print("⚠️ Alerta informal detectada")


In [None]:
import pandas as pd
import re
import unicodedata
import json

# Cargar palabras
with open('palabras_formales.json', 'r', encoding='utf-8') as f:
    palabras_formales = json.load(f)

with open('palabras_informales.json', 'r', encoding='utf-8') as f:
    palabras_informales = json.load(f)

# Función para normalizar texto (borrar tildes)
def normalizar(texto):
    texto = unicodedata.normalize('NFKD', texto).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return texto.lower()

# Unir todas las palabras en listas planas
palabras_formales_flat = [p for lista in palabras_formales.values() for p in lista]
palabras_informales_flat = [p for lista in palabras_informales.values() for p in lista]

# Crear patrones regex para cada grupo
patron_formal = re.compile(r'\b(?:' + '|'.join(map(re.escape, palabras_formales_flat)) + r')\b', flags=re.IGNORECASE)
patron_informal = re.compile(r'\b(?:' + '|'.join(map(re.escape, palabras_informales_flat)) + r')\b', flags=re.IGNORECASE)

# Función para detectar con regex
def detectar_alerta_formal(texto):
    texto = normalizar(texto)
    return bool(patron_formal.search(texto))

def detectar_alerta_informal(texto):
    texto = normalizar(texto)
    return bool(patron_informal.search(texto))

# Ejemplo de DataFrame
# df = pd.DataFrame({
#     'texto': [
#         "hubo un tiroteo en la terminal de tren",
#         "se armó alto quilombo en el mercado",
#         "alerta de bomba en el aeropuerto",
#         "todo explotó mal anoche en la estación",
#         "heridos tras el ataque terrorista"
#     ]
# })

# Aplicarlo al DataFrame
df['alerta_formal'] = df['texto'].apply(detectar_alerta_formal)
df['alerta_informal'] = df['texto'].apply(detectar_alerta_informal)

print(df)


In [None]:
df['alerta_detectada'] = df['alerta_formal'] | df['alerta_informal']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(6,4))
sns.countplot(data=df, x='alerta_detectada', palette='coolwarm')
plt.title('¿Se detectó una alerta en el post?')
plt.xlabel('Alerta detectada')
plt.ylabel('Cantidad de posts')
plt.xticks([0,1], ['No', 'Sí'])
plt.show()


In [None]:
import folium


# Crear columna alerta_detectada
# df['alerta_detectada'] = df['alerta_formal'] | df['alerta_informal']


# Crear mapa centrado (por ejemplo en Buenos Aires)
m = folium.Map(location=[-34.6037, -58.3816], zoom_start=5)

# Agregar marcadores
for idx, row in df_alertas.iterrows():
    folium.Marker(
        location=[row['lat'], row['lon']],
        popup=f"Alerta: {row['tipo_alerta']}<br>Texto: {row['texto']}",
        icon=folium.Icon(color='red' if row['tipo_alerta'] == 'Formal' else 'blue' if row['tipo_alerta'] == 'Informal' else 'orange')
    ).add_to(m)

# Guardar mapa
m.save('mapa_alertas.html')
