In [1]:
# ============================================================
# --- LIMPIADOR DE NOTICIAS FOMC CON FINBERT ---
# ============================================================

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
from tqdm import tqdm

# ====================================================
# --- 1️⃣ Rutas ---
# ====================================================
ruta_origen = "../data/csv/noticias_con_sentimiento.csv"   # CSV con texto completo
ruta_destino = "../data/csv/fomc_sentiment_finbert_clean.csv"  # salida limpia

# ====================================================
# --- 2️⃣ Cargar CSV original ---
# ====================================================
df = pd.read_csv(ruta_origen)


  from .autonotebook import tqdm as notebook_tqdm





In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================================
# --- Distribución de Sentiment_Score ---
# ============================================================

# Aseguramos nombre correcto (minúsculas)
col_sent = [c for c in df.columns if "sentiment_score" in c.lower()][0]

plt.figure(figsize=(9,5))
sns.histplot(df[col_sent], bins=30, kde=True, color="#0083B0", edgecolor="white")

plt.title("Distribución de Sentiment_Score", fontsize=14, weight="bold")
plt.xlabel("Sentiment_Score (de -1 a 1)")
plt.ylabel("Frecuencia")
plt.grid(alpha=0.3)
plt.axvline(0, color='black', linestyle='--', linewidth=1)
plt.show()

# Información complementaria
print("📊 Estadísticas básicas del Sentiment_Score:")
print(df[col_sent].describe().round(3))


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# ============================================================
# --- LIMPIEZA Y ANÁLISIS DE NOTICIAS FOMC CON FINBERT (TF) ---
# ============================================================

import torch
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
import pandas as pd
from tqdm import tqdm

# ====================================================
# --- 1️⃣ Rutas ---
# ====================================================
ruta_origen = "../data/csv/noticias_con_sentimiento.csv"   # CSV con texto completo
ruta_destino = "../data/csv/fomc_sentiment_finbert_clean.csv"  # salida limpia


# ====================================================
# --- 2️⃣ Cargar CSV y seleccionar columnas ---
# ====================================================
df = pd.read_csv(ruta_origen, encoding="utf-8", on_bad_lines="skip")

# Nos quedamos solo con 'Release Date' y 'Text'
df = df[['Release Date', 'Text']].dropna()
df.columns = ['release_date', 'text']
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df = df.dropna(subset=['release_date', 'text'])

print(f"✅ Noticias cargadas: {len(df)} registros")
print(df.head(2))

# ====================================================
# --- 3️⃣ Cargar modelo FinBERT (TensorFlow) ---
# ====================================================
print("\n📦 Cargando modelo FinBERT (ProsusAI/finbert, versión TensorFlow)...")
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

finbert = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    framework="tf",   # importante: fuerza TensorFlow
    return_all_scores=False
)

# ====================================================
# --- 4️⃣ Analizar sentimiento con FinBERT ---
# ====================================================
labels, scores = [], []

print("\n🧠 Analizando sentimiento de cada noticia...\n")

for text in tqdm(df['text'].astype(str), total=len(df)):
    try:
        res = finbert(text[:512])  # FinBERT solo acepta hasta 512 tokens
        labels.append(res[0]['label'].lower())
        scores.append(round(res[0]['score'], 4))
    except Exception:
        labels.append("neutral")
        scores.append(0.0)

# ====================================================
# --- 5️⃣ Crear nuevo CSV limpio ---
# ====================================================
df_clean = pd.DataFrame({
    'release_date': df['release_date'],
    'sentiment_label': labels,
    'sentiment_score': scores
})

df_clean.to_csv(ruta_destino, index=False)
print(f"\n✅ Archivo limpio guardado correctamente en:\n{ruta_destino}")

print(df_clean.head(3))


✅ Noticias cargadas: 419 registros
  release_date                                               text
0   2025-05-28  Minutes of the Federal Open Market Committee\n...
1   2025-05-07  Although swings in net exports have affected t...

📦 Cargando modelo FinBERT (ProsusAI/finbert, versión TensorFlow)...







All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🧠 Analizando sentimiento de cada noticia...



100%|██████████| 419/419 [02:27<00:00,  2.84it/s]


✅ Archivo limpio guardado correctamente en:
C:\Users\josit\CUARTO CURSO\APRENDIZAJE AUTOMATICO\Caso02_Prediccion_BBVA_SANTANDER\data\csv\fomc_sentiment_finbert_clean.csv
  release_date sentiment_label  sentiment_score
0   2025-05-28         neutral           0.3628
1   2025-05-07        positive           0.3796
2   2025-03-19        negative           0.4154





In [5]:
print("Columnas:", df.columns.tolist())
print("Número de filas:", len(df))
df.head(3)


Columnas: ['Date', 'Release Date', 'Type', 'Text', 'Sentiment_Score', 'Sentiment_Label']
Número de filas: 448


Unnamed: 0,Date,Release Date,Type,Text,Sentiment_Score,Sentiment_Label
0,2025-05-07,2025-05-28,Minute,Minutes of the Federal Open Market Committee\n...,0.9999,positive
1,2025-05-07,2025-05-07,Statement,Although swings in net exports have affected t...,0.6249,positive
2,2025-03-19,2025-03-19,Statement,Recent indicators suggest that economic activi...,0.9306,positive


In [6]:
import pandas as pd
from datetime import datetime

# ============================================================
# --- 1️⃣ Cargar CSV original FinBERT limpio ---
# ============================================================
ruta_origen = r"C:\Users\josit\CUARTO CURSO\APRENDIZAJE AUTOMATICO\Caso02_Prediccion_BBVA_SANTANDER\data\csv\fomc_sentiment_finbert_clean.csv"
df = pd.read_csv(ruta_origen)

# Normalizar nombres por si acaso
df.columns = [c.strip().lower() for c in df.columns]
df.rename(columns={"release_date": "release_date"}, inplace=True)

df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
df = df.dropna(subset=["release_date"]).copy()
df["fuente"] = "FOMC real"

# ============================================================
# --- 2️⃣ Crear lista de eventos históricos artificiales ---
# ============================================================
eventos = [
    # 🧨 CRISIS Y DESASTRES ECONÓMICOS
    {"release_date": datetime(2000, 3, 10), "Sentiment_Value": -0.7, "Evento": "Pinchazo burbuja punto-com"},
    {"release_date": datetime(2001, 9, 11), "Sentiment_Value": -1.0, "Evento": "Atentados del 11-S"},
    {"release_date": datetime(2003, 3, 20), "Sentiment_Value": -0.8, "Evento": "Inicio guerra de Irak"},
    {"release_date": datetime(2008, 9, 15), "Sentiment_Value": -1.0, "Evento": "Quiebra Lehman Brothers - crisis financiera global"},
    {"release_date": datetime(2010, 5, 6),  "Sentiment_Value": -0.7, "Evento": "Flash Crash en Wall Street"},
    {"release_date": datetime(2011, 8, 8),  "Sentiment_Value": -0.8, "Evento": "EE. UU. pierde calificación AAA"},
    {"release_date": datetime(2012, 6, 9),  "Sentiment_Value": -0.7, "Evento": "Crisis de deuda en España - rescate bancario"},
    {"release_date": datetime(2016, 6, 24), "Sentiment_Value": -0.7, "Evento": "Referéndum Brexit"},
    {"release_date": datetime(2020, 3, 16), "Sentiment_Value": -1.0, "Evento": "Crash global por COVID-19"},
    {"release_date": datetime(2022, 2, 24), "Sentiment_Value": -0.9, "Evento": "Inicio guerra Rusia-Ucrania"},
    {"release_date": datetime(2023, 3, 10), "Sentiment_Value": -0.8, "Evento": "Colapso del Silicon Valley Bank"},

    # 💰 RECUPERACIONES Y POLÍTICAS MONETARIAS POSITIVAS
    {"release_date": datetime(2009, 3, 9),  "Sentiment_Value": +0.8, "Evento": "Inicio de expansión cuantitativa (QE) - FED"},
    {"release_date": datetime(2012, 7, 26), "Sentiment_Value": +0.7, "Evento": "Draghi: 'Haré lo que sea necesario' (BCE)"},
    {"release_date": datetime(2020, 3, 23), "Sentiment_Value": +0.9, "Evento": "FED lanza estímulos masivos durante COVID"},
    {"release_date": datetime(2021, 11, 8), "Sentiment_Value": +0.7, "Evento": "Reapertura económica global post-COVID"},
    {"release_date": datetime(2023, 11, 1), "Sentiment_Value": +0.5, "Evento": "FED desacelera subidas de tipos"},

    # 🌍 OTROS EVENTOS GLOBALES RELEVANTES
    {"release_date": datetime(2007, 6, 29), "Sentiment_Value": +0.5, "Evento": "Lanzamiento del iPhone - auge tecnológico"},
    {"release_date": datetime(2019, 3, 11), "Sentiment_Value": -0.8, "Evento": "Muerte del presidente del Santander"},
    {"release_date": datetime(2021, 1, 6),  "Sentiment_Value": -0.5, "Evento": "Asalto al Capitolio de EE. UU."},
    {"release_date": datetime(2024, 11, 5), "Sentiment_Value": -0.4, "Evento": "Elecciones presidenciales polarizadas EE. UU."},
    {"release_date": datetime(2025, 2, 10), "Sentiment_Value": +0.6, "Evento": "Mercados optimistas ante bajada de tipos 2025"}
]

df_eventos = pd.DataFrame(eventos)

# ============================================================
# --- 3️⃣ Calcular label y score coherentes ---
# ============================================================

def label_from_value(v, thr=0.05):
    if v > thr: return "positive"
    elif v < -thr: return "negative"
    else: return "neutral"

def score_from_value(v):
    return abs(round(float(v), 4))

df_eventos["sentiment_label"] = df_eventos["Sentiment_Value"].apply(label_from_value)
df_eventos["sentiment_score"] = df_eventos["Sentiment_Value"].apply(score_from_value)
df_eventos["fuente"] = "Histórico artificial"

# ============================================================
# --- 4️⃣ Unificar estructura (igual que el CSV original) ---
# ============================================================

# Seleccionar columnas en el orden original + las nuevas
columnas_finales = [
    "release_date", "sentiment_label", "sentiment_score",
    "fuente", "Sentiment_Value", "Evento"
]

df_extendido = pd.concat([df, df_eventos[columnas_finales]], ignore_index=True)
df_extendido = df_extendido.sort_values("release_date").reset_index(drop=True)

# ============================================================
# --- 5️⃣ Guardar CSV final extendido ---
# ============================================================
ruta_guardado = r"C:\Users\josit\CUARTO CURSO\APRENDIZAJE AUTOMATICO\Caso02_Prediccion_BBVA_SANTANDER\data\csv\fomc_sentiment_extended_recent.csv"
df_extendido.to_csv(ruta_guardado, index=False)

print(f"✅ Dataset extendido correctamente con eventos 2000–2025 guardado en:\n{ruta_guardado}")
print("\n📊 Vista previa:")
print(df_extendido.tail(10))


✅ Dataset extendido correctamente con eventos 2000–2025 guardado en:
C:\Users\josit\CUARTO CURSO\APRENDIZAJE AUTOMATICO\Caso02_Prediccion_BBVA_SANTANDER\data\csv\fomc_sentiment_extended_recent.csv

📊 Vista previa:
    release_date sentiment_label  sentiment_score                fuente  \
430   2024-11-26         neutral           0.3650             FOMC real   
431   2024-12-18        negative           0.4177             FOMC real   
432   2025-01-08         neutral           0.3661             FOMC real   
433   2025-01-29        positive           0.3751             FOMC real   
434   2025-02-10        positive           0.6000  Histórico artificial   
435   2025-02-19         neutral           0.3728             FOMC real   
436   2025-03-19        negative           0.4154             FOMC real   
437   2025-04-09         neutral           0.3606             FOMC real   
438   2025-05-07        positive           0.3796             FOMC real   
439   2025-05-28         neutral    