### Notebook *NB01c – Preparación del sentimiento financiero (NLP)*  
**Autor:** Jesús Daniel Romeral Cortina

**Objetivo:**

Extracción y agregación diaria del sentimiento del mercado a partir de noticias financieras: 
Limpieza de noticias. Inferencia con FinBERT. Agregación diaria del sentimiento. Generación de métricas de sentimiento.



In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

2026-01-22 21:20:11.471309: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

NEWS_PATH = "../../datos/sp500_news_kaggle.csv"
OUT_PATH = "../../datos/sentiment_daily.csv"

In [3]:
df_news = pd.read_csv(NEWS_PATH)
df_news.head()

Unnamed: 0,Title,Date,CP
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16
2,2008 predictions for the S&P 500,2008-01-02,1447.16
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18


In [4]:

df_news["Date"] = pd.to_datetime(df_news["Date"], errors="coerce")
df_news["text"] = (df_news["Title"].astype(str).fillna("").str.strip())
df_news = df_news.dropna(subset=["Date"])
df_news = df_news[df_news["text"].str.len() > 0].copy()


df_news.head()


Unnamed: 0,Title,Date,CP,text
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16,"JPMorgan Predicts 2008 Will Be ""Nothing But Net"""
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16,Dow Tallies Biggest First-session-of-year Poin...
2,2008 predictions for the S&P 500,2008-01-02,1447.16,2008 predictions for the S&P 500
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16,"U.S. Stocks Higher After Economic Data, Monsan..."
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18,U.S. Stocks Climb As Hopes Increase For More F...


In [None]:

model_name = "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

print("FinBERT cargado correctamente.")




FinBERT cargado correctamente.


In [6]:
def finbert_predict(texts, batch_size=32):
    out = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        out.extend(finbert(list(batch)))
    return out

texts = df_news["text"].tolist()
pred = finbert_predict(texts, batch_size=32)

df_news["sentiment_label"] = [r["label"] for r in pred]
df_news["sentiment_score"] = [r["score"] for r in pred]
df_news[["sentiment_label", "sentiment_score"]].value_counts().head()


100%|██████████| 598/598 [23:25<00:00,  2.35s/it]


sentiment_label  sentiment_score
Positive         1.000000           447
                 1.000000           166
                 1.000000           101
                 1.000000            90
Neutral          0.999999            79
Name: count, dtype: int64

In [7]:

df_news["sentiment_label"] = df_news["sentiment_label"].str.lower()

label_to_sign = {"positive": 1, "negative": -1, "neutral": 0}
df_news["sentiment_signed"] = df_news["sentiment_label"].map(label_to_sign)*df_news["sentiment_score"]


In [8]:
df_news["day"] = df_news["Date"].dt.normalize()

daily_model = df_news.groupby("day").agg(
    sentiment_mean=("sentiment_signed", "mean"),
    n_news=("sentiment_signed", "count"),
).reset_index().rename(columns={"day":"Date"})

daily_model.sort_values("Date").head()

Unnamed: 0,Date,sentiment_mean,n_news
0,2008-01-02,0.162645,3
1,2008-01-03,0.999999,1
2,2008-01-07,0.983853,1
3,2008-01-09,-0.492317,2
4,2008-01-10,0.0,1


In [None]:
daily_model.to_csv(OUT_PATH, index=False)

if os.path.exists(OUT_PATH): 
    print(f"Archivo guardado correctamente en {OUT_PATH}") 
else: 
    print("Error: el archivo no se ha guardado.")

Archivo guardado correctamente en ../../datos/sentiment_daily.csv


In [11]:



daily_full = df_news.groupby("day").agg(
    sentiment_mean=("sentiment_signed", "mean"),
    sentiment_median=("sentiment_signed", "median"),
    sentiment_std=("sentiment_signed", "std"),
    n_news=("sentiment_signed", "count"),
    pos_ratio=("sentiment_label", lambda x: (x == "positive").mean()),
    neg_ratio=("sentiment_label", lambda x: (x == "negative").mean()),
    neu_ratio=("sentiment_label", lambda x: (x == "neutral").mean()),
).reset_index().rename(columns={"day":"Date"})

daily_full.sort_values("Date").head()


Unnamed: 0,Date,sentiment_mean,sentiment_median,sentiment_std,n_news,pos_ratio,neg_ratio,neu_ratio
0,2008-01-02,0.162645,0.0,0.281709,3,0.333333,0.0,0.666667
1,2008-01-03,0.999999,0.999999,,1,1.0,0.0,0.0
2,2008-01-07,0.983853,0.983853,,1,1.0,0.0,0.0
3,2008-01-09,-0.492317,-0.492317,0.696242,2,0.0,0.5,0.5
4,2008-01-10,0.0,0.0,,1,0.0,0.0,1.0


In [13]:
daily_full.to_csv("../../datos/sentiment_daily_FULL.csv", index=False)
print("Guardado:", "../../datos/sentiment_daily_FULL.csv")


Guardado: ../../datos/sentiment_daily_FULL.csv
