In [20]:
#############
##LIBRARIES##
#############

import yfinance as yf
from finvizfinance.quote import finvizfinance

from statsmodels.tsa.statespace.sarimax import SARIMAX

import plotly.graph_objects as go
import pandas as pd
import numpy as np

import holidays

from langchain_community.llms import Ollama


import streamlit as st


llm = Ollama(model='llama3')

def classify_sentiment(title):
    output = llm.invoke(f"Classify the sentiment as 'POSITIVE' or 'NEGATIVE' or 'NEUTRAL' with just that one")
    return output.strip()



def classify_sentiment_batch(titles):
    print(f"🔹 Clasificando {len(titles)} títulos de noticias.")

    prompt = (
        "For each news title below, classify the sentiment as 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'.\n"
        "Return exactly one sentiment per title, in the same order as the titles, and NOTHING ELSE.\n"
        "Make sure to return exactly the same number of lines as the number of news titles.\n"
    )

    prompt += "\n".join(f"- {title}" for title in titles)

    output = llm.invoke(prompt)
    print(f"🔹 Respuesta de Ollama:\n{output}\n")

    valid_sentiments = {"POSITIVE", "NEGATIVE", "NEUTRAL"}
    sentiments = []

    # Dividir la salida en líneas y limpiar los espacios
    for line in output.split("\n"):
        print("Otro")
        line = line.strip().upper()

        sentiment = next((s for s in valid_sentiments if s in line), None)

        if sentiment:
            sentiments.append(sentiment)
        else:
            sentiments.append('NEUTRAL') 

    # Si el número de clasificaciones no coincide con el número de títulos, corregir
    if len(sentiments) != len(titles):
        print(f"ERROR: Ollama devolvió {len(sentiments)} sentimientos en lugar de {len(titles)}")
        
        # Rellenar con "NEUTRAL" si faltan clasificaciones
        while len(sentiments) < len(titles):
            sentiments.append("NEUTRAL")
    print(f"La longitud de los sentiments es {len(sentiments)}")
    print(sentiments)
    return sentiments


def get_news_ticker(ticker):
    stock = finvizfinance(ticker)
    news_df = stock.ticker_news()
    return news_df

# Function to get and process news data
def get_news_data(news_df_original):
    
    news_df =news_df_original.copy()
    print("Conseguimos las noticias")
    news_df['Title'] = news_df['Title'].str.lower()

    # Enviar todas las noticias en un solo prompt para evitar múltiples llamadas lentas
    news_df['sentiment'] = classify_sentiment_batch(news_df['Title'].tolist())

    print("Despues de analizar los sentimientos, los añadimos")
    news_df_sent = news_df.copy()
    news_df_sent = news_df_sent[news_df_sent['sentiment'] != 'NEUTRAL'].copy()
    print(f"La longitud de news_df_sent es {len(news_df_sent)}")
    print("Seguimos con las fechas")
    news_df_sent['Date'] = pd.to_datetime(news_df_sent['Date'])
    news_df_sent['DateOnly'] = news_df_sent['Date'].dt.date

    print("Tenemos news_df_sent")
    print(news_df_sent)

    return news_df_sent

In [None]:
# Function to group and process sentiment data
def process_sentiment_data(news_df):
    """
    Agrupa las noticias por día de cotización y calcula el sentimiento promedio en los últimos 7 días hábiles.
    """
    print(f"Procesamos los datos de news_df con columnas: {news_df.columns}")


    grouped = news_df.groupby(['Trading_Day', 'sentiment']).size().unstack(fill_value=0)
    grouped = grouped.reindex(columns=['POSITIVE', 'NEGATIVE'], fill_value=0)

    print("Grouped inicial")
    print(grouped)

    
    all_trading_days = pd.date_range(start=grouped.index.min(), end=grouped.index.max(), freq='B')
    grouped = grouped.reindex(all_trading_days, fill_value=0)


    grouped['7day_avg_positive'] = grouped['POSITIVE'].rolling('7D', min_periods=1).sum()
    grouped['7day_avg_negative'] = grouped['NEGATIVE'].rolling('7D', min_periods=1).sum()

    grouped['7day_pct_positive'] = grouped['POSITIVE'].expanding().sum() / (grouped['POSITIVE'].expanding().sum() + grouped['NEGATIVE'].expanding().sum())

    result_df = grouped.reset_index().rename(columns={'index': 'Trading_Day'})

    print("Final result_df")
    print(result_df)

    return result_df


# Function to fetch and process stock data
def get_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)  
    stock_data['Pct_Change'] = stock_data['Close'].pct_change() * 100 
    stock_data.columns = stock_data.columns.droplevel('Ticker')
    stock_data.columns.name = None
    return stock_data

#We fill the weekends too
def fill_missing_stock_dates(stock_data):
    all_dates = pd.date_range(start=stock_data.index.min(), end=stock_data.index.max(), freq="D")
    
    stock_data = stock_data.reindex(all_dates)
    return stock_data


# Function to combine sentiment and stock data
def combine_data(result_df, stock_data):
    combined_df = result_df.set_index('Trading_Day').join(stock_data[['Pct_Change']], how='inner')
    combined_df['lagged_7day_pct_positive'] = combined_df['7day_pct_positive'].shift(1)  # Lag sentiment feature

    return combined_df

# Function to calculate Pearson correlation
def calculate_correlation(combined_df):
    correlation_pct_change = combined_df[['lagged_7day_pct_positive', 'Pct_Change']].corr().iloc[0, 1]
    return correlation_pct_change

# Function to get future dates excluding weekends and holidays
def get_future_dates(start_date, num_days):
    print("get future dates")
    us_holidays = holidays.US()
    future_dates = []
    current_date = start_date

    while len(future_dates) < num_days:
        if current_date.weekday() < 5 and current_date not in us_holidays:
            future_dates.append(current_date)
        current_date += pd.Timedelta(days=1)

    return future_dates

# Function to get future dates excluding weekends or holidays from the next day
def get_future_dates_next_day(combined_df, num_days):

    print("get future dates")
    us_holidays = holidays.US()
    future_dates = []


    last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
    current_date = last_real_date + pd.Timedelta(days=1)

    while len(future_dates) < num_days:
        if current_date.weekday() < 5 and current_date not in us_holidays:
            if current_date not in combined_df.index:
                future_dates.append(current_date)
        current_date += pd.Timedelta(days=1)

    return future_dates


def fit_and_forecast(combined_df, function_future_dates=get_future_dates ,forecast_steps=3):
    print("A predecir")
    endog = combined_df['Pct_Change'].dropna() 
    exog = combined_df['lagged_7day_pct_positive'].dropna() 
    print("GOING FOR THE ARIMAX MODEL")
    endog = endog.tail(200)
    exog = exog.loc[endog.index]  

    model = SARIMAX(endog, exog=exog, order=(1, 1, 1))
    fit = model.fit(disp=False, maxiter=50) 

    if function_future_dates == get_future_dates_next_day:
        future_dates = function_future_dates(combined_df, forecast_steps)
    else:
        print(combined_df)
        future_dates = function_future_dates(combined_df.index[-1], forecast_steps)
    
    future_exog = []
    for date in future_dates:
        if date in combined_df.index:
            future_exog.append(combined_df.loc[date, 'lagged_7day_pct_positive'])
        else:
            future_exog.append(combined_df['lagged_7day_pct_positive'].iloc[-1])
    
    future_exog = np.array(future_exog).reshape(-1, 1)

    forecast = fit.get_forecast(steps=forecast_steps, exog=future_exog)
    forecast_mean = forecast.predicted_mean
    forecast_ci = forecast.conf_int()

    return forecast_mean, forecast_ci, future_dates






In [12]:
ticker ='SBUX'

In [13]:
news_df_original = get_news_ticker(ticker)

In [14]:
news_df_original

Unnamed: 0,Date,Title,Link,Source
0,2025-02-20 21:00:00,"Smaller Menus, Better Vibes: How Starbuckss CE...",https://finance.yahoo.com/m/3d366a5e-a0a4-3087...,The Wall Street Journal
1,2025-02-20 15:40:00,Starbucks launches compostable cups in 14 stat...,https://www.nrn.com/quick-service/starbucks-la...,Nation's Restaurant News
2,2025-02-20 09:49:00,Starbucks hoped mobile orders and the drive-th...,https://finance.yahoo.com/news/starbucks-hoped...,Fortune
3,2025-02-20 09:45:00,These Were the 5 Top-Performing Stocks in the ...,https://finance.yahoo.com/m/c242fba6-9817-3913...,Motley Fool
4,2025-02-20 00:01:00,Starbucks CEO Tells Us His Plan to Turn Around...,https://finance.yahoo.com/m/2bfaf6ce-4259-35ae...,The Wall Street Journal
...,...,...,...,...
95,2025-01-30 12:37:00,Starbucks announces bittersweet change you wil...,https://finance.yahoo.com/m/e717cd3c-d160-3010...,TheStreet
96,2025-01-30 12:23:00,Starbucks CEO would rather close down stores t...,https://qz.com/starbucks-ceo-brian-niccol-open...,Quartz
97,2025-01-30 11:40:00,U.S. GDP Grew 2.5% in 2024,https://finance.yahoo.com/m/10e7ecea-d897-39e0...,The Wall Street Journal
98,2025-01-30 11:40:00,"U.S. GDP Grew 2.5% in 2024, but Slowed Slightl...",https://finance.yahoo.com/m/deeac24b-0d46-39c0...,The Wall Street Journal


In [15]:
news_df = get_news_data(news_df_original)


Conseguimos las noticias
🔹 Clasificando 100 títulos de noticias.
🔹 Respuesta de Ollama:
Here are the sentiment classifications:

- POSITIVE
- NEUTRAL
- NEGATIVE
- POSITIVE
- POSITIVE
- POSITIVE
- NEUTRAL
- POSITIVE
- NEUTRAL
- NEGATIVE
- POSITIVE
- POSITIVE
- POSITIVE
- POSITIVE
- NEGATIVE
- POSITIVE
- POSITIVE
- NEUTRAL
- POSITIVE
- NEGATIVE
- POSITIVE
- POSITIVE
- NEGATIVE
- POSITIVE
- NEUTRAL
- POSITIVE
- POSITIVE
- POSITIVE
- POSITIVE
- NEGATIVE
- POSITIVE
- NEGATIVE
- POSITIVE
- NEUTRAL
- POSITIVE
- POSITIVE
- POSITIVE
- POSITIVE
- NEGATIVE
- POSITIVE
- POSITIVE

Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
Otro
ERROR: Ollama devolvió 43 sentimientos en lugar de 100
La longitud de los sentiments es 100
['NEUTRAL', 'NEUTRAL', 'POSITIVE', 'NEUTRAL', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEUTRAL', 'POSITIVE', '

Since it is a cheap model we supposse that the ones that were not categorized are neutral.

In [240]:
news_df_original

Unnamed: 0,Date,Title,Link,Source,sentiment
0,2025-02-20 21:00:00,"smaller menus, better vibes: how starbuckss ce...",https://finance.yahoo.com/m/3d366a5e-a0a4-3087...,The Wall Street Journal,NEUTRAL
1,2025-02-20 15:40:00,starbucks launches compostable cups in 14 stat...,https://www.nrn.com/quick-service/starbucks-la...,Nation's Restaurant News,NEUTRAL
2,2025-02-20 09:49:00,starbucks hoped mobile orders and the drive-th...,https://finance.yahoo.com/news/starbucks-hoped...,Fortune,POSITIVE
3,2025-02-20 09:45:00,these were the 5 top-performing stocks in the ...,https://finance.yahoo.com/m/c242fba6-9817-3913...,Motley Fool,NEUTRAL
4,2025-02-20 00:01:00,starbucks ceo tells us his plan to turn around...,https://finance.yahoo.com/m/2bfaf6ce-4259-35ae...,The Wall Street Journal,NEGATIVE
...,...,...,...,...,...
95,2025-01-30 12:37:00,starbucks announces bittersweet change you wil...,https://finance.yahoo.com/m/e717cd3c-d160-3010...,TheStreet,NEUTRAL
96,2025-01-30 12:23:00,starbucks ceo would rather close down stores t...,https://qz.com/starbucks-ceo-brian-niccol-open...,Quartz,NEUTRAL
97,2025-01-30 11:40:00,u.s. gdp grew 2.5% in 2024,https://finance.yahoo.com/m/10e7ecea-d897-39e0...,The Wall Street Journal,NEUTRAL
98,2025-01-30 11:40:00,"u.s. gdp grew 2.5% in 2024, but slowed slightl...",https://finance.yahoo.com/m/deeac24b-0d46-39c0...,The Wall Street Journal,NEUTRAL


In [241]:
print(news_df.shape)
news_df.columns

(35, 7)


Index(['Date', 'Title', 'Link', 'Source', 'sentiment', 'DateOnly',
       'Trading_Day'],
      dtype='object')

We define a function for the news tweeted in holidays or weekends to be the sentiment for the labour next day

In [242]:
def next_trading_day(stock_dates, news_date):
    news_date = np.datetime64(news_date) 

    pos = np.searchsorted(stock_dates, news_date)
    if pos == len(stock_dates): 
        return stock_dates[-1]
    
    return stock_dates[pos]

def trading_day(stock_data, result_df):
    stock_dates = np.array(stock_data.index)

    result_df['Trading_Day'] = result_df['DateOnly'].apply(lambda date: next_trading_day(stock_dates, date))
    return result_df

In [243]:
#Dates to get the data
start_date = news_df['DateOnly'].min().strftime('%Y-%m-%d')
end_date = news_df['DateOnly'].max().strftime('%Y-%m-%d')

print(start_date)
print(end_date)

stock_data = get_stock_data(ticker, start_date, end_date)

[*********************100%***********************]  1 of 1 completed

2025-02-08
2025-02-20





In [244]:
stock_data = fill_missing_stock_dates(stock_data)

In [245]:
stock_data.tail()

Unnamed: 0,Close,High,Low,Open,Volume,Pct_Change
2025-02-15,,,,,,
2025-02-16,,,,,,
2025-02-17,,,,,,
2025-02-18,113.029999,113.360001,111.620003,111.800003,7574100.0,0.426473
2025-02-19,112.489998,113.339996,112.339996,112.440002,8268900.0,-0.47775


In [246]:
news_df.head()

Unnamed: 0,Date,Title,Link,Source,sentiment,DateOnly,Trading_Day
2,2025-02-20 09:49:00,starbucks hoped mobile orders and the drive-th...,https://finance.yahoo.com/news/starbucks-hoped...,Fortune,POSITIVE,2025-02-20,2025-02-19
4,2025-02-20 00:01:00,starbucks ceo tells us his plan to turn around...,https://finance.yahoo.com/m/2bfaf6ce-4259-35ae...,The Wall Street Journal,NEGATIVE,2025-02-20,2025-02-19
5,2025-02-20 00:01:00,how ceo brian niccol aims to fix starbucks,https://finance.yahoo.com/video/ceo-brian-nicc...,WSJ,POSITIVE,2025-02-20,2025-02-19
6,2025-02-19 16:26:00,"buy chevron, avoid starbucks: good buy or goodbye",https://finance.yahoo.com/video/buy-chevron-av...,Yahoo Finance Video,POSITIVE,2025-02-19,2025-02-19
7,2025-02-19 16:05:00,starbucks to webcast 2025 annual meeting of sh...,https://www.businesswire.com/news/home/2025021...,Business Wire,POSITIVE,2025-02-19,2025-02-19


In [247]:
news_df = trading_day(stock_data, news_df)
news_df.head()

Unnamed: 0,Date,Title,Link,Source,sentiment,DateOnly,Trading_Day
2,2025-02-20 09:49:00,starbucks hoped mobile orders and the drive-th...,https://finance.yahoo.com/news/starbucks-hoped...,Fortune,POSITIVE,2025-02-20,2025-02-19
4,2025-02-20 00:01:00,starbucks ceo tells us his plan to turn around...,https://finance.yahoo.com/m/2bfaf6ce-4259-35ae...,The Wall Street Journal,NEGATIVE,2025-02-20,2025-02-19
5,2025-02-20 00:01:00,how ceo brian niccol aims to fix starbucks,https://finance.yahoo.com/video/ceo-brian-nicc...,WSJ,POSITIVE,2025-02-20,2025-02-19
6,2025-02-19 16:26:00,"buy chevron, avoid starbucks: good buy or goodbye",https://finance.yahoo.com/video/buy-chevron-av...,Yahoo Finance Video,POSITIVE,2025-02-19,2025-02-19
7,2025-02-19 16:05:00,starbucks to webcast 2025 annual meeting of sh...,https://www.businesswire.com/news/home/2025021...,Business Wire,POSITIVE,2025-02-19,2025-02-19


## We process the news_df to calculate the statistics

In [248]:
news_df.columns

Index(['Date', 'Title', 'Link', 'Source', 'sentiment', 'DateOnly',
       'Trading_Day'],
      dtype='object')

In [249]:
result_df = process_sentiment_data(news_df)

Procesamos los datos de news_df con columnas: Index(['Date', 'Title', 'Link', 'Source', 'sentiment', 'DateOnly',
       'Trading_Day'],
      dtype='object')
Grouped inicial
sentiment    POSITIVE  NEGATIVE
Trading_Day                    
2025-02-10          8         2
2025-02-11          4         1
2025-02-12          3         1
2025-02-13          1         1
2025-02-14          3         1
2025-02-16          1         0
2025-02-17          2         0
2025-02-18          0         1
2025-02-19          5         1
Final result_df
sentiment Trading_Day  POSITIVE  NEGATIVE  7day_avg_positive  \
0          2025-02-10         8         2                8.0   
1          2025-02-11         4         1               12.0   
2          2025-02-12         3         1               15.0   
3          2025-02-13         1         1               16.0   
4          2025-02-14         3         1               19.0   
5          2025-02-17         2         0               13.0   
6         

In [250]:
result_df

sentiment,Trading_Day,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive
0,2025-02-10,8,2,8.0,2.0,0.8
1,2025-02-11,4,1,12.0,3.0,0.8
2,2025-02-12,3,1,15.0,4.0,0.789474
3,2025-02-13,1,1,16.0,5.0,0.761905
4,2025-02-14,3,1,19.0,6.0,0.76
5,2025-02-17,2,0,13.0,4.0,0.777778
6,2025-02-18,0,1,9.0,4.0,0.75
7,2025-02-19,5,1,11.0,4.0,0.764706


In [251]:
result_df

sentiment,Trading_Day,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive
0,2025-02-10,8,2,8.0,2.0,0.8
1,2025-02-11,4,1,12.0,3.0,0.8
2,2025-02-12,3,1,15.0,4.0,0.789474
3,2025-02-13,1,1,16.0,5.0,0.761905
4,2025-02-14,3,1,19.0,6.0,0.76
5,2025-02-17,2,0,13.0,4.0,0.777778
6,2025-02-18,0,1,9.0,4.0,0.75
7,2025-02-19,5,1,11.0,4.0,0.764706


We combine the news with the stock data

In [252]:
stock_data

Unnamed: 0,Close,High,Low,Open,Volume,Pct_Change
2025-02-10,110.848366,111.972271,110.400798,111.972271,8348400.0,
2025-02-11,110.430634,111.474968,110.012903,110.281442,4683100.0,-0.37685
2025-02-12,112.429787,112.578978,109.804039,109.963171,7269400.0,1.810325
2025-02-13,112.389999,112.787842,110.927934,112.509354,10046300.0,-0.035389
2025-02-14,112.550003,113.470001,112.029999,112.82,9115000.0,0.142365
2025-02-15,,,,,,
2025-02-16,,,,,,
2025-02-17,,,,,,
2025-02-18,113.029999,113.360001,111.620003,111.800003,7574100.0,0.426473
2025-02-19,112.489998,113.339996,112.339996,112.440002,8268900.0,-0.47775


In [253]:
result_df

sentiment,Trading_Day,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive
0,2025-02-10,8,2,8.0,2.0,0.8
1,2025-02-11,4,1,12.0,3.0,0.8
2,2025-02-12,3,1,15.0,4.0,0.789474
3,2025-02-13,1,1,16.0,5.0,0.761905
4,2025-02-14,3,1,19.0,6.0,0.76
5,2025-02-17,2,0,13.0,4.0,0.777778
6,2025-02-18,0,1,9.0,4.0,0.75
7,2025-02-19,5,1,11.0,4.0,0.764706


In [254]:
combined_df = combine_data(result_df, stock_data)

In [255]:
combined_df

Unnamed: 0,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive,Pct_Change,lagged_7day_pct_positive
2025-02-10,8,2,8.0,2.0,0.8,,
2025-02-11,4,1,12.0,3.0,0.8,-0.37685,0.8
2025-02-12,3,1,15.0,4.0,0.789474,1.810325,0.8
2025-02-13,1,1,16.0,5.0,0.761905,-0.035389,0.789474
2025-02-14,3,1,19.0,6.0,0.76,0.142365,0.761905
2025-02-17,2,0,13.0,4.0,0.777778,,0.76
2025-02-18,0,1,9.0,4.0,0.75,0.426473,0.777778
2025-02-19,5,1,11.0,4.0,0.764706,-0.47775,0.75


Correlation calculation

In [256]:
correlation_pct_change = calculate_correlation(combined_df)

In [257]:
correlation_pct_change

np.float64(0.4578510228122745)

In [258]:
print(f"Pearson correlation between lagged sentiment score and stock percentage change: {correlation_pct_change}")

Pearson correlation between lagged sentiment score and stock percentage change: 0.4578510228122745


In [262]:
forecast_mean, forecast_ci, forecast_index = fit_and_forecast(combined_df, get_future_dates_next_day)

A predecir
GOING FOR THE ARIMAX MODEL
get future dates



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



In [263]:
combined_df

Unnamed: 0,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive,Pct_Change,lagged_7day_pct_positive
2025-02-10,8,2,8.0,2.0,0.8,,
2025-02-11,4,1,12.0,3.0,0.8,-0.37685,0.8
2025-02-12,3,1,15.0,4.0,0.789474,1.810325,0.8
2025-02-13,1,1,16.0,5.0,0.761905,-0.035389,0.789474
2025-02-14,3,1,19.0,6.0,0.76,0.142365,0.761905
2025-02-17,2,0,13.0,4.0,0.777778,,0.76
2025-02-18,0,1,9.0,4.0,0.75,0.426473,0.777778
2025-02-19,5,1,11.0,4.0,0.764706,-0.47775,0.75


We join the prediction with the data

In [286]:
print(forecast_index)
forecast_mean

[Timestamp('2025-02-20 00:00:00'), Timestamp('2025-02-21 00:00:00'), Timestamp('2025-02-24 00:00:00')]


6   -0.180701
7   -0.449917
8   -0.205926
Name: predicted_mean, dtype: float64

In [289]:
print(type(forecast_index))
type(forecast_mean)

<class 'list'>


pandas.core.series.Series

In [311]:
combined_df

Unnamed: 0,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive,Pct_Change,lagged_7day_pct_positive
2025-02-10,8,2,8.0,2.0,0.8,,
2025-02-11,4,1,12.0,3.0,0.8,-0.37685,0.8
2025-02-12,3,1,15.0,4.0,0.789474,1.810325,0.8
2025-02-13,1,1,16.0,5.0,0.761905,-0.035389,0.789474
2025-02-14,3,1,19.0,6.0,0.76,0.142365,0.761905
2025-02-17,2,0,13.0,4.0,0.777778,,0.76
2025-02-18,0,1,9.0,4.0,0.75,0.426473,0.777778
2025-02-19,5,1,11.0,4.0,0.764706,-0.47775,0.75


In [280]:
last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
last_real_value = combined_df['Pct_Change'].dropna().iloc[-1]
print(last_real_date)
print(last_real_value)

2025-02-19 00:00:00
-0.47775008525103857


In [None]:
"""
forecast_mean_from_start = pd.Series(
    [last_real_value] + forecast_mean,
    index=[last_real_date] + forecast_index
)

forecast_index_from_start = forecast_mean_from_start.index

forecast_mean_from_start = pd.concat([pd.Series([last_real_value], index=[forecast_mean.index[0]-1]), forecast_mean])

#Put (0,0) or (-0.5,0.5). We add the last value taht we know with an interval of confidence of (0,0) because we already know the value
forecast_ci_from_start = pd.concat([
    pd.DataFrame({'lower Pct_Change': [0], 'upper Pct_Change': [0]}, index=[forecast_ci.index[0] - 1]), 
    forecast_ci
])

pct_change_mean = combined_df['Pct_Change'].mean()
pct_change_std = combined_df['Pct_Change'].std()

forecast_mean_from_start_std = (forecast_mean_from_start - pct_change_mean) / pct_change_std

forecast_ci_from_start_std = pd.DataFrame({
    'lower Pct_Change': (forecast_ci_from_start['lower Pct_Change'] - pct_change_mean) / pct_change_std,
    'upper Pct_Change': (forecast_ci_from_start['upper Pct_Change'] - pct_change_mean) / pct_change_std
}, index=forecast_ci_from_start.index)
"""

We create the function preprocesing with this

In [None]:
forecast_mean_from_start = pd.Series(
    [last_real_value] + forecast_mean.tolist(),
    index=[last_real_date] + forecast_index
)

forecast_index_from_start = forecast_mean_from_start.index

forecast_mean_from_start = pd.concat([
    pd.Series([last_real_value], index=[last_real_date]),
    forecast_mean
])

forecast_ci_first_point = pd.DataFrame({
    'lower Pct_Change': [last_real_value], 
    'upper Pct_Change': [last_real_value]
}, index=[last_real_date])

forecast_ci_from_start = pd.concat([forecast_ci_first_point, forecast_ci])

pct_change_mean = combined_df['Pct_Change'].mean()
pct_change_std = combined_df['Pct_Change'].std()

forecast_mean_from_start_std = (forecast_mean_from_start - pct_change_mean) / pct_change_std

forecast_ci_from_start_std = pd.DataFrame({
    'lower Pct_Change': (forecast_ci_from_start['lower Pct_Change'] - pct_change_mean) / pct_change_std,
    'upper Pct_Change': (forecast_ci_from_start['upper Pct_Change'] - pct_change_mean) / pct_change_std
}, index=forecast_ci_from_start.index)


In [458]:
print(forecast_ci_from_start)
print(forecast_mean_from_start)
forecast_index_from_start

                     lower Pct_Change  upper Pct_Change
2025-02-19 00:00:00         -0.477750         -0.477750
6                           -0.901450          0.540048
7                           -1.338664          0.438831
8                           -1.280005          0.868152
2025-02-19 00:00:00   -0.477750
6                     -0.180701
7                     -0.449917
8                     -0.205926
dtype: float64


DatetimeIndex(['2025-02-19', '2025-02-20', '2025-02-21', '2025-02-24'], dtype='datetime64[ns]', freq=None)

In [459]:
def create_plot(combined_df, forecast_mean, forecast_ci, forecast_index):


    sentiment_std = (combined_df['7day_pct_positive'] - combined_df['7day_pct_positive'].mean()) / combined_df['7day_pct_positive'].std()

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=sentiment_std,
        name='Standardized Sentiment Proportion',
        line=dict(color='blue'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=combined_df['Pct_Change'],
        name='Stock Pct Change',
        line=dict(color='yellow'),
        yaxis='y2',
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=forecast_index,
        y=forecast_mean,
        name='Forecasted Stock Pct Change',
        line=dict(color='red'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=np.concatenate([forecast_index, forecast_index[::-1]]),
        y=np.concatenate([forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1][::-1]]),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))

    fig.update_layout(
        title='Sentiment Proportion and Stock Percentage Change with Forecast',
        xaxis_title='Date',
        yaxis=dict(
            title=dict(text='Standardized Sentiment Proportion', font=dict(color='yellow'))
        ),
        yaxis2=dict(
            title=dict(text='Stock Pct Change', font=dict(color='yellow')),
            overlaying='y',
            side='right'
        ),
        template='plotly_dark'
    )


    fig.show()


In [460]:
forecast_ci

Unnamed: 0,lower Pct_Change,upper Pct_Change
6,-0.90145,0.540048
7,-1.338664,0.438831
8,-1.280005,0.868152


Prueba

In [475]:
def create_plot(combined_df, forecast_mean, forecast_ci, forecast_index):


    sentiment_std = (combined_df['7day_pct_positive'] - combined_df['7day_pct_positive'].mean()) / combined_df['7day_pct_positive'].std()

    pct_change_std = (combined_df['Pct_Change'] - combined_df['Pct_Change'].mean()) / combined_df['Pct_Change'].std()


    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=sentiment_std,
        name='Standardized Sentiment Proportion',
        line=dict(color='blue'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=pct_change_std,
        name='Stock Pct Change (Standardized)',
        line=dict(color='yellow'),
        yaxis='y2',
        mode='lines+markers'
    ))


    fig.add_trace(go.Scatter(
        x=forecast_index,
        y=forecast_mean,
        name='Forecasted Stock Pct Change',
        line=dict(color='red'),
        yaxis='y2',
        mode='lines+markers'
    ))

    fig.add_trace(go.Scatter(
        x=np.concatenate([forecast_index, forecast_index[::-1]]),
        y=np.concatenate([forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1][::-1]]),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))

    fig.update_layout(
        title='Sentiment Proportion and Stock Percentage Change with Forecast',
        xaxis_title='Date',
        yaxis=dict(
            title=dict(text='Standardized Sentiment Proportion', font=dict(color='yellow'))
        ),
        yaxis2=dict(
            title=dict(text='Stock Pct Change', font=dict(color='yellow')),
            overlaying='y',
            side='right'
        ),
        template='plotly_dark'
    )


    fig.show()


Prueba

In [485]:
def create_plot(combined_df, forecast_mean, forecast_ci, forecast_index):


    sentiment_std = (combined_df['7day_pct_positive'] - combined_df['7day_pct_positive'].mean()) / combined_df['7day_pct_positive'].std()

    pct_change_std = (combined_df['Pct_Change'] - combined_df['Pct_Change'].mean()) / combined_df['Pct_Change'].std()


    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=sentiment_std,
        name='Standardized Sentiment Proportion',
        line=dict(color='blue'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=pct_change_std,
        name='Stock Pct Change (Standardized)',
        line=dict(color='yellow'),
        mode='lines+markers'
    ))


    fig.add_trace(go.Scatter(
        x=forecast_index,
        y=forecast_mean,
        name='Forecasted Stock Pct Change',
        line=dict(color='red'),
        mode='lines+markers'
    ))

    fig.add_trace(go.Scatter(
        x=np.concatenate([forecast_index, forecast_index[::-1]]),
        y=np.concatenate([forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1][::-1]]),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))

    fig.update_layout(
        title='Sentiment Proportion and Stock Percentage Change with Forecast',
        xaxis_title='Date',
        yaxis=dict(
            title=dict(text='Standardized Sentiment Proportion', font=dict(color='yellow'))
        ),
        yaxis2=dict(
            title=dict(text='Stock Pct Change', font=dict(color='yellow')),
            overlaying='y',
            side='right'
        ),
        template='plotly_dark'
    )


    fig.show()

Graficos

In [477]:
create_plot(combined_df, forecast_mean, forecast_ci, forecast_index)

In [494]:
create_plot(combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start)

Graficos

In [478]:
combined_df

Unnamed: 0,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive,Pct_Change,lagged_7day_pct_positive
2025-02-10,8,2,8.0,2.0,0.8,,
2025-02-11,4,1,12.0,3.0,0.8,-0.37685,0.8
2025-02-12,3,1,15.0,4.0,0.789474,1.810325,0.8
2025-02-13,1,1,16.0,5.0,0.761905,-0.035389,0.789474
2025-02-14,3,1,19.0,6.0,0.76,0.142365,0.761905
2025-02-17,2,0,13.0,4.0,0.777778,,0.76
2025-02-18,0,1,9.0,4.0,0.75,0.426473,0.777778
2025-02-19,5,1,11.0,4.0,0.764706,-0.47775,0.75


In [479]:
print(forecast_index_from_start)
forecast_mean_from_start_std

DatetimeIndex(['2025-02-19', '2025-02-20', '2025-02-21', '2025-02-24'], dtype='datetime64[ns]', freq=None)


2025-02-19 00:00:00   -0.869995
6                     -0.514002
7                     -0.836638
8                     -0.544233
dtype: float64

In [480]:
create_plot(combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start)

# Function

We define the functions for the .py file

In [490]:
def preprocessing_data(combined_df):

    last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
    last_real_value = combined_df['Pct_Change'].dropna().iloc[-1]

    forecast_mean_from_start = pd.Series(
    [last_real_value] + forecast_mean.tolist(),
    index=[last_real_date] + forecast_index
    )

    forecast_index_from_start = forecast_mean_from_start.index

    forecast_mean_from_start = pd.concat([
        pd.Series([last_real_value], index=[last_real_date]),
        forecast_mean
    ])

    forecast_ci_first_point = pd.DataFrame({
        'lower Pct_Change': [last_real_value], 
        'upper Pct_Change': [last_real_value]
    }, index=[last_real_date])

    forecast_ci_from_start = pd.concat([forecast_ci_first_point, forecast_ci])

    pct_change_mean = combined_df['Pct_Change'].mean()
    pct_change_std = combined_df['Pct_Change'].std()

    forecast_mean_from_start_std = (forecast_mean_from_start - pct_change_mean) / pct_change_std

    forecast_ci_from_start_std = pd.DataFrame({
        'lower Pct_Change': (forecast_ci_from_start['lower Pct_Change'] - pct_change_mean) / pct_change_std,
        'upper Pct_Change': (forecast_ci_from_start['upper Pct_Change'] - pct_change_mean) / pct_change_std
    }, index=forecast_ci_from_start.index)

    return combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start

In [491]:
def create_plot(combined_df, forecast_mean, forecast_ci, forecast_index):


    sentiment_std = (combined_df['7day_pct_positive'] - combined_df['7day_pct_positive'].mean()) / combined_df['7day_pct_positive'].std()

    pct_change_std = (combined_df['Pct_Change'] - combined_df['Pct_Change'].mean()) / combined_df['Pct_Change'].std()


    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=sentiment_std,
        name='Standardized Sentiment Proportion',
        line=dict(color='blue'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=pct_change_std,
        name='Stock Pct Change (Standardized)',
        line=dict(color='yellow'),
        mode='lines+markers'
    ))


    fig.add_trace(go.Scatter(
        x=forecast_index,
        y=forecast_mean,
        name='Forecasted Stock Pct Change',
        line=dict(color='red'),
        mode='lines+markers'
    ))

    fig.add_trace(go.Scatter(
        x=np.concatenate([forecast_index, forecast_index[::-1]]),
        y=np.concatenate([forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1][::-1]]),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))

    fig.update_layout(
        title='Sentiment Proportion and Stock Percentage Change with Forecast',
        xaxis_title='Date',
        yaxis=dict(
            title=dict(text='Standardized Sentiment Proportion', font=dict(color='yellow'))
        ),
        yaxis2=dict(
            title=dict(text='Stock Pct Change', font=dict(color='yellow')),
            overlaying='y',
            side='right'
        ),
        template='plotly_dark'
    )


    fig.show()

In [492]:
combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start = preprocessing_data(combined_df)

In [493]:
create_plot(combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start)

Streamlit?

In [None]:
"""
# Streamlit app
st.sidebar.title("Predicting Stock Prices by News Sentiment")
ticker = st.sidebar.text_input("Enter stock ticker, SBUX?:", value='SBUX')
run_button = st.sidebar.button("Run Analysis")

if run_button:
    news_df = get_news_data(ticker)
    result_df = process_sentiment_data(news_df)

    start_date = result_df['DateOnly'].min().strftime('%Y-%m-%d')
    end_date = result_df['DateOnly'].max().strftime('%Y-%m-%d')
    stock_data = get_stock_data(ticker, start_date, end_date)

    combined_df = combine_data(result_df, stock_data)
    correlation_pct_change = calculate_correlation(combined_df)

    st.write(f"Pearson correlation between lagged sentiment score and stock percentage change: {correlation_pct_change}")

    forecast_mean, forecast_ci, forecast_index = fit_and_forecast(combined_df, get_future_dates_next_day)

    combined_df, forecast_mean, forecast_ci, forecast_index = preprocessing_data(combined_df)
    create_plot(combined_df, forecast_mean, forecast_ci, forecast_index)

"""

# Final Test

In [6]:
#############
##LIBRARIES##
#############

import re
import math

import yfinance as yf
from finvizfinance.quote import finvizfinance

from statsmodels.tsa.statespace.sarimax import SARIMAX

import plotly.graph_objects as go
import pandas as pd
import numpy as np

import holidays

from langchain_community.llms import Ollama


import streamlit as st

llm = Ollama(model='llama3')

def classify_sentiment(title):
    output = llm.invoke(f"Classify the sentiment as 'POSITIVE' or 'NEGATIVE' or 'NEUTRAL' with just that one")
    return output.strip()



def classify_sentiment_batch(titles):
    print(f"Clasificando {len(titles)} títulos de noticias.")

    prompt = (
        "For each news title below, classify the sentiment as 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'.\n"
        "Return exactly one sentiment per title, in the same order as the titles, and NOTHING ELSE.\n"
        "Make sure to return exactly the same number of lines as the number of news titles.\n"
    )

    prompt += "\n".join(f"- {title}" for title in titles)

    output = llm.invoke(prompt)
    print(f"🔹 Respuesta de Ollama:\n{output}\n")

    valid_sentiments = {"POSITIVE", "NEGATIVE", "NEUTRAL"}
    sentiments = []

    for line in output.split("\n"):
        print("Otro")
        line = line.strip().upper()

        sentiment = next((s for s in valid_sentiments if s in line), None)

        if sentiment:
            sentiments.append(sentiment)
        else:
            sentiments.append('NEUTRAL') 


        
        while len(sentiments) < len(titles):
            sentiments.append("NEUTRAL")
    print(f"La longitud de los sentiments es {len(sentiments)}")
    print(sentiments)
    return sentiments


def get_news_ticker(ticker):
    stock = finvizfinance(ticker)
    news_df = stock.ticker_news()
    return news_df

# Function to get and process news data
def get_news_data(news_df_original):
    
    news_df =news_df_original.copy()
    news_df['Title'] = news_df['Title'].str.lower()


    news_df['sentiment'] = classify_sentiment_batch(news_df['Title'].tolist())

    news_df_sent = news_df.copy()
    news_df_sent = news_df_sent[news_df_sent['sentiment'] != 'NEUTRAL'].copy()

    news_df_sent['Date'] = pd.to_datetime(news_df_sent['Date'])
    news_df_sent['DateOnly'] = news_df_sent['Date'].dt.date


    return news_df_sent


# Function to group and process sentiment data
def process_sentiment_data(news_df):
    """
    Agrupa las noticias por día de cotización y calcula el sentimiento promedio en los últimos 7 días hábiles.
    """
    print(f"Procesamos los datos de news_df con columnas: {news_df.columns}")

    grouped = news_df.groupby(['Trading_Day', 'sentiment']).size().unstack(fill_value=0)
    grouped = grouped.reindex(columns=['POSITIVE', 'NEGATIVE'], fill_value=0)

    print("Grouped inicial")
    print(grouped)

   
    all_trading_days = pd.date_range(start=grouped.index.min(), end=grouped.index.max(), freq='B')
    grouped = grouped.reindex(all_trading_days, fill_value=0)

    grouped['7day_avg_positive'] = grouped['POSITIVE'].rolling('7D', min_periods=1).sum()
    grouped['7day_avg_negative'] = grouped['NEGATIVE'].rolling('7D', min_periods=1).sum()

    grouped['7day_pct_positive'] = grouped['POSITIVE'].expanding().sum() / (grouped['POSITIVE'].expanding().sum() + grouped['NEGATIVE'].expanding().sum())

    result_df = grouped.reset_index().rename(columns={'index': 'Trading_Day'})

    print("Final result_df")
    print(result_df)

    return result_df

# Function to fetch and process stock data
def get_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)  
    stock_data['Pct_Change'] = stock_data['Close'].pct_change() * 100 
    return stock_data


#We fill the weekends too
def fill_missing_stock_dates(stock_data):
    all_dates = pd.date_range(start=stock_data.index.min(), end=stock_data.index.max(), freq="D")
    
    stock_data = stock_data.reindex(all_dates)
    return stock_data



def next_trading_day(stock_dates, news_date):
    news_date = np.datetime64(news_date) 

    pos = np.searchsorted(stock_dates, news_date)
    if pos == len(stock_dates): 
        return stock_dates[-1]
    
    return stock_dates[pos]

def trading_day(stock_data, result_df):
    stock_dates = np.array(stock_data.index)

    result_df['Trading_Day'] = result_df['DateOnly'].apply(lambda date: next_trading_day(stock_dates, date))
    return result_df




# Function to combine sentiment and stock data
def combine_data(result_df, stock_data):
    combined_df = result_df.set_index('DateOnly').join(stock_data[['Pct_Change']], how='inner')
    combined_df['lagged_7day_pct_positive'] = combined_df['7day_pct_positive'].shift(1)  # Lag sentiment feature

    return combined_df

# Function to calculate Pearson correlation
def calculate_correlation(combined_df):
    correlation_pct_change = combined_df[['lagged_7day_pct_positive', 'Pct_Change']].corr().iloc[0, 1]
    return correlation_pct_change

# Function to get future dates excluding weekends and holidays
def get_future_dates(start_date, num_days):
    print("get future dates")
    us_holidays = holidays.US()
    future_dates = []
    current_date = start_date

    while len(future_dates) < num_days:
        if current_date.weekday() < 5 and current_date not in us_holidays:
            future_dates.append(current_date)
        current_date += pd.Timedelta(days=1)

    return future_dates


# Function to get future dates excluding weekends or holidays from the next day
def get_future_dates_next_day(combined_df, num_days):

    print("get future dates")
    us_holidays = holidays.US()
    future_dates = []


    last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
    current_date = last_real_date + pd.Timedelta(days=1)

    while len(future_dates) < num_days:
        if current_date.weekday() < 5 and current_date not in us_holidays:
            if current_date not in combined_df.index:
                future_dates.append(current_date)
        current_date += pd.Timedelta(days=1)

    return future_dates



def fit_and_forecast(combined_df, function_future_dates=get_future_dates ,forecast_steps=3):
    print("A predecir")
    endog = combined_df['Pct_Change'].dropna() 
    exog = combined_df['lagged_7day_pct_positive'].dropna() 
    print("GOING FOR THE ARIMAX MODEL")
    endog = endog.tail(200)
    exog = exog.loc[endog.index]  

    model = SARIMAX(endog, exog=exog, order=(1, 1, 1))
    fit = model.fit(disp=False, maxiter=50) 

    if function_future_dates == get_future_dates_next_day:
        future_dates = function_future_dates(combined_df, forecast_steps)
    else:
        print(combined_df)
        future_dates = function_future_dates(combined_df.index[-1], forecast_steps)
    
    future_exog = []
    for date in future_dates:
        if date in combined_df.index:
            future_exog.append(combined_df.loc[date, 'lagged_7day_pct_positive'])
        else:
            future_exog.append(combined_df['lagged_7day_pct_positive'].iloc[-1])
    
    future_exog = np.array(future_exog).reshape(-1, 1)

    forecast = fit.get_forecast(steps=forecast_steps, exog=future_exog)
    forecast_mean = forecast.predicted_mean
    forecast_ci = forecast.conf_int()

    return forecast_mean, forecast_ci, future_dates



def preprocessing_data(combined_df):

    last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
    last_real_value = combined_df['Pct_Change'].dropna().iloc[-1]

    forecast_mean_from_start = pd.Series(
    [last_real_value] + forecast_mean.tolist(),
    index=[last_real_date] + forecast_index
    )

    forecast_index_from_start = forecast_mean_from_start.index

    forecast_mean_from_start = pd.concat([
        pd.Series([last_real_value], index=[last_real_date]),
        forecast_mean
    ])

    forecast_ci_first_point = pd.DataFrame({
        'lower Pct_Change': [last_real_value], 
        'upper Pct_Change': [last_real_value]
    }, index=[last_real_date])

    forecast_ci_from_start = pd.concat([forecast_ci_first_point, forecast_ci])

    pct_change_mean = combined_df['Pct_Change'].mean()
    pct_change_std = combined_df['Pct_Change'].std()

    forecast_mean_from_start_std = (forecast_mean_from_start - pct_change_mean) / pct_change_std

    forecast_ci_from_start_std = pd.DataFrame({
        'lower Pct_Change': (forecast_ci_from_start['lower Pct_Change'] - pct_change_mean) / pct_change_std,
        'upper Pct_Change': (forecast_ci_from_start['upper Pct_Change'] - pct_change_mean) / pct_change_std
    }, index=forecast_ci_from_start.index)

    return combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start



# Function to create and display plot
def create_plot(combined_df, forecast_mean, forecast_ci, forecast_index):


    sentiment_std = (combined_df['7day_pct_positive'] - combined_df['7day_pct_positive'].mean()) / combined_df['7day_pct_positive'].std()

    pct_change_std = (combined_df['Pct_Change'] - combined_df['Pct_Change'].mean()) / combined_df['Pct_Change'].std()


    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=sentiment_std,
        name='Standardized Sentiment Proportion',
        line=dict(color='blue'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=pct_change_std,
        name='Stock Pct Change (Standardized)',
        line=dict(color='yellow'),
        mode='lines+markers'
    ))


    fig.add_trace(go.Scatter(
        x=forecast_index,
        y=forecast_mean,
        name='Forecasted Stock Pct Change',
        line=dict(color='red'),
        mode='lines+markers'
    ))

    fig.add_trace(go.Scatter(
        x=np.concatenate([forecast_index, forecast_index[::-1]]),
        y=np.concatenate([forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1][::-1]]),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))

    fig.update_layout(
        title='Sentiment Proportion and Stock Percentage Change with Forecast',
        xaxis_title='Date',
        yaxis=dict(
            title=dict(text='Standardized Sentiment Proportion', font=dict(color='yellow'))
        ),
        yaxis2=dict(
            title=dict(text='Stock Pct Change', font=dict(color='yellow')),
            overlaying='y',
            side='right'
        ),
        template='plotly_dark'
    )


    fig.show()

In [7]:
def classify_sentiment(title):
    output = llm.invoke(f"Classify the sentiment as 'POSITIVE' or 'NEGATIVE' or 'NEUTRAL' with just that one")
    return output.strip()



def classify_sentiment_batch(titles, batch_size = 10):
    print(f"Clasificando {len(titles)} títulos de noticias.")


    valid_sentiments = {"POSITIVE", "NEGATIVE", "NEUTRAL"}
    sentiments = ["NEUTRAL"] * len(titles)

    num_batches = math.ceil(len(titles)/batch_size)

    for i in range(num_batches):
        print(f"Batch {i}")
        batch_titles = titles[i * batch_size:(i + 1) * batch_size]

        prompt = (
            "For each news title below, classify the sentiment as 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'.\n"
            "Return exactly one sentiment per title, and a number with the order of the titles in the same order as the titles, and NOTHING ELSE.\n"
            "The answer can only contain a number with the order of the title and the words POSITIVE, NEGATIVE or NEUTRAL.\n"
            "Example:\n"
            "1 - POSITIVE\n"
            "2 - NEGATIVE\n"
            "3 - NEUTRAL\n"
        )

        prompt += "\n".join(f"{idx+1} - {title}" for idx, title in enumerate(batch_titles))

        output = llm.invoke(prompt)
        print(f"🔹 Respuesta de Ollama para el batch {i + 1}/{num_batches}:\n{output}\n")

        for line in output.split("\n"):
            line = line.strip().upper()
            match = re.match(r"(\d+)\s*-\s*(POSITIVE|NEGATIVE|NEUTRAL)", line)
            
            if match:
                index = int(match.group(1)) - 1 + (i * batch_size)  # Convertir a índice global
                sentiment = match.group(2)
                
                if 0 <= index < len(sentiments):  # Verificar que el índice sea válido
                    sentiments[index] = sentiment


        
    while len(sentiments) < len(titles):
        sentiments.append("NEUTRAL")
        
    if len(sentiments) > len(titles):
        sentiments = sentiments[:len(titles)]

    print(f"La longitud de los sentiments es {len(sentiments)}")
    print(f"Classification completed!:\n{sentiments}")
    return sentiments


def get_news_ticker(ticker):
    stock = finvizfinance(ticker)
    news_df = stock.ticker_news()
    return news_df

# Function to get and process news data
def get_news_data(news_df_original):
    
    news_df =news_df_original.copy()
    news_df['Title'] = news_df['Title'].str.lower()


    news_df['sentiment'] = classify_sentiment_batch(news_df['Title'].tolist())

    news_df_sent = news_df.copy()
    news_df_sent = news_df_sent[news_df_sent['sentiment'] != 'NEUTRAL'].copy()

    news_df_sent['Date'] = pd.to_datetime(news_df_sent['Date'])
    news_df_sent['DateOnly'] = news_df_sent['Date'].dt.date


    return news_df_sent

In [9]:
ticker ='SBUX'
news_df_original = get_news_ticker(ticker)


In [10]:
news_df = get_news_data(news_df_original)

Clasificando 100 títulos de noticias.
Batch 0
🔹 Respuesta de Ollama para el batch 1/10:
1 - NEUTRAL
2 - POSITIVE
3 - NEUTRAL
4 - NEGATIVE
5 - POSITIVE
6 - POSITIVE
7 - POSITIVE
8 - NEGATIVE
9 - NEUTRAL
10 - POSITIVE

Batch 1
🔹 Respuesta de Ollama para el batch 2/10:
1 - NEGATIVE
2 - POSITIVE
3 - NEUTRAL
4 - POSITIVE
5 - NEUTRAL
6 - POSITIVE
7 - NEGATIVE
8 - POSITIVE
9 - NEGATIVE
10 - POSITIVE

Batch 2
🔹 Respuesta de Ollama para el batch 3/10:
1 - NEGATIVE
2 - NEGATIVE
3 - NEUTRAL
4 - POSITIVE
5 - NEUTRAL
6 - NEGATIVE
7 - NEGATIVE
8 - NEUTRAL
9 - POSITIVE
10 - NEGATIVE

Batch 3
🔹 Respuesta de Ollama para el batch 4/10:
1 - POSITIVE
2 - NEUTRAL
3 - NEGATIVE
4 - POSITIVE
5 - NEGATIVE
6 - NEUTRAL
7 - POSITIVE
8 - NEGATIVE
9 - NEUTRAL
10 - POSITIVE

Batch 4
🔹 Respuesta de Ollama para el batch 5/10:
1 - NEGATIVE
2 - POSITIVE
3 - NEUTRAL
4 - NEGATIVE
5 - NEGATIVE
6 - NEUTRAL
7 - NEUTRAL
8 - NEGATIVE
9 - NEGATIVE
10 - POSITIVE

Batch 5
🔹 Respuesta de Ollama para el batch 6/10:
1 - NEGATIVE
2 -

In [11]:
news_df

Unnamed: 0,Date,Title,Link,Source,sentiment,DateOnly
1,2025-02-20 21:00:00,"smaller menus, better vibes: how starbuckss ce...",https://finance.yahoo.com/m/3d366a5e-a0a4-3087...,The Wall Street Journal,POSITIVE,2025-02-20
3,2025-02-20 09:49:00,starbucks hoped mobile orders and the drive-th...,https://finance.yahoo.com/news/starbucks-hoped...,Fortune,NEGATIVE,2025-02-20
4,2025-02-20 09:45:00,these were the 5 top-performing stocks in the ...,https://finance.yahoo.com/m/c242fba6-9817-3913...,Motley Fool,POSITIVE,2025-02-20
5,2025-02-20 00:01:00,starbucks ceo tells us his plan to turn around...,https://finance.yahoo.com/m/2bfaf6ce-4259-35ae...,The Wall Street Journal,POSITIVE,2025-02-20
6,2025-02-20 00:01:00,how ceo brian niccol aims to fix starbucks,https://finance.yahoo.com/video/ceo-brian-nicc...,WSJ,POSITIVE,2025-02-20
...,...,...,...,...,...,...
93,2025-01-30 14:28:00,starbucks stock perks up as new ceo pours expe...,https://finance.yahoo.com/m/f1718684-6105-3a94...,Investor's Business Daily,POSITIVE,2025-01-30
94,2025-01-30 13:51:00,epic construction site in the saudi desert is ...,https://finance.yahoo.com/m/54bd398b-08ea-3127...,The Wall Street Journal,NEGATIVE,2025-01-30
95,2025-01-30 13:32:00,starbucks would rather close down stores than ...,https://finance.yahoo.com/news/starbucks-ceo-r...,Quartz,NEGATIVE,2025-01-30
97,2025-01-30 12:23:00,starbucks ceo would rather close down stores t...,https://qz.com/starbucks-ceo-brian-niccol-open...,Quartz,NEGATIVE,2025-01-30


In [135]:
start_date = news_df['DateOnly'].min().strftime('%Y-%m-%d')
end_date = news_df['DateOnly'].max().strftime('%Y-%m-%d')

stock_data = get_stock_data(ticker, start_date, end_date)

[*********************100%***********************]  1 of 1 completed


In [136]:
stock_data.head()

Price,Close,High,Low,Open,Volume,Pct_Change
Ticker,SBUX,SBUX,SBUX,SBUX,SBUX,Unnamed: 6_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2025-01-30,108.411598,109.744361,107.446833,108.411598,15023000,
2025-01-31,107.098724,108.769656,106.919696,108.411598,19142500,-1.211009
2025-02-03,107.576134,107.854621,104.950378,105.149303,8894100,0.445766
2025-02-04,110.002953,110.470417,107.546285,108.083371,11123700,2.255908
2025-02-05,111.644051,111.813131,109.266953,109.585225,12826000,1.491867


In [137]:
stock_data.columns

MultiIndex([(     'Close', 'SBUX'),
            (      'High', 'SBUX'),
            (       'Low', 'SBUX'),
            (      'Open', 'SBUX'),
            (    'Volume', 'SBUX'),
            ('Pct_Change',     '')],
           names=['Price', 'Ticker'])

In [138]:
#We fill the weekends too
def fill_missing_stock_dates(stock_data):
    all_dates = pd.date_range(start=stock_data.index.min(), end=stock_data.index.max(), freq="D")
    
    stock_data = stock_data.reindex(all_dates)
    return stock_data

In [139]:
stock_data = fill_missing_stock_dates(stock_data)

In [140]:
stock_data.head()

Price,Close,High,Low,Open,Volume,Pct_Change
Ticker,SBUX,SBUX,SBUX,SBUX,SBUX,Unnamed: 6_level_1
2025-01-30,108.411598,109.744361,107.446833,108.411598,15023000.0,
2025-01-31,107.098724,108.769656,106.919696,108.411598,19142500.0,-1.211009
2025-02-01,,,,,,
2025-02-02,,,,,,
2025-02-03,107.576134,107.854621,104.950378,105.149303,8894100.0,0.445766


In [141]:
news_df = trading_day(stock_data, news_df)

In [142]:
result_df = process_sentiment_data(news_df)

Procesamos los datos de news_df con columnas: Index(['Date', 'Title', 'Link', 'Source', 'sentiment', 'DateOnly',
       'Trading_Day'],
      dtype='object')
Grouped inicial
sentiment    POSITIVE  NEGATIVE
Trading_Day                    
2025-01-30          2         3
2025-01-31          5         1
2025-02-01          2         0
2025-02-02          0         2
2025-02-03          2         1
2025-02-04          4         2
2025-02-05          3         3
2025-02-06          2         2
2025-02-07          0         2
2025-02-08          0         2
2025-02-09          1         1
2025-02-10          3         2
2025-02-11          2         2
2025-02-12          1         2
2025-02-13          0         2
2025-02-14          2         2
2025-02-16          1         0
2025-02-17          1         0
2025-02-19          6         3
Final result_df
sentiment Trading_Day  POSITIVE  NEGATIVE  7day_avg_positive  \
0          2025-01-30         2         3                2.0   
1         

In [143]:
result_df.head()

sentiment,Trading_Day,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive
0,2025-01-30,2,3,2.0,3.0,0.4
1,2025-01-31,5,1,7.0,4.0,0.636364
2,2025-02-03,2,1,9.0,5.0,0.642857
3,2025-02-04,4,2,13.0,7.0,0.65
4,2025-02-05,3,3,16.0,10.0,0.615385


In [144]:
stock_data.head()

Price,Close,High,Low,Open,Volume,Pct_Change
Ticker,SBUX,SBUX,SBUX,SBUX,SBUX,Unnamed: 6_level_1
2025-01-30,108.411598,109.744361,107.446833,108.411598,15023000.0,
2025-01-31,107.098724,108.769656,106.919696,108.411598,19142500.0,-1.211009
2025-02-01,,,,,,
2025-02-02,,,,,,
2025-02-03,107.576134,107.854621,104.950378,105.149303,8894100.0,0.445766


In [145]:
stock_data.columns = stock_data.columns.droplevel(1)

In [146]:
stock_data.head()

Price,Close,High,Low,Open,Volume,Pct_Change
2025-01-30,108.411598,109.744361,107.446833,108.411598,15023000.0,
2025-01-31,107.098724,108.769656,106.919696,108.411598,19142500.0,-1.211009
2025-02-01,,,,,,
2025-02-02,,,,,,
2025-02-03,107.576134,107.854621,104.950378,105.149303,8894100.0,0.445766


In [147]:
stock_data.columns.name = None

stock_data.index.name = None

In [148]:
stock_data.columns

Index(['Close', 'High', 'Low', 'Open', 'Volume', 'Pct_Change'], dtype='object')

In [149]:
stock_data.head()

Unnamed: 0,Close,High,Low,Open,Volume,Pct_Change
2025-01-30,108.411598,109.744361,107.446833,108.411598,15023000.0,
2025-01-31,107.098724,108.769656,106.919696,108.411598,19142500.0,-1.211009
2025-02-01,,,,,,
2025-02-02,,,,,,
2025-02-03,107.576134,107.854621,104.950378,105.149303,8894100.0,0.445766


In [150]:
def combine_data(result_df, stock_data):
    combined_df = result_df.set_index('Trading_Day').join(stock_data[['Pct_Change']], how='inner')
    combined_df['lagged_7day_pct_positive'] = combined_df['7day_pct_positive'].shift(1)

    return combined_df

In [151]:
combined_df = combine_data(result_df, stock_data)

In [152]:
combined_df.head()

Unnamed: 0,POSITIVE,NEGATIVE,7day_avg_positive,7day_avg_negative,7day_pct_positive,Pct_Change,lagged_7day_pct_positive
2025-01-30,2,3,2.0,3.0,0.4,,
2025-01-31,5,1,7.0,4.0,0.636364,-1.211009,0.4
2025-02-03,2,1,9.0,5.0,0.642857,0.445766,0.636364
2025-02-04,4,2,13.0,7.0,0.65,2.255908,0.642857
2025-02-05,3,3,16.0,10.0,0.615385,1.491867,0.65


In [153]:
correlation_pct_change = calculate_correlation(combined_df)

print(f"Pearson correlation between lagged sentiment score and stock percentage change: {correlation_pct_change}")

Pearson correlation between lagged sentiment score and stock percentage change: 0.6166285005764097


In [154]:

forecast_mean, forecast_ci, forecast_index = fit_and_forecast(combined_df, get_future_dates_next_day)

A predecir
GOING FOR THE ARIMAX MODEL
get future dates


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


In [155]:
combined_df, forecast_mean, forecast_ci, forecast_index = preprocessing_data(combined_df)


In [156]:
create_plot(combined_df, forecast_mean, forecast_ci, forecast_index)

# Functions

In [1]:
#############
##LIBRARIES##
#############

import re
import math

import yfinance as yf
from finvizfinance.quote import finvizfinance

from statsmodels.tsa.statespace.sarimax import SARIMAX

import plotly.graph_objects as go
import pandas as pd
import numpy as np

import holidays

from langchain_community.llms import Ollama


import streamlit as st

llm = Ollama(model='llama3')

def classify_sentiment(title):
    output = llm.invoke(f"Classify the sentiment as 'POSITIVE' or 'NEGATIVE' or 'NEUTRAL' with just that one")
    return output.strip()



def classify_sentiment_batch(titles, batch_size = 10):
    print(f"Clasificando {len(titles)} títulos de noticias.")


    valid_sentiments = {"POSITIVE", "NEGATIVE", "NEUTRAL"}
    sentiments = ["NEUTRAL"] * len(titles)

    num_batches = math.ceil(len(titles)/batch_size)

    for i in range(num_batches):
        print(f"Batch {i}")
        batch_titles = titles[i * batch_size:(i + 1) * batch_size]

        prompt = (
            "For each news title below, classify the sentiment as 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'.\n"
            "Return exactly one sentiment per title, and a number with the order of the titles in the same order as the titles, and NOTHING ELSE.\n"
            "The answer can only contain a number with the order of the title and the words POSITIVE, NEGATIVE or NEUTRAL.\n"
            "Example:\n"
            "1 - POSITIVE\n"
            "2 - NEGATIVE\n"
            "3 - NEUTRAL\n"
        )

        prompt += "\n".join(f"{idx+1} - {title}" for idx, title in enumerate(batch_titles))

        output = llm.invoke(prompt)
        print(f"🔹 Respuesta de Ollama para el batch {i + 1}/{num_batches}:\n{output}\n")

        for line in output.split("\n"):
            line = line.strip().upper()
            match = re.match(r"(\d+)\s*-\s*(POSITIVE|NEGATIVE|NEUTRAL)", line)
            
            if match:
                index = int(match.group(1)) - 1 + (i * batch_size)  # Convertir a índice global
                sentiment = match.group(2)
                
                if 0 <= index < len(sentiments):  # Verificar que el índice sea válido
                    sentiments[index] = sentiment


        
    while len(sentiments) < len(titles):
        sentiments.append("NEUTRAL")
        
    if len(sentiments) > len(titles):
        sentiments = sentiments[:len(titles)]

    print(f"La longitud de los sentiments es {len(sentiments)}")
    print(f"Classification completed!:\n{sentiments}")
    return sentiments


def get_news_ticker(ticker):
    stock = finvizfinance(ticker)
    news_df = stock.ticker_news()
    return news_df

# Function to get and process news data
def get_news_data(news_df_original):
    
    news_df =news_df_original.copy()
    news_df['Title'] = news_df['Title'].str.lower()


    news_df['sentiment'] = classify_sentiment_batch(news_df['Title'].tolist())

    news_df_sent = news_df.copy()
    news_df_sent = news_df_sent[news_df_sent['sentiment'] != 'NEUTRAL'].copy()

    news_df_sent['Date'] = pd.to_datetime(news_df_sent['Date'])
    news_df_sent['DateOnly'] = news_df_sent['Date'].dt.date


    return news_df_sent


# Function to group and process sentiment data
def process_sentiment_data(news_df):
    """
    Agrupa las noticias por día de cotización y calcula el sentimiento promedio en los últimos 7 días hábiles.
    """
    print(f"Procesamos los datos de news_df con columnas: {news_df.columns}")

    grouped = news_df.groupby(['Trading_Day', 'sentiment']).size().unstack(fill_value=0)
    grouped = grouped.reindex(columns=['POSITIVE', 'NEGATIVE'], fill_value=0)

    print("Grouped inicial")
    print(grouped)

   
    all_trading_days = pd.date_range(start=grouped.index.min(), end=grouped.index.max(), freq='B')
    grouped = grouped.reindex(all_trading_days, fill_value=0)

    grouped['7day_avg_positive'] = grouped['POSITIVE'].rolling('7D', min_periods=1).sum()
    grouped['7day_avg_negative'] = grouped['NEGATIVE'].rolling('7D', min_periods=1).sum()

    grouped['7day_pct_positive'] = grouped['POSITIVE'].expanding().sum() / (grouped['POSITIVE'].expanding().sum() + grouped['NEGATIVE'].expanding().sum())

    result_df = grouped.reset_index().rename(columns={'index': 'Trading_Day'})

    print("Final result_df")
    print(result_df)

    return result_df

# Function to fetch and process stock data
def get_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)  
    stock_data['Pct_Change'] = stock_data['Close'].pct_change() * 100 
    return stock_data


#We fill the weekends too
def fill_missing_stock_dates(stock_data):
    all_dates = pd.date_range(start=stock_data.index.min(), end=stock_data.index.max(), freq="D")
    
    stock_data = stock_data.reindex(all_dates)
    return stock_data



def next_trading_day(stock_dates, news_date):
    news_date = np.datetime64(news_date) 

    pos = np.searchsorted(stock_dates, news_date)
    if pos == len(stock_dates): 
        return stock_dates[-1]
    
    return stock_dates[pos]

def trading_day(stock_data, result_df):
    stock_dates = np.array(stock_data.index)

    result_df['Trading_Day'] = result_df['DateOnly'].apply(lambda date: next_trading_day(stock_dates, date))
    return result_df


def preprocess_stock_data(stock_data):
    stock_data.columns = stock_data.columns.droplevel(1)

    stock_data.columns.name = None

    stock_data.index.name = None

    return stock_data




# Function to combine sentiment and stock data
def combine_data(result_df, stock_data):
    combined_df = result_df.set_index('Trading_Day').join(stock_data[['Pct_Change']], how='inner')
    combined_df['lagged_7day_pct_positive'] = combined_df['7day_pct_positive'].shift(1)  # Lag sentiment feature

    return combined_df

# Function to calculate Pearson correlation
def calculate_correlation(combined_df):
    correlation_pct_change = combined_df[['lagged_7day_pct_positive', 'Pct_Change']].corr().iloc[0, 1]
    return correlation_pct_change

# Function to get future dates excluding weekends and holidays
def get_future_dates(start_date, num_days):
    print("get future dates")
    us_holidays = holidays.US()
    future_dates = []
    current_date = start_date

    while len(future_dates) < num_days:
        if current_date.weekday() < 5 and current_date not in us_holidays:
            future_dates.append(current_date)
        current_date += pd.Timedelta(days=1)

    return future_dates


# Function to get future dates excluding weekends or holidays from the next day
def get_future_dates_next_day(combined_df, num_days):

    print("get future dates")
    us_holidays = holidays.US()
    future_dates = []


    last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
    current_date = last_real_date + pd.Timedelta(days=1)

    while len(future_dates) < num_days:
        if current_date.weekday() < 5 and current_date not in us_holidays:
            if current_date not in combined_df.index:
                future_dates.append(current_date)
        current_date += pd.Timedelta(days=1)

    return future_dates



def fit_and_forecast(combined_df, function_future_dates=get_future_dates ,forecast_steps=3):
    print("A predecir")
    endog = combined_df['Pct_Change'].dropna() 
    exog = combined_df['lagged_7day_pct_positive'].dropna() 
    print("GOING FOR THE ARIMAX MODEL")
    endog = endog.tail(200)
    exog = exog.loc[endog.index]  

    model = SARIMAX(endog, exog=exog, order=(1, 1, 1))
    fit = model.fit(disp=False, maxiter=50) 

    if function_future_dates == get_future_dates_next_day:
        future_dates = function_future_dates(combined_df, forecast_steps)
    else:
        print(combined_df)
        future_dates = function_future_dates(combined_df.index[-1], forecast_steps)
    
    future_exog = []
    for date in future_dates:
        if date in combined_df.index:
            future_exog.append(combined_df.loc[date, 'lagged_7day_pct_positive'])
        else:
            future_exog.append(combined_df['lagged_7day_pct_positive'].iloc[-1])
    
    future_exog = np.array(future_exog).reshape(-1, 1)

    forecast = fit.get_forecast(steps=forecast_steps, exog=future_exog)
    forecast_mean = forecast.predicted_mean
    forecast_ci = forecast.conf_int()

    return forecast_mean, forecast_ci, future_dates



def preprocessing_data(combined_df):

    last_real_date = combined_df.dropna(subset=['Pct_Change']).index[-1]
    last_real_value = combined_df['Pct_Change'].dropna().iloc[-1]

    forecast_mean_from_start = pd.Series(
    [last_real_value] + forecast_mean.tolist(),
    index=[last_real_date] + forecast_index
    )

    forecast_index_from_start = forecast_mean_from_start.index

    forecast_mean_from_start = pd.concat([
        pd.Series([last_real_value], index=[last_real_date]),
        forecast_mean
    ])

    forecast_ci_first_point = pd.DataFrame({
        'lower Pct_Change': [last_real_value], 
        'upper Pct_Change': [last_real_value]
    }, index=[last_real_date])

    forecast_ci_from_start = pd.concat([forecast_ci_first_point, forecast_ci])

    pct_change_mean = combined_df['Pct_Change'].mean()
    pct_change_std = combined_df['Pct_Change'].std()

    forecast_mean_from_start_std = (forecast_mean_from_start - pct_change_mean) / pct_change_std

    forecast_ci_from_start_std = pd.DataFrame({
        'lower Pct_Change': (forecast_ci_from_start['lower Pct_Change'] - pct_change_mean) / pct_change_std,
        'upper Pct_Change': (forecast_ci_from_start['upper Pct_Change'] - pct_change_mean) / pct_change_std
    }, index=forecast_ci_from_start.index)

    return combined_df, forecast_mean_from_start_std, forecast_ci_from_start_std, forecast_index_from_start



# Function to create and display plot
def create_plot(combined_df, forecast_mean, forecast_ci, forecast_index):


    sentiment_std = (combined_df['7day_pct_positive'] - combined_df['7day_pct_positive'].mean()) / combined_df['7day_pct_positive'].std()

    pct_change_std = (combined_df['Pct_Change'] - combined_df['Pct_Change'].mean()) / combined_df['Pct_Change'].std()


    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=sentiment_std,
        name='Standardized Sentiment Proportion',
        line=dict(color='blue'),
        mode='lines'
    ))

    fig.add_trace(go.Scatter(
        x=combined_df.index,
        y=pct_change_std,
        name='Stock Pct Change (Standardized)',
        line=dict(color='yellow'),
        mode='lines+markers'
    ))


    fig.add_trace(go.Scatter(
        x=forecast_index,
        y=forecast_mean,
        name='Forecasted Stock Pct Change',
        line=dict(color='red'),
        mode='lines+markers'
    ))

    fig.add_trace(go.Scatter(
        x=np.concatenate([forecast_index, forecast_index[::-1]]),
        y=np.concatenate([forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1][::-1]]),
        fill='toself',
        fillcolor='rgba(255,0,0,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))

    fig.update_layout(
        title='Sentiment Proportion and Stock Percentage Change with Forecast',
        xaxis_title='Date',
        yaxis=dict(
            title=dict(text='Standardized Sentiment Proportion', font=dict(color='yellow'))
        ),
        yaxis2=dict(
            title=dict(text='Stock Pct Change', font=dict(color='yellow')),
            overlaying='y',
            side='right'
        ),
        template='plotly_dark'
    )


    fig.show()

In [2]:
ticker ='SBUX'
news_df_original = get_news_ticker(ticker)


In [3]:
news_df = get_news_data(news_df_original)

Clasificando 100 títulos de noticias.
Batch 0
🔹 Respuesta de Ollama para el batch 1/10:
1 - NEUTRAL
2 - POSITIVE
3 - NEGATIVE
4 - NEGATIVE
5 - NEUTRAL
6 - POSITIVE
7 - POSITIVE
8 - NEGATIVE
9 - NEUTRAL
10 - POSITIVE

Batch 1
🔹 Respuesta de Ollama para el batch 2/10:
1 - NEGATIVE
2 - POSITIVE
3 - POSITIVE
4 - NEUTRAL
5 - NEUTRAL
6 - POSITIVE
7 - NEGATIVE
8 - POSITIVE
9 - NEGATIVE
10 - POSITIVE

Batch 2
🔹 Respuesta de Ollama para el batch 3/10:
1 - NEGATIVE
2 - NEGATIVE
3 - NEUTRAL
4 - POSITIVE
5 - NEUTRAL
6 - NEGATIVE
7 - NEGATIVE
8 - NEUTRAL
9 - POSITIVE
10 - NEGATIVE

Batch 3
🔹 Respuesta de Ollama para el batch 4/10:
1 - POSITIVE
2 - NEUTRAL
3 - NEGATIVE
4 - POSITIVE
5 - NEGATIVE
6 - NEUTRAL
7 - POSITIVE
8 - NEGATIVE
9 - NEUTRAL
10 - POSITIVE

Batch 4
🔹 Respuesta de Ollama para el batch 5/10:
1 - POSITIVE
2 - POSITIVE
3 - NEUTRAL
4 - NEGATIVE
5 - POSITIVE
6 - POSITIVE
7 - NEUTRAL
8 - NEGATIVE
9 - NEGATIVE
10 - POSITIVE

Batch 5
🔹 Respuesta de Ollama para el batch 6/10:
1 - NEGATIVE
2 

In [4]:
start_date = news_df['DateOnly'].min().strftime('%Y-%m-%d')
end_date = news_df['DateOnly'].max().strftime('%Y-%m-%d')

stock_data = get_stock_data(ticker, start_date, end_date)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [5]:
stock_data = fill_missing_stock_dates(stock_data)
news_df = trading_day(stock_data, news_df)
result_df = process_sentiment_data(news_df)



Procesamos los datos de news_df con columnas: Index(['Date', 'Title', 'Link', 'Source', 'sentiment', 'DateOnly',
       'Trading_Day'],
      dtype='object')
Grouped inicial
sentiment    POSITIVE  NEGATIVE
Trading_Day                    
2025-01-30          1         4
2025-01-31          8         1
2025-02-04          3         1
2025-02-05          4         2
2025-02-06          4         2
2025-02-07          1         2
2025-02-08          1         1
2025-02-09          2         0
2025-02-10          3         2
2025-02-11          2         2
2025-02-12          1         2
2025-02-13          0         2
2025-02-14          2         2
2025-02-16          1         0
2025-02-18          1         0
2025-02-19          5         4
Final result_df
sentiment Trading_Day  POSITIVE  NEGATIVE  7day_avg_positive  \
0          2025-01-30         1         4                1.0   
1          2025-01-31         8         1                9.0   
2          2025-02-03         0         0 

In [6]:
stock_data

Price,Close,High,Low,Open,Volume,Pct_Change
Ticker,SBUX,SBUX,SBUX,SBUX,SBUX,Unnamed: 6_level_1
2025-01-30,108.411598,109.744361,107.446833,108.411598,15023000.0,
2025-01-31,107.098724,108.769656,106.919696,108.411598,19142500.0,-1.211009
2025-02-01,,,,,,
2025-02-02,,,,,,
2025-02-03,107.576134,107.854621,104.950378,105.149303,8894100.0,0.445766
2025-02-04,110.002953,110.470417,107.546285,108.083371,11123700.0,2.255908
2025-02-05,111.644051,111.813131,109.266953,109.585225,12826000.0,1.491867
2025-02-06,111.097015,111.703723,110.45053,111.445132,11967300.0,-0.489982
2025-02-07,111.594315,112.012053,109.465867,110.828475,11502400.0,0.447626
2025-02-08,,,,,,


In [7]:
stock_data = preprocess_stock_data(stock_data)

In [8]:
combined_df = combine_data(result_df, stock_data)

correlation_pct_change = calculate_correlation(combined_df)

print(f"Pearson correlation between lagged sentiment score and stock percentage change: {correlation_pct_change}")

Pearson correlation between lagged sentiment score and stock percentage change: 0.476329693329414


In [9]:
forecast_mean, forecast_ci, forecast_index = fit_and_forecast(combined_df, get_future_dates_next_day)

create_plot(combined_df, forecast_mean, forecast_ci, forecast_index)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


A predecir
GOING FOR THE ARIMAX MODEL
get future dates


  return get_prediction_index(
  return get_prediction_index(


# Compare with gpt 3.5 turbo and mistral