In [69]:
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import ta

In [2]:
# Make sure the VADER lexicon is available locally before the analyzer is built.
nltk.download('vader_lexicon')
# Module-level analyzer instance; score_sentiment() reads this name as a global.
sid = SentimentIntensityAnalyzer()

# Load environment variables from a .env file (intended for Alpaca credentials).
# NOTE(review): the recorded output of this cell is `False` and no visible cell
# uses Alpaca - confirm a .env file is actually present and still needed.
load_dotenv()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/niroren/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


False

In [59]:
def score_sentiment(news_df):
    """Add a ``sentiment`` column holding each headline's compound polarity score.

    The score is taken from the module-level analyzer ``sid`` via
    ``sid.polarity_scores(headline)["compound"]``.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Must contain a ``headline`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``news_df`` object, now carrying the ``sentiment`` column.
    """
    def _compound(headline):
        # Keep only the aggregate "compound" entry of the polarity dict.
        return sid.polarity_scores(headline)["compound"]

    news_df["sentiment"] = news_df["headline"].apply(_compound)
    print("done")
    return news_df

In [137]:
# Read the cleaned news export and discard the leftover 'Unnamed: 0' column.
raw_news = pd.read_csv("data/cleaned_news.csv")
clean = raw_news.drop(columns=['Unnamed: 0'])

# Score every headline, then derive a calendar-date key from the parsed timestamp.
# score_sentiment() works in place, so `news_df` and `clean` are the same object.
news_df = score_sentiment(clean)
news_df["created_at"] = pd.to_datetime(news_df["created_at"])
news_df["date"] = news_df["created_at"].dt.date

done


In [11]:
# Symbols to analyse: each is matched against the news `symbols` column and
# must have a price file at data/<ticker>.csv.
tickers = ['AAPL','GOOGL']

In [138]:
# Per-ticker article subsets and their mean sentiment for each calendar day.
ticker_news, daily_sentiment = {}, {}
for symbol in tickers:
    # `symbols` is searched as text, so look for the quoted ticker (e.g. 'AAPL')
    # rather than the bare letters.
    quoted = f"'{symbol}'"
    mentions = news_df["symbols"].apply(lambda raw: quoted in raw)
    articles = news_df[mentions]
    ticker_news[symbol] = articles

    # Average sentiment of all of this ticker's articles sharing a date.
    daily_sentiment[symbol] = articles.groupby("date")["sentiment"].mean()


In [202]:
# Build one frame per ticker: daily prices + mean daily news sentiment + technical indicators.

price_dfs = {}
for ticker in tickers:
    prices = pd.read_csv(f'data/{ticker}.csv').set_index('Date')
    prices.index = pd.to_datetime(prices.index, format="%Y-%m-%d")

    # sort_index() returns a new frame, so its result must be kept. diff/shift and
    # every indicator below assume the rows are in chronological order.
    prices = prices.sort_index(ascending=True)

    # daily_sentiment is keyed by datetime.date objects. Convert a copy to Timestamps
    # so the index-aligned assignment matches the DatetimeIndex explicitly, without
    # mutating daily_sentiment itself. Days with no news stay NaN.
    sentiment = daily_sentiment[ticker].copy()
    sentiment.index = pd.to_datetime(sentiment.index)
    prices['news_sentiment'] = sentiment

    close = prices['Close']
    high = prices['High']
    low = prices['Low']
    volume = prices['Volume']

    # NOTE: despite the names, these are absolute Close-to-Close changes (diff),
    # not percentage returns. fwd_return is the next row's change, i.e. the target.
    prices['return'] = close.diff()
    prices['fwd_return'] = close.diff().shift(-1)

    # EMA
    prices['ema'] = ta.trend.ema_indicator(close, window=20)

    # RSI
    prices['rsi_14'] = ta.momentum.rsi(close, window=14)

    # PVT
    prices['pvt'] = ta.volume.volume_price_trend(close, volume)

    # MACD
    prices['macd'] = ta.trend.macd(close)
    prices['macd_signal'] = ta.trend.macd_signal(close)
    prices['macd_diff'] = ta.trend.macd_diff(close)

    # SMA, Williams %R, OBV, ATR
    prices['sma_20'] = ta.trend.sma_indicator(close, window=20)
    prices['williams_r'] = ta.momentum.williams_r(high, low, close, lbp=14)
    prices['obv'] = ta.volume.on_balance_volume(close, volume)
    prices['atr'] = ta.volatility.average_true_range(high, low, close, window=14)

    price_dfs[ticker] = prices

In [204]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [209]:
# Work on a private copy of the AAPL frame so price_dfs stays untouched on re-runs.
df = price_dfs['AAPL'].copy()

# Drop rows with NaNs in ANY column: indicator warm-up rows, days without news
# sentiment, and the final row whose fwd_return is undefined.
df = df.dropna()

# Define features and target
features = [
    "Open", "High", "Low", "Close", "Volume", "news_sentiment", "return",
    "ema", "rsi_14", "pvt", "macd", "macd_signal", "macd_diff"
]
target = "fwd_return"

# Take an explicit copy: df[features] may be a view-like slice of df, and assigning
# columns on it raises SettingWithCopyWarning (and may not do what is intended).
X = df[features].copy()
y = df[target]

# Rescale Volume and pvt to millions (they are orders of magnitude larger than the rest).
X["Volume"] = X["Volume"] / 1e6
X["pvt"] = X["pvt"] / 1e6

# Time-based split, no shuffling: first 80% of rows train, last 20% test.
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Volume"] = X["Volume"] / 1e6
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["pvt"] = X["pvt"] / 1e6


In [210]:
# Hyper-parameters gathered in one place; random_state is fixed for repeatable fits.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 4,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)

# Fit on the earlier rows, then predict the target for the held-out later rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [211]:
# Show the model's predictions for the held-out rows (same units as `fwd_return`).
y_pred

array([ 6.6049182e-01,  2.2158876e-01,  7.1578157e-01,  7.6225597e-01,
        5.8112592e-01,  2.4590573e+00,  5.1995534e-01, -1.0435242e-01,
        6.1390054e-01,  1.8669962e+00, -1.5154355e+00, -4.6582937e-01,
        1.9057027e-01,  3.2402429e-01,  1.5315729e-01,  9.1165364e-01,
       -9.8593369e-02,  2.1254475e-01,  4.0598592e-01,  6.7641157e-01,
       -1.1565353e+00,  5.7785350e-01,  5.6576818e-01,  2.5070670e-01,
        2.3274438e+00, -1.0262272e-01,  6.4755130e-01,  4.9240738e-01,
        4.0125403e-01, -6.6743791e-01,  4.1415945e-01, -4.6165106e-01,
        2.7586466e-01, -2.7625808e-01, -5.9576485e-02,  6.7158413e-01,
        1.4886911e+00,  1.6562418e+00,  1.2059078e+00,  3.5240948e+00,
        1.6342069e+00,  1.9375148e+00,  1.0285567e+00,  1.2806182e+00,
        1.0943863e+00,  8.3391112e-01,  2.6221514e-01,  1.3098198e+00,
        8.4363443e-01, -8.7077208e-02, -2.1639802e-03, -8.8602453e-01,
       -8.8913018e-01, -4.2129552e-01, -7.6877469e-01, -8.7907451e-01,
      

In [212]:
# Put actual and predicted next-step changes side by side.
results = pd.DataFrame(y_test).assign(pred=y_pred)

# Reduce both to their sign (-1, 0, +1) so only the direction is compared.
results["actual_dir"] = np.sign(results["fwd_return"])
results["pred_dir"] = np.sign(results["pred"])

# Directional accuracy: share of held-out rows where the predicted sign matches the actual one.
direction_hits = results["actual_dir"] == results["pred_dir"]
len(results[direction_hits]) / len(results)

0.5590062111801242