1. Imports + Set Up

In [6]:
import yfinance as yf
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


2. Stock + News

In [7]:
def get_stock_data(ticker, start, end, api_key):
    url = (
        f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY"
        f"&symbol={ticker}&outputsize=compact&apikey={api_key}"
    )
    response = requests.get(url)
    data = response.json()

    if "Time Series (Daily)" not in data:
        print(f"[!] Alpha Vantage error: {data}")
        return None

    df = pd.DataFrame.from_dict(data["Time Series (Daily)"], orient="index", dtype=float)
    df = df.rename(columns={
        "1. open": "Open",
        "2. high": "High",
        "3. low": "Low",
        "4. close": "Close",
        "5. volume": "Volume"
    })
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)

    # Filter to match date range
    df = df[(df.index.date >= start) & (df.index.date <= end)]
    df["daily_return"] = df["Close"].pct_change()
    df["volume_change"] = df["Volume"].pct_change()
    return df


def get_news_articles(query, from_date, to_date, api_key="NEWS_API_KEY"):
    url = (
        f'https://newsapi.org/v2/everything?q={query}&from={from_date}&to={to_date}'
        f'&language=en&sortBy=publishedAt&pageSize=100&apiKey={api_key}'
    )
    response = requests.get(url)
    data = response.json()
    return data.get('articles', [])

3. Sentiment

In [8]:
def analyze_sentiment(articles):
    records = []

    for article in articles:
        text = (article.get("title", "") or "") + " " + (article.get("description", "") or "")
        if not text.strip():
            continue

        try:
            result = sentiment_pipeline(text[:512])[0]  # Truncate to 512 tokens max
            label = result["label"]
            score = {"Positive": 1, "Neutral": 0, "Negative": -1}.get(label, 0)
            date = article['publishedAt'][:10]
            records.append((date, score))
        except Exception as e:
            print(f"Skipping article due to error: {e}")
            continue

    df = pd.DataFrame(records, columns=["date", "sentiment"])
    df["date"] = pd.to_datetime(df["date"])
    return df.groupby("date").mean()

4. Model

In [9]:
def merge_and_label(stock_df, sentiment_df):
    df = stock_df.merge(sentiment_df, how='left', left_index=True, right_index=True)
    df['sentiment'] = df['sentiment'].fillna(0)
    df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    return df.dropna()

def train_and_evaluate(df):
    features = ['sentiment', 'daily_return', 'volume_change']
    X = df[features]
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))
    return model

5. Pipeline

In [12]:
# Params
from datetime import datetime, timedelta

end_date = datetime.today().date() - timedelta(days=1)
start_date = end_date - timedelta(days=100)

start_str = start_date.strftime("%Y-%m-%d")
end_str = end_date.strftime("%Y-%m-%d")

ticker = "AAPL"

# Run
stock_df = get_stock_data(ticker, start_date, end_date, api_key="3MOQ4Q8MBBMQ5KCD")
news_articles = get_news_articles(ticker, start_date, end_date, api_key="90d80862968d439ab1f7cb56d1c054a3")
sentiment_df = analyze_sentiment(news_articles)
merged_df = merge_and_label(stock_df, sentiment_df)
model = train_and_evaluate(merged_df)

Accuracy: 0.5714285714285714
              precision    recall  f1-score   support

           0       0.40      0.40      0.40         5
           1       0.67      0.67      0.67         9

    accuracy                           0.57        14
   macro avg       0.53      0.53      0.53        14
weighted avg       0.57      0.57      0.57        14



  df['sentiment'] = df['sentiment'].fillna(0)
