# Project: Sentiment Analysis and Stock Trends Forecast

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import time
import requests
import re
import nltk
import glob
import xgboost
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from textblob import TextBlob
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiangranliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jiangranliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
print(xgboost.__version__)

1.7.6


# Data Preparation

### Fetch S&P500 Companies and Tickers from Wikipedia

There are 503 companies in S&P500.

In [3]:
def fetch_sp500_ticker_to_company():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(url)

    # Parse S&P500 Companies using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'constituents'})

    ticker_to_company = {}

    # Iterate over the rows of the table
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        ticker = cols[0].text.strip() # Ticker
        company_name = cols[1].text.strip() # Company
        ticker_to_company[ticker] = company_name
        
    return ticker_to_company

ticker_to_company_dict = fetch_sp500_ticker_to_company()
len(ticker_to_company_dict)

503

### Fetch Stock Prices from Yahoo Finance

* Changes below -2.5% are labeled "Significantly Decreased."
* Changes between -2.5% and -1% are labeled "Decreased."
* Changes ranging from -1% to 1% are labeled "Neutral."
* Changes between 1% and 2.5% are labeled "Increased."
* Changes above 2.5% are labeled "Significantly Increased."

In [4]:
df_price = pd.DataFrame()
for ticker in ticker_to_company_dict:
    stock_data = yf.download(ticker, start = "2024-12-27", end = "2025-02-01")
    
    stock_data.columns = stock_data.columns.get_level_values(0)
    stock_data.reset_index(inplace = True)
    stock_data["Pct_Change"] = stock_data["Adj Close"].pct_change() * 100
    stock_data["Trend"] = stock_data["Pct_Change"].apply(lambda x: "Significantly Decreased" if x < -2.5 
                                                         else "Decreased" if -2.5 <= x < -1
                                                         else "Neutral" if -1 <= x < 1
                                                         else "Increased" if 1 <= x < 2.5
                                                         else "Significantly Increased")
    stock_data = stock_data.drop(columns = ["Close", "High", "Low", "Open", "Volume"])
    stock_data["Ticker"] = ticker
    stock_data = stock_data.dropna()
    df_price = pd.concat([df_price, stock_data], ignore_index = True)

df_price["Date"] = pd.to_datetime(df_price["Date"]).dt.tz_localize(None)
df_price.head()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Price,Date,Adj Close,Pct_Change,Trend,Ticker
0,2024-12-30,129.130005,-0.806566,Neutral,MMM
1,2024-12-31,129.089996,-0.030983,Neutral,MMM
2,2025-01-02,129.699997,0.472539,Neutral,MMM
3,2025-01-03,129.869995,0.13107,Neutral,MMM
4,2025-01-06,130.289993,0.323399,Neutral,MMM


There are 22 trading days within the specified time range.

In [5]:
df_price["Date"].unique()

array(['2024-12-30T00:00:00.000000000', '2024-12-31T00:00:00.000000000',
       '2025-01-02T00:00:00.000000000', '2025-01-03T00:00:00.000000000',
       '2025-01-06T00:00:00.000000000', '2025-01-07T00:00:00.000000000',
       '2025-01-08T00:00:00.000000000', '2025-01-10T00:00:00.000000000',
       '2025-01-13T00:00:00.000000000', '2025-01-14T00:00:00.000000000',
       '2025-01-15T00:00:00.000000000', '2025-01-16T00:00:00.000000000',
       '2025-01-17T00:00:00.000000000', '2025-01-21T00:00:00.000000000',
       '2025-01-22T00:00:00.000000000', '2025-01-23T00:00:00.000000000',
       '2025-01-24T00:00:00.000000000', '2025-01-27T00:00:00.000000000',
       '2025-01-28T00:00:00.000000000', '2025-01-29T00:00:00.000000000',
       '2025-01-30T00:00:00.000000000', '2025-01-31T00:00:00.000000000'],
      dtype='datetime64[ns]')

### Fetch Financial News from newsapi.org

In [6]:
# API Keys
API_KEYS = ["f21bce77f80644ec8b27415547a0f69f", "8ab509b3fa284525af061e9f4ba3d9df",
            "b9964e7836ce429d8d2450715ea79a7c", "5482d27c6339473abedaa3b5b6371be3"]
API_KEY_INDEX = 0
BASE_URL = "https://newsapi.org/v2/everything"

In [7]:
# Stopwords
custom_stopwords = set(stopwords.words("english"))

In [8]:
# Switch API keys if rate limit is reached
def switch_api_key():
    global API_KEY_INDEX
    API_KEY_INDEX = (API_KEY_INDEX + 1) % len(API_KEYS)
    print(f"Switching to API key {API_KEY_INDEX + 1}.")

In [9]:
# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return ""
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    
    # Remove non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in custom_stopwords] # Remove stopwords
    return " ".join(tokens)

If we retrieve articles just by company names and tickers, many irrelevant results will appear. To filter them effectively, we need functions that calculate the density of the company name and ticker within the article.

In [17]:
# Define a function to calculate keyword density
def keyword_density(text, keywords):
    text = text.lower()
    tokens = text.split()
    keyword_count = sum(1 for word in tokens if word in keywords)
    if len(tokens) > 0:
        return keyword_count / len(tokens) 
    else:
        return 0

In [18]:
# Analyze sentiment
def get_sentiment(text):
    if isinstance(text, str):
        return TextBlob(text).sentiment.polarity
    else:
        return 0

In [19]:
# Determine if an article is relevant
def is_relevant_article(article, ticker, company_name):
    title = article.get("title", "")
    description = article.get("description", "")
    content = article.get("content", "")

    # Ensure all fields are strings before applying .lower()
    title = title.lower() if title else ""
    description = description.lower() if description else ""
    content = content.lower() if content else ""

    # Check if the company name/ticker is in title, description, or content
    return (ticker.lower() in title or 
            ticker.lower() in description or 
            ticker.lower() in content or
            company_name.lower() in title or 
            company_name.lower() in description or 
            company_name.lower() in content)

Now define a function to fetch news for a single company on a specified date, including error handling.

In [20]:
# Fetch news for a single company on a specific date
def fetch_news(ticker, date, company_name):
    global API_KEY_INDEX
    keywords = f'"{ticker}" AND "{company_name}"' # The article must have both ticker and company name
    
    params = {"q": keywords, "from": date, "to": date, "language": "en", "sortBy": "relevancy",
              "pageSize": 50, "apiKey": API_KEYS[API_KEY_INDEX]}
    
    response = requests.get(BASE_URL, params = params)
    
    if response.status_code == 200:
        return response.json().get("articles", [])
    elif response.status_code == 429: # Limit reached for the current API key
        print("Switching API key.")
        switch_api_key()
        time.sleep(1)
        return fetch_news(ticker, date, company_name) # Retry a new API key
    else:
        return []

In [23]:
# Fetch news for all tickers on the specified date
fetch_date = "2025-01-31"
all_news = []

# Fetch and process news articles for each company
for ticker, company_name in ticker_to_company_dict.items():
    print(f"Fetching news for {ticker} ({company_name}) on {fetch_date}.")
    articles = fetch_news(ticker, fetch_date, company_name)
    
    for article in articles:
        if not is_relevant_article(article, ticker, company_name):
            continue  # Skip articles that do not contain the ticker or company name

        content = article.get("content", "No Content")
        cleaned_content = clean_text(content)  # Clean the content
        density_ticker = keyword_density(content, [ticker.lower()])  # Density for ticker
        density_company = keyword_density(content, company_name.lower().split())  # Density for company name
        density_total = density_ticker + density_company

        # set a threshold to filter out unrelated articles
        if density_total > 0.02: 
            all_news.append({"Date": fetch_date, 
                             "Ticker": ticker, 
                             "Company_Name": company_name,
                             "Title": article.get("title", "No Title"), 
                             "Cleaned_Content": cleaned_content,
                             "Density_Ticker": density_ticker, 
                             "Density_Company": density_company,
                             "Sentiment": get_sentiment(cleaned_content), 
                             "Url": article.get("url", "No URL"),
                             "PublishedAt": article.get("publishedAt", "No Date"),
                             "Source": article.get("source", {}).get("name", "Unknown")})
    
    time.sleep(1)

news_df = pd.DataFrame(all_news)
output_filename = f"news_{fetch_date}.csv"
news_df.to_csv(output_filename, index = False)

print(f"Processed and filtered news for {fetch_date} saved to {output_filename}!")

Fetching news for MMM (3M) on 2025-01-31.
Fetching news for AOS (A. O. Smith) on 2025-01-31.
Fetching news for ABT (Abbott Laboratories) on 2025-01-31.
Fetching news for ABBV (AbbVie) on 2025-01-31.
Fetching news for ACN (Accenture) on 2025-01-31.
Fetching news for ADBE (Adobe Inc.) on 2025-01-31.
Fetching news for AMD (Advanced Micro Devices) on 2025-01-31.
Fetching news for AES (AES Corporation) on 2025-01-31.
Fetching news for AFL (Aflac) on 2025-01-31.
Fetching news for A (Agilent Technologies) on 2025-01-31.
Fetching news for APD (Air Products) on 2025-01-31.
Fetching news for ABNB (Airbnb) on 2025-01-31.
Fetching news for AKAM (Akamai Technologies) on 2025-01-31.
Fetching news for ALB (Albemarle Corporation) on 2025-01-31.
Fetching news for ARE (Alexandria Real Estate Equities) on 2025-01-31.
Fetching news for ALGN (Align Technology) on 2025-01-31.
Fetching news for ALLE (Allegion) on 2025-01-31.
Fetching news for LNT (Alliant Energy) on 2025-01-31.
Fetching news for ALL (Allstat

Fetching news for D (Dominion Energy) on 2025-01-31.
Fetching news for DPZ (Domino's) on 2025-01-31.
Fetching news for DOV (Dover Corporation) on 2025-01-31.
Fetching news for DOW (Dow Inc.) on 2025-01-31.
Fetching news for DHI (D. R. Horton) on 2025-01-31.
Fetching news for DTE (DTE Energy) on 2025-01-31.
Fetching news for DUK (Duke Energy) on 2025-01-31.
Fetching news for DD (DuPont) on 2025-01-31.
Fetching news for EMN (Eastman Chemical Company) on 2025-01-31.
Fetching news for ETN (Eaton Corporation) on 2025-01-31.
Fetching news for EBAY (eBay) on 2025-01-31.
Fetching news for ECL (Ecolab) on 2025-01-31.
Fetching news for EIX (Edison International) on 2025-01-31.
Fetching news for EW (Edwards Lifesciences) on 2025-01-31.
Fetching news for EA (Electronic Arts) on 2025-01-31.
Fetching news for ELV (Elevance Health) on 2025-01-31.
Fetching news for EMR (Emerson Electric) on 2025-01-31.
Fetching news for ENPH (Enphase Energy) on 2025-01-31.
Fetching news for ETR (Entergy) on 2025-01-31

Fetching news for MLM (Martin Marietta Materials) on 2025-01-31.
Fetching news for MAS (Masco) on 2025-01-31.
Fetching news for MA (Mastercard) on 2025-01-31.
Fetching news for MTCH (Match Group) on 2025-01-31.
Fetching news for MKC (McCormick & Company) on 2025-01-31.
Fetching news for MCD (McDonald's) on 2025-01-31.
Fetching news for MCK (McKesson Corporation) on 2025-01-31.
Fetching news for MDT (Medtronic) on 2025-01-31.
Fetching news for MRK (Merck & Co.) on 2025-01-31.
Fetching news for META (Meta Platforms) on 2025-01-31.
Fetching news for MET (MetLife) on 2025-01-31.
Fetching news for MTD (Mettler Toledo) on 2025-01-31.
Fetching news for MGM (MGM Resorts) on 2025-01-31.
Fetching news for MCHP (Microchip Technology) on 2025-01-31.
Fetching news for MU (Micron Technology) on 2025-01-31.
Fetching news for MSFT (Microsoft) on 2025-01-31.
Fetching news for MAA (Mid-America Apartment Communities) on 2025-01-31.
Fetching news for MRNA (Moderna) on 2025-01-31.
Fetching news for MHK (Mo

Fetching news for TDG (TransDigm Group) on 2025-01-31.
Fetching news for TRV (Travelers Companies (The)) on 2025-01-31.
Fetching news for TRMB (Trimble Inc.) on 2025-01-31.
Fetching news for TFC (Truist Financial) on 2025-01-31.
Fetching news for TYL (Tyler Technologies) on 2025-01-31.
Fetching news for TSN (Tyson Foods) on 2025-01-31.
Fetching news for USB (U.S. Bancorp) on 2025-01-31.
Fetching news for UBER (Uber) on 2025-01-31.
Fetching news for UDR (UDR, Inc.) on 2025-01-31.
Fetching news for ULTA (Ulta Beauty) on 2025-01-31.
Fetching news for UNP (Union Pacific Corporation) on 2025-01-31.
Fetching news for UAL (United Airlines Holdings) on 2025-01-31.
Fetching news for UPS (United Parcel Service) on 2025-01-31.
Fetching news for URI (United Rentals) on 2025-01-31.
Fetching news for UNH (UnitedHealth Group) on 2025-01-31.
Fetching news for UHS (Universal Health Services) on 2025-01-31.
Fetching news for VLO (Valero Energy) on 2025-01-31.
Fetching news for VTR (Ventas) on 2025-01-31

### Merge Prices and News Data

In [24]:
def process_news_and_price(price_df: pd.DataFrame) -> tuple:
    # Concatenate all the news csv files
    news_files = glob.glob("news_*.csv")
    df_news = pd.concat([pd.read_csv(file) for file in news_files], ignore_index = True)

    # Rows with invalid date entries
    valid_date_pattern = r'^\d{4}-\d{2}-\d{2}$'
    df_news = df_news[df_news['Date'].astype(str).str.match(valid_date_pattern, na = False)]
    df_news['Date'] = pd.to_datetime(df_news['Date'], format = '%Y-%m-%d')
    
    # Organize df_price
    valid_dates = df_news['Date'].unique()
    df_price = price_df[price_df['Date'].isin(valid_dates)]

    return df_news, df_price

In [25]:
df_news, df_price = process_news_and_price(df_price)
df_price

Price,Date,Adj Close,Pct_Change,Trend,Ticker
0,2024-12-30,129.130005,-0.806566,Neutral,MMM
1,2024-12-31,129.089996,-0.030983,Neutral,MMM
2,2025-01-02,129.699997,0.472539,Neutral,MMM
3,2025-01-03,129.869995,0.131070,Neutral,MMM
4,2025-01-06,130.289993,0.323399,Neutral,MMM
...,...,...,...,...,...
11017,2025-01-27,173.029999,2.621433,Significantly Increased,ZTS
11018,2025-01-28,170.419998,-1.508409,Decreased,ZTS
11019,2025-01-29,170.220001,-0.117355,Neutral,ZTS
11020,2025-01-30,172.710007,1.462816,Increased,ZTS


There are some cases that some companies may not have any news on a given day, and some companies have multiple news, so we use an inner join to ensure that all records have both price and news data and all news are included in our dataframe. There are 10,336 records after merging.

In [26]:
df_merged = pd.merge(df_price, df_news, on = ['Date', 'Ticker'], how = "inner")
df_merged.shape[0]

10336

In [27]:
df_merged.head()

Unnamed: 0,Date,Adj Close,Pct_Change,Trend,Ticker,Company_Name,Title,Cleaned_Content,Density_Ticker,Density_Company,Sentiment,Url,PublishedAt,Source
0,2024-12-31,129.089996,-0.030983,Neutral,MMM,3M,Is 3M a Top Stock to Buy for 2025?,years underperformance nyse mmm stock delivere...,0.0,0.028571,0.75,https://biztoc.com/x/e02ba693e47977ea,2024-12-31T11:47:04Z,Biztoc.com
1,2025-01-08,134.529999,1.325596,Increased,MMM,3M,3M Target of Unusually Large Options Trading (...,nyse mmm get free report saw unusually large o...,0.0,0.028571,0.154762,https://www.etfdailynews.com/2025/01/08/3m-tar...,2025-01-08T06:48:46Z,ETF Daily News
2,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,International Assets Investment Management LLC...,international assets investment management llc...,0.0,0.030303,0.1,https://www.etfdailynews.com/2025/01/10/intern...,2025-01-10T10:00:44Z,ETF Daily News
3,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,Barclays Has Lowered Expectations for 3M (NYSE...,nyse mmm get free report target price dropped ...,0.0,0.029412,0.4,https://www.etfdailynews.com/2025/01/10/barcla...,2025-01-10T09:20:52Z,ETF Daily News
4,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,"Nordea Investment Management AB Buys 2,448 Sha...",nordea investment management ab grew stake nys...,0.0,0.028571,0.2,https://www.etfdailynews.com/2025/01/10/nordea...,2025-01-10T08:41:00Z,ETF Daily News


# Use Word2Vec to calculate Embeddings

In [29]:
seed = 42

df_merged = df_merged[df_merged["Cleaned_Content"].notna()]  
sentences = df_merged["Cleaned_Content"].apply(lambda x: x.split()).tolist()

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, size = 100, window = 5, min_count = 1, workers = 4, sg = 1, seed = seed)

# Compute sentence embeddings
def get_sentence_embedding(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis = 0)
    else:
        return np.zeros(model.vector_size)
    
# Apply sentence embeddings
df_merged["Embeddings"] = df_merged["Cleaned_Content"].apply(
    lambda x: get_sentence_embedding(x.split(), word2vec_model))
df_merged.head()

Unnamed: 0,Date,Adj Close,Pct_Change,Trend,Ticker,Company_Name,Title,Cleaned_Content,Density_Ticker,Density_Company,Sentiment,Url,PublishedAt,Source,Embeddings
0,2024-12-31,129.089996,-0.030983,Neutral,MMM,3M,Is 3M a Top Stock to Buy for 2025?,years underperformance nyse mmm stock delivere...,0.0,0.028571,0.75,https://biztoc.com/x/e02ba693e47977ea,2024-12-31T11:47:04Z,Biztoc.com,"[0.0655782, -0.022768958, 0.11069959, -0.07197..."
1,2025-01-08,134.529999,1.325596,Increased,MMM,3M,3M Target of Unusually Large Options Trading (...,nyse mmm get free report saw unusually large o...,0.0,0.028571,0.154762,https://www.etfdailynews.com/2025/01/08/3m-tar...,2025-01-08T06:48:46Z,ETF Daily News,"[0.20044059, -0.017308442, 0.09214182, 0.10744..."
2,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,International Assets Investment Management LLC...,international assets investment management llc...,0.0,0.030303,0.1,https://www.etfdailynews.com/2025/01/10/intern...,2025-01-10T10:00:44Z,ETF Daily News,"[0.27881715, -0.025818255, 0.13487041, 0.25882..."
3,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,Barclays Has Lowered Expectations for 3M (NYSE...,nyse mmm get free report target price dropped ...,0.0,0.029412,0.4,https://www.etfdailynews.com/2025/01/10/barcla...,2025-01-10T09:20:52Z,ETF Daily News,"[0.35062176, 0.14997256, -0.11875903, 0.011174..."
4,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,"Nordea Investment Management AB Buys 2,448 Sha...",nordea investment management ab grew stake nys...,0.0,0.028571,0.2,https://www.etfdailynews.com/2025/01/10/nordea...,2025-01-10T08:41:00Z,ETF Daily News,"[0.29891127, -0.04058986, 0.11056614, 0.226031..."


Now, all 10,336 records have embeddings.

In [30]:
df_merged.head()

Unnamed: 0,Date,Adj Close,Pct_Change,Trend,Ticker,Company_Name,Title,Cleaned_Content,Density_Ticker,Density_Company,Sentiment,Url,PublishedAt,Source,Embeddings
0,2024-12-31,129.089996,-0.030983,Neutral,MMM,3M,Is 3M a Top Stock to Buy for 2025?,years underperformance nyse mmm stock delivere...,0.0,0.028571,0.75,https://biztoc.com/x/e02ba693e47977ea,2024-12-31T11:47:04Z,Biztoc.com,"[0.0655782, -0.022768958, 0.11069959, -0.07197..."
1,2025-01-08,134.529999,1.325596,Increased,MMM,3M,3M Target of Unusually Large Options Trading (...,nyse mmm get free report saw unusually large o...,0.0,0.028571,0.154762,https://www.etfdailynews.com/2025/01/08/3m-tar...,2025-01-08T06:48:46Z,ETF Daily News,"[0.20044059, -0.017308442, 0.09214182, 0.10744..."
2,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,International Assets Investment Management LLC...,international assets investment management llc...,0.0,0.030303,0.1,https://www.etfdailynews.com/2025/01/10/intern...,2025-01-10T10:00:44Z,ETF Daily News,"[0.27881715, -0.025818255, 0.13487041, 0.25882..."
3,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,Barclays Has Lowered Expectations for 3M (NYSE...,nyse mmm get free report target price dropped ...,0.0,0.029412,0.4,https://www.etfdailynews.com/2025/01/10/barcla...,2025-01-10T09:20:52Z,ETF Daily News,"[0.35062176, 0.14997256, -0.11875903, 0.011174..."
4,2025-01-10,131.210007,-2.467845,Decreased,MMM,3M,"Nordea Investment Management AB Buys 2,448 Sha...",nordea investment management ab grew stake nys...,0.0,0.028571,0.2,https://www.etfdailynews.com/2025/01/10/nordea...,2025-01-10T08:41:00Z,ETF Daily News,"[0.29891127, -0.04058986, 0.11056614, 0.226031..."


There could be multiple articles to the same company within a day, so we take the average of the embeddings for each company.

In [31]:
# Aggregate embeddings by taking the mean for each company
aggregated_news = df_merged.groupby(["Date", "Ticker", ])["Embeddings"].apply(
    lambda x: np.mean(np.stack(x), axis = 0)).reset_index()
aggregated_news.rename(columns = {"Embeddings": "Aggregated_Embeddings"}, inplace = True)
aggregated_news.shape[0]

4854

Now each ticker only has one embedding on a certain day.

In [32]:
aggregated_news[aggregated_news['Ticker'] == 'AAPL']

Unnamed: 0,Date,Ticker,Aggregated_Embeddings
0,2024-12-30,AAPL,"[0.2839345, -0.11395664, 0.06545841, 0.1147077..."
99,2024-12-31,AAPL,"[0.28776222, -0.12800653, 0.055249512, 0.11619..."
228,2025-01-02,AAPL,"[0.2227637, -0.13687655, 0.07871343, 0.1124470..."
365,2025-01-03,AAPL,"[0.27805713, -0.122526295, 0.055144105, 0.1227..."
516,2025-01-06,AAPL,"[0.26260996, -0.11783125, 0.06840638, 0.091146..."
769,2025-01-07,AAPL,"[0.254363, -0.12911123, 0.051528934, 0.1286762..."
1014,2025-01-08,AAPL,"[0.28314662, -0.08208701, 0.067079514, 0.14002..."
1236,2025-01-10,AAPL,"[0.26126158, -0.14334746, 0.05285517, 0.117959..."
1463,2025-01-13,AAPL,"[0.25150153, -0.13502286, 0.046275754, 0.11705..."
1689,2025-01-14,AAPL,"[0.24009737, -0.13298705, 0.06780878, 0.114377..."


In [33]:
# Take a look at the first embedding to see if it looks correct
first_embedding = aggregated_news["Aggregated_Embeddings"].iloc[0]
first_embedding

array([ 2.8393450e-01, -1.1395664e-01,  6.5458409e-02,  1.1470773e-01,
        2.7478755e-01,  2.9227015e-01, -3.5073161e-01,  4.5298710e-01,
       -8.9568747e-03, -1.5387256e-01, -1.4358388e-01,  1.7257239e-01,
        1.7823840e-02,  3.1008363e-01,  1.0935153e-01, -9.2633680e-02,
       -4.1375417e-04,  1.7213710e-01,  3.9959887e-01, -2.4713920e-02,
        2.4659416e-01, -2.6368552e-03,  1.4772511e-01, -2.6821390e-01,
       -2.1912874e-01,  3.6082318e-01, -3.8405636e-01,  1.8659259e-01,
        1.3724102e-01,  2.9166132e-01, -3.6439395e-01,  5.8012134e-03,
       -2.2961426e-01, -1.2113942e-01,  1.4537701e-01, -4.9547794e-01,
       -1.2552068e-01, -2.7270952e-01, -2.4836341e-01,  2.2360812e-01,
        6.1648570e-02, -3.1307894e-01, -2.1813324e-01, -8.9085558e-03,
        2.0510271e-01, -8.8989928e-02, -4.6144310e-01,  5.2446824e-01,
        3.5533530e-01, -2.5327632e-03, -5.1424719e-02,  2.8051195e-01,
        3.0256420e-01, -5.4090238e-01,  4.1556738e-02, -3.7195843e-03,
      

In [34]:
# Merge the aggregated features back into df_price to create df
df = aggregated_news.merge(df_price, on = ["Ticker", "Date"], how = "inner")
df = df.sort_values(by = "Date").reset_index(drop = True)
df.head()

Unnamed: 0,Date,Ticker,Aggregated_Embeddings,Adj Close,Pct_Change,Trend
0,2024-12-30,AAPL,"[0.2839345, -0.11395664, 0.06545841, 0.1147077...",252.199997,-1.326343,Decreased
1,2024-12-30,ON,"[0.11126511, -0.017620947, 0.032517523, -0.117...",63.459999,-3.863056,Significantly Decreased
2,2024-12-30,NVDA,"[0.14468181, -0.080395475, 0.04517415, 0.00973...",137.490005,0.350347,Neutral
3,2024-12-30,NEE,"[0.13733919, 0.06469267, 0.27831322, -0.150761...",71.760002,-0.485367,Neutral
4,2024-12-30,NDSN,"[0.3174634, -0.010491661, 0.00015380916, 0.075...",208.051529,-1.154922,Decreased


Check class distribution after merging. The classes are imbalanced, we will see if it's necessary to apply oversampling or SMOTE in later training.

In [35]:
df['Trend'].value_counts()

Neutral                    2441
Increased                   990
Decreased                   714
Significantly Increased     412
Significantly Decreased     297
Name: Trend, dtype: int64

# Model Training

To prevent data leakage, we perform expanding window cross validation with 20 days of data. The rest 2 days are for final validation on a holdout set.

* 1st Fold: Use day 1-8 for training and day 9-12 for testing.
* 2nd Fold: Use day 1-10 for training and day 11-14 for testing.
* 3rd Fold: Use day 1-12 for training and day 13-16 for testing.
* 4th Fold: Use day 1-14 for training and day 15-28 for testing.
* 5th Fold: Use day 1-16 for training and day 17-20 for testing.

In [36]:
# Unique tickers and dates
unique_tickers = df["Ticker"].unique()
unique_dates = sorted(df["Date"].unique())

# Set aside 50 tickers for final validation
np.random.seed(42)
holdout_tickers = np.random.choice(unique_tickers, size = 50, replace = False)

# Split df into a set for modeling and a holdout set for validation
df_validation = df[df["Ticker"].isin(holdout_tickers)]  # Holdout tickers
df_modeling = df[~df["Ticker"].isin(holdout_tickers)]   # Used for CV (Train & Test)

# Set aside last 2 days for final validation
validation_dates = unique_dates[-2:]  # Last 2 days
df_validation = df_validation[df_validation["Date"].isin(validation_dates)]

# Use first 20 days and 453 tickers for Expanding Window CV
df_modeling = df_modeling[df_modeling["Date"].isin(unique_dates[:20])]

Convert embeddings vector to 100 columns for the holdout set

In [37]:
# Validation on df_validation (Last 2 Days)
X_validation = df_validation.drop(["Trend", "Date", "Ticker"], axis = 1)
y_validation = df_validation["Trend"]

# Convert embeddings vector to 100 columns for the holdout set
if "Aggregated_Embeddings" in X_validation.columns and not X_validation["Aggregated_Embeddings"].isna().all():
    X_embeddings_validation = np.stack(X_validation["Aggregated_Embeddings"])
    X_validation = pd.concat([X_validation.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                              pd.DataFrame(X_embeddings_validation)], axis = 1)

# Ensure numeric columns only
X_validation = X_validation.select_dtypes(include = [np.number])
X_validation.columns = X_validation.columns.astype(str)

Define Expanding Window CV Folds

In [38]:
folds = [(0, 8, 9, 12), (0, 10, 11, 14), (0, 12, 13, 16), (0, 14, 15, 18), (0, 16, 17, 20)]

### Logistic Regression

We first trained a logistic regression model and noticed that applying SMOTE resulted in unusually good performance. This could be becuase SMOTE creates new synthetic samples based on existing minority class instances.
These synthetic samples are interpolations, which may lack real-world variability and make classification too easy for the model.

As the class distribution is not highly imbalanced, we decide not to use SMOTE.

The model has a Mean Expanding Window CV Macro-F1 Score: 0.94. The logistic regression model is doing very well.

In [41]:
cv_scores = []

for fold_idx, (train_start, train_end, test_start, test_end) in enumerate(folds):
    train_dates = unique_dates[train_start:train_end]  # Training range
    test_dates = unique_dates[test_start:test_end]  # Testing range

    # Create masks to filter data
    train_mask = df_modeling["Date"].isin(train_dates)
    test_mask = df_modeling["Date"].isin(test_dates)

    # Apply masks before extracting features
    X_train = df_modeling.loc[train_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_train = df_modeling.loc[train_mask]["Trend"].reset_index(drop = True)

    X_test = df_modeling.loc[test_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_test = df_modeling.loc[test_mask]["Trend"].reset_index(drop = True)

    # Convert embeddings
    if "Aggregated_Embeddings" in X_train.columns and not X_train["Aggregated_Embeddings"].isna().all():
        X_embeddings_train = np.stack(X_train["Aggregated_Embeddings"])
        X_train = pd.concat([X_train.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                             pd.DataFrame(X_embeddings_train)], axis = 1)

    if "Aggregated_Embeddings" in X_test.columns and not X_test["Aggregated_Embeddings"].isna().all():
        X_embeddings_test = np.stack(X_test["Aggregated_Embeddings"])
        X_test = pd.concat([X_test.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                            pd.DataFrame(X_embeddings_test)], axis = 1)

    # Ensure only numeric columns
    X_train = X_train.select_dtypes(include = [np.number])
    X_train.columns = X_train.columns.astype(str)

    X_test = X_test.select_dtypes(include = [np.number])
    X_test.columns = X_test.columns.astype(str)

    # Apply SMOTE to balance training data
    # smote = SMOTE(random_state = 42)
    # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    # Print class distribution after SMOTE
    # unique, counts = np.unique(y_train_resampled, return_counts = True)
    # print(f"Class distribution after SMOTE for Fold {fold_idx + 1}: {dict(zip(unique, counts))}")

    # Define and train Logistic Regression model
    lr = LogisticRegression(max_iter = 1000, class_weight = "balanced", random_state = 42)
    lr.fit(X_train, y_train)

    # Evaluate model
    y_pred = lr.predict(X_test)
    f1 = f1_score(y_test, y_pred, average = "macro")
    cv_scores.append(f1)

    print(f"Results for Fold {fold_idx + 1}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("------------------------------------------------------------")

print("Expanding Window CV Macro-F1 Scores:", cv_scores)
print("Mean Expanding Window CV Macro-F1 Score:", np.mean(cv_scores))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Fold 1:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.79      1.00      0.88        55
              Increased       0.84      0.99      0.91       220
                Neutral       1.00      0.84      0.92       300
Significantly Decreased       1.00      0.92      0.96        13
Significantly Increased       0.97      0.91      0.94        94

               accuracy                           0.91       682
              macro avg       0.92      0.93      0.92       682
           weighted avg       0.93      0.91      0.91       682

Confusion Matrix:
[[ 55   0   0   0   0]
 [  0 217   0   0   3]
 [ 14  33 253   0   0]
 [  1   0   0  12   0]
 [  0   8   0   0  86]]
------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Fold 2:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.87      1.00      0.93        47
              Increased       0.88      0.96      0.92       212
                Neutral       1.00      0.92      0.96       335
Significantly Decreased       1.00      0.94      0.97        17
Significantly Increased       0.90      0.91      0.90        78

               accuracy                           0.94       689
              macro avg       0.93      0.95      0.94       689
           weighted avg       0.94      0.94      0.94       689

Confusion Matrix:
[[ 47   0   0   0   0]
 [  0 204   0   0   8]
 [  6  22 307   0   0]
 [  1   0   0  16   0]
 [  0   7   0   0  71]]
------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Fold 3:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.86      0.96      0.91        75
              Increased       0.90      0.96      0.93       150
                Neutral       1.00      0.92      0.96       333
Significantly Decreased       0.89      1.00      0.94        25
Significantly Increased       0.92      0.99      0.95        68

               accuracy                           0.94       651
              macro avg       0.91      0.96      0.94       651
           weighted avg       0.95      0.94      0.94       651

Confusion Matrix:
[[ 72   0   0   3   0]
 [  0 144   0   0   6]
 [ 12  15 306   0   0]
 [  0   0   0  25   0]
 [  0   1   0   0  67]]
------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Fold 4:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.83      0.98      0.90        61
              Increased       0.91      0.97      0.94       158
                Neutral       1.00      0.92      0.96       315
Significantly Decreased       0.97      1.00      0.99        38
Significantly Increased       0.94      0.97      0.96        66

               accuracy                           0.95       638
              macro avg       0.93      0.97      0.95       638
           weighted avg       0.95      0.95      0.95       638

Confusion Matrix:
[[ 60   0   0   1   0]
 [  0 154   0   0   4]
 [ 12  13 290   0   0]
 [  0   0   0  38   0]
 [  0   2   0   0  64]]
------------------------------------------------------------
Results for Fold 5:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.87      1.00      0.93       128


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Prediction on the holdout set yields a macro f1 socre of 0.87.

In [42]:
# Predict on final validation set
y_validation_pred = lr.predict(X_validation)

# Evaluate final performance
print("Final Validation on Last 2 Days (50 Tickers)")
print("Classification Report:")
print(classification_report(y_validation, y_validation_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_validation, y_validation_pred))

Final Validation on Last 2 Days (50 Tickers)
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.67      1.00      0.80         6
              Increased       0.90      0.75      0.82        12
                Neutral       1.00      0.84      0.91        25
Significantly Decreased       1.00      1.00      1.00         3
Significantly Increased       0.73      1.00      0.84         8

               accuracy                           0.87        54
              macro avg       0.86      0.92      0.87        54
           weighted avg       0.90      0.87      0.87        54

Confusion Matrix:
[[ 6  0  0  0  0]
 [ 0  9  0  0  3]
 [ 3  1 21  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0  8]]


### MLP

The MLP model has a Mean Expanding Window CV Macro-F1 Score: 0.85.

In [39]:
cv_scores = []

for fold_idx, (train_start, train_end, test_start, test_end) in enumerate(folds):
    train_dates = unique_dates[train_start:train_end]  # Training range
    test_dates = unique_dates[test_start:test_end]  # Testing range

    # Create masks to filter data
    train_mask = df_modeling["Date"].isin(train_dates)
    test_mask = df_modeling["Date"].isin(test_dates)

    # Apply masks before extracting features
    X_train = df_modeling.loc[train_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_train = df_modeling.loc[train_mask]["Trend"].reset_index(drop = True)

    X_test = df_modeling.loc[test_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_test = df_modeling.loc[test_mask]["Trend"].reset_index(drop = True)

    # Convert embeddings
    if "Aggregated_Embeddings" in X_train.columns and not X_train["Aggregated_Embeddings"].isna().all():
        X_embeddings_train = np.stack(X_train["Aggregated_Embeddings"])
        X_train = pd.concat([X_train.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                             pd.DataFrame(X_embeddings_train)], axis = 1)

    if "Aggregated_Embeddings" in X_test.columns and not X_test["Aggregated_Embeddings"].isna().all():
        X_embeddings_test = np.stack(X_test["Aggregated_Embeddings"])
        X_test = pd.concat([X_test.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                            pd.DataFrame(X_embeddings_test)], axis = 1)

    # Ensure only numeric columns
    X_train = X_train.select_dtypes(include = [np.number])
    X_train.columns = X_train.columns.astype(str)

    X_test = X_test.select_dtypes(include = [np.number])
    X_test.columns = X_test.columns.astype(str)

    # Apply SMOTE to balance training data
    # smote = SMOTE(random_state = 42)
    # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    # Print class distribution after SMOTE
    # unique, counts = np.unique(y_train_resampled, return_counts = True)
    # print(f"Class distribution after SMOTE for Fold {fold_idx + 1}: {dict(zip(unique, counts))}")

    # Define and train MLP model
    mlp = MLPClassifier(hidden_layer_sizes = (100,), activation = "relu", solver = "adam", 
                        max_iter = 500, random_state = 42)
    mlp.fit(X_train, y_train)

    # Evaluate model
    y_pred = mlp.predict(X_test)
    f1 = f1_score(y_test, y_pred, average = "macro")
    cv_scores.append(f1)

    print(f"Results for Fold {fold_idx + 1}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("------------------------------------------------------------")

print("Expanding Window CV Macro-F1 Scores:", cv_scores)
print("Mean Expanding Window CV Macro-F1 Score:", np.mean(cv_scores))

Results for Fold 1:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.97      0.53      0.68        55
              Increased       0.72      0.35      0.47       220
                Neutral       0.66      1.00      0.79       300
Significantly Decreased       0.92      0.85      0.88        13
Significantly Increased       0.83      0.68      0.75        94

               accuracy                           0.71       682
              macro avg       0.82      0.68      0.72       682
           weighted avg       0.73      0.71      0.68       682

Confusion Matrix:
[[ 29   0  25   1   0]
 [  0  77 130   0  13]
 [  0   0 300   0   0]
 [  1   0   1  11   0]
 [  0  30   0   0  64]]
------------------------------------------------------------
Results for Fold 2:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.70      0.94      0.80        47


Prediction on the holdout set yields a macro f1 socre of 0.85.

In [40]:
# Predict on final validation set
y_validation_pred = mlp.predict(X_validation)

# Evaluate final performance
print("Final Validation on Last 2 Days (50 Tickers)")
print("Classification Report:")
print(classification_report(y_validation, y_validation_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_validation, y_validation_pred))

Final Validation on Last 2 Days (50 Tickers)
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.67      1.00      0.80         6
              Increased       0.91      0.83      0.87        12
                Neutral       1.00      0.92      0.96        25
Significantly Decreased       1.00      0.67      0.80         3
Significantly Increased       0.78      0.88      0.82         8

               accuracy                           0.89        54
              macro avg       0.87      0.86      0.85        54
           weighted avg       0.91      0.89      0.89        54

Confusion Matrix:
[[ 6  0  0  0  0]
 [ 0 10  0  0  2]
 [ 2  0 23  0  0]
 [ 1  0  0  2  0]
 [ 0  1  0  0  7]]


### Ramdom Forest

The random forest model has a Mean Expanding Window CV Macro-F1 Score: 0.85.

In [45]:
cv_scores = []

for fold_idx, (train_start, train_end, test_start, test_end) in enumerate(folds):
    train_dates = unique_dates[train_start:train_end]  # Training range
    test_dates = unique_dates[test_start:test_end]  # Testing range

    # Create masks to filter data
    train_mask = df_modeling["Date"].isin(train_dates)
    test_mask = df_modeling["Date"].isin(test_dates)

    # Apply masks before extracting features
    X_train = df_modeling.loc[train_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_train = df_modeling.loc[train_mask]["Trend"].reset_index(drop = True)

    X_test = df_modeling.loc[test_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_test = df_modeling.loc[test_mask]["Trend"].reset_index(drop = True)

    # Convert embeddings if applicable
    if "Aggregated_Embeddings" in X_train.columns and not X_train["Aggregated_Embeddings"].isna().all():
        X_embeddings_train = np.stack(X_train["Aggregated_Embeddings"])
        X_train = pd.concat([X_train.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                             pd.DataFrame(X_embeddings_train)], axis = 1)

    if "Aggregated_Embeddings" in X_test.columns and not X_test["Aggregated_Embeddings"].isna().all():
        X_embeddings_test = np.stack(X_test["Aggregated_Embeddings"])
        X_test = pd.concat([X_test.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                            pd.DataFrame(X_embeddings_test)], axis = 1)

    # Ensure only numeric columns
    X_train = X_train.select_dtypes(include = [np.number])
    X_train.columns = X_train.columns.astype(str)

    X_test = X_test.select_dtypes(include = [np.number])
    X_test.columns = X_test.columns.astype(str)

    # Apply SMOTE to balance training data
    # smote = SMOTE(random_state = 42)
    # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    # Print class distribution after SMOTE
    # unique, counts = np.unique(y_train_resampled, return_counts = True)
    # print(f"Class distribution after SMOTE for Fold {fold_idx + 1}: {dict(zip(unique, counts))}")

    # Define Random Forest with Hyperparameter Tuning
    rf = RandomForestClassifier(random_state = 42)
    param_dist = {"n_estimators": [100, 200, 300, 500],           
                  "max_depth": [10, 20, 30, None],      
                  "min_samples_split": [2, 5, 10],            
                  "min_samples_leaf": [1, 2, 4],             
                  "max_features": ["sqrt", "log2"],        # Number of features considered for splits
                  "class_weight": ["balanced", None]}    

    random_search = RandomizedSearchCV(estimator = rf, param_distributions = param_dist, n_iter = 50,
                                       scoring = "f1_macro", cv = 5, verbose = 2, random_state = 42,
                                       n_jobs = -1)

    random_search.fit(X_train, y_train)
    best_rf = random_search.best_estimator_

    # Evaluate model
    y_pred = best_rf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average = "macro")
    cv_scores.append(f1)

    print(f"Results for Fold {fold_idx + 1}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("------------------------------------------------------------")

print("Expanding Window CV Macro-F1 Scores:", cv_scores)
print("Mean Expanding Window CV Macro-F1 Score:", np.mean(cv_scores))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Results for Fold 1:
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.98      0.89      0.93        55
              Increased       0.75      0.80      0.77       220
                Neutral       0.85      1.00      0.92       300
Significantly Decreased       0.93      1.00      0.96        13
Significantly Increased       1.00      0.34      0.51        94

               accuracy                           0.83       682
              macro avg       0.90      0.80      0.82       682
           weighted avg       0.85      0.83      0.82       682

Confusion Matrix:
[[ 49   0   6   0   0]
 [  0 175  44   1   0]
 [  1   0 299   0   0]
 [  0   0   0  13   0]
 [  0  58   4   0  32]]
------------------------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Results for Fold 2:
Classification Report:
       

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   2.7s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   1.9s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   1.9s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   3.1s
[CV] END class_weight=balanced, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.7s
[CV] END class_weight=None, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   4.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   2.6s
[CV] END

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   2.8s
[CV] END class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   3.3s
[CV] END class_weight=balanced, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.6s
[CV] END class_weight=None, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   4.6s
[CV] END class_weight=balanced, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   4.4s
[CV]

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   2.5s
[CV] END class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   1.8s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   1.7s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   3.2s
[CV] END class_weight=None, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.8s
[CV] END class_weight=None, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.8s
[CV] END cla

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   2.6s
[CV] END class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END class_weight=None, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.9s
[CV] END cl

[CV] END class_weight=balanced, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.8s
[CV] END class_weight=balanced, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.0s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   1.9s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.6s
[CV] END 

Prediction on the holdout set yields a macro f1 socre of 0.91.

In [44]:
# Predict on final validation set
y_validation_pred = best_rf.predict(X_validation)

# Evaluate final performance
print("Final Validation on Last 2 Days (50 Tickers)")
print("Classification Report:")
print(classification_report(y_validation, y_validation_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_validation, y_validation_pred))

Final Validation on Last 2 Days (50 Tickers)
Classification Report:
                         precision    recall  f1-score   support

              Decreased       0.83      0.83      0.83         6
              Increased       1.00      0.92      0.96        12
                Neutral       0.93      1.00      0.96        25
Significantly Decreased       1.00      0.67      0.80         3
Significantly Increased       1.00      1.00      1.00         8

               accuracy                           0.94        54
              macro avg       0.95      0.88      0.91        54
           weighted avg       0.95      0.94      0.94        54

Confusion Matrix:
[[ 5  0  1  0  0]
 [ 0 11  1  0  0]
 [ 0  0 25  0  0]
 [ 1  0  0  2  0]
 [ 0  0  0  0  8]]
[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   3.3s
[CV] END class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_sampl

[CV] END class_weight=balanced, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   2.1s
[CV] END class_weight=balanced, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   3.3s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.9s
[CV] END class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   2.2s
[CV] END 

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   3.0s
[CV] END class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   3.0s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   2.0s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   4.1s
[CV] END class_weight=balanced, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   3.2s
[CV] END class_weight=None, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   5.1s
[CV] END 

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   3.3s
[CV] END class_weight=None, max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=   3.0s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.4s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   4.0s
[CV] END class_weight=balanced, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   3.0s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.4s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.4s
[CV

[CV] END class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   3.4s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=   3.1s
[CV] END class_weight=None, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   2.3s
[CV] END class_weight=balanced, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   4.2s
[CV] END class_weight=None, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   1.1s
[CV] END class_weight=None, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   1.0s
[CV] END class_weight=None, max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.4s
[CV] END clas

### XGBoost

The dataset might be too easy for XGBoost. Even after applying techniques to mitigate overfitting, the XGBoost model still has a Mean Expanding Window CV Macro-F1 Score: 0.95.

In [80]:
!pip uninstall -y scikit-learn
!pip install scikit-learn == 1.5.2

Found existing installation: scikit-learn 1.5.2
Uninstalling scikit-learn-1.5.2:
  Successfully uninstalled scikit-learn-1.5.2
Collecting scikit-learn==1.5.2
  Using cached scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl (11.0 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.5.2


In [43]:
cv_scores = []

for fold_idx, (train_start, train_end, test_start, test_end) in enumerate(folds):
    train_dates = unique_dates[train_start:train_end]  # Training range
    test_dates = unique_dates[test_start:test_end]  # Testing range

    # Create masks to filter data
    train_mask = df_modeling["Date"].isin(train_dates)
    test_mask = df_modeling["Date"].isin(test_dates)

    # Apply masks before extracting features
    X_train = df_modeling.loc[train_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_train = df_modeling.loc[train_mask]["Trend"].reset_index(drop = True)

    X_test = df_modeling.loc[test_mask].drop(["Trend", "Date", "Ticker"], axis = 1).reset_index(drop = True)
    y_test = df_modeling.loc[test_mask]["Trend"].reset_index(drop = True)

    # Convert embeddings
    if "Aggregated_Embeddings" in X_train.columns and not X_train["Aggregated_Embeddings"].isna().all():
        X_embeddings_train = np.stack(X_train["Aggregated_Embeddings"])
        X_train = pd.concat([X_train.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                             pd.DataFrame(X_embeddings_train)], axis = 1)

    if "Aggregated_Embeddings" in X_test.columns and not X_test["Aggregated_Embeddings"].isna().all():
        X_embeddings_test = np.stack(X_test["Aggregated_Embeddings"])
        X_test = pd.concat([X_test.drop(["Aggregated_Embeddings"], axis = 1).reset_index(drop = True),
                            pd.DataFrame(X_embeddings_test)], axis = 1)

    # Ensure only numeric columns
    X_train = X_train.select_dtypes(include = [np.number])
    X_train.columns = X_train.columns.astype(str)

    X_test = X_test.select_dtypes(include = [np.number])
    X_test.columns = X_test.columns.astype(str)

    # Apply SMOTE to balance training data
    # smote = SMOTE(random_state = 42)
    # X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    # Print class distribution after SMOTE
    # unique, counts = np.unique(y_train_resampled, return_counts = True)
    # print(f"Class distribution after SMOTE for Fold {fold_idx + 1}: {dict(zip(unique, counts))}")

    # Encode y_train_resampled and y_test for evaluation consistency
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    # Reduce Overfitting in XGBoost
    xgb = XGBClassifier(random_state = 42, n_estimators = 50, max_depth = 3, learning_rate = 0.05,
                        subsample = 0.5, colsample_bytree = 0.6, reg_alpha = 10.0, reg_lambda = 50.0,
                        gamma = 5.0, eval_metric = "mlogloss", use_label_encoder = False)
    xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds = 10, verbose = False)

    # Evaluate model
    y_pred = xgb.predict(X_test)
    f1 = f1_score(y_test, y_pred, average = "macro")
    cv_scores.append(f1)

    print(f"Results for Fold {fold_idx + 1}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("------------------------------------------------------------")

print("Expanding Window CV Macro-F1 Scores:", cv_scores)
print("Mean Expanding Window CV Macro-F1 Score:", np.mean(cv_scores))



Results for Fold 1:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       1.00      1.00      1.00       220
           2       0.76      1.00      0.86       300
           3       1.00      1.00      1.00        13
           4       0.00      0.00      0.00        94

    accuracy                           0.86       682
   macro avg       0.75      0.80      0.77       682
weighted avg       0.76      0.86      0.80       682

Confusion Matrix:
[[ 55   0   0   0   0]
 [  0 220   0   0   0]
 [  0   1 299   0   0]
 [  0   0   0  13   0]
 [  0   0  94   0   0]]
------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for Fold 2:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        47
           1       1.00      1.00      1.00       212
           2       1.00      1.00      1.00       335
           3       1.00      1.00      1.00        17
           4       0.99      1.00      0.99        78

    accuracy                           1.00       689
   macro avg       1.00      1.00      1.00       689
weighted avg       1.00      1.00      1.00       689

Confusion Matrix:
[[ 47   0   0   0   0]
 [  0 211   0   0   1]
 [  0   0 335   0   0]
 [  0   0   0  17   0]
 [  0   0   0   0  78]]
------------------------------------------------------------




Results for Fold 3:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        75
           1       1.00      1.00      1.00       150
           2       1.00      1.00      1.00       333
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        68

    accuracy                           1.00       651
   macro avg       1.00      1.00      1.00       651
weighted avg       1.00      1.00      1.00       651

Confusion Matrix:
[[ 75   0   0   0   0]
 [  0 150   0   0   0]
 [  0   0 333   0   0]
 [  0   0   0  25   0]
 [  0   0   0   0  68]]
------------------------------------------------------------




Results for Fold 4:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      0.99      1.00       158
           2       1.00      1.00      1.00       315
           3       1.00      1.00      1.00        38
           4       1.00      1.00      1.00        66

    accuracy                           1.00       638
   macro avg       1.00      1.00      1.00       638
weighted avg       1.00      1.00      1.00       638

Confusion Matrix:
[[ 61   0   0   0   0]
 [  0 157   1   0   0]
 [  0   0 315   0   0]
 [  0   0   0  38   0]
 [  0   0   0   0  66]]
------------------------------------------------------------




Results for Fold 5:
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       128
           1       1.00      1.00      1.00       111
           2       1.00      1.00      1.00       280
           3       1.00      0.98      0.99        61
           4       1.00      1.00      1.00        69

    accuracy                           1.00       649
   macro avg       1.00      1.00      1.00       649
weighted avg       1.00      1.00      1.00       649

Confusion Matrix:
[[128   0   0   0   0]
 [  0 111   0   0   0]
 [  0   0 280   0   0]
 [  1   0   0  60   0]
 [  0   0   0   0  69]]
------------------------------------------------------------
Expanding Window CV Macro-F1 Scores: [0.7721294578437436, 0.9982533014109108, 1.0, 0.9990481221543028, 0.9975688973212851]
Mean Expanding Window CV Macro-F1 Score: 0.9533999557460484


In [44]:
# Predict on final validation set
y_validation_pred = xgb.predict(X_validation)
y_validation_pred_labels = le.inverse_transform(y_validation_pred)

# Evaluate final performance
print("Final Validation on Last 2 Days (50 Tickers)")
print("Final Validation Classification Report:")
print(classification_report(y_validation, y_validation_pred_labels))
print("Final Validation Confusion Matrix:")
print(confusion_matrix(y_validation, y_validation_pred_labels))

Final Validation on Last 2 Days (50 Tickers)
Final Validation Classification Report:
                         precision    recall  f1-score   support

              Decreased       1.00      1.00      1.00         6
              Increased       1.00      1.00      1.00        12
                Neutral       1.00      1.00      1.00        25
Significantly Decreased       1.00      1.00      1.00         3
Significantly Increased       1.00      1.00      1.00         8

               accuracy                           1.00        54
              macro avg       1.00      1.00      1.00        54
           weighted avg       1.00      1.00      1.00        54

Final Validation Confusion Matrix:
[[ 6  0  0  0  0]
 [ 0 12  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0  8]]


# End