<a href="https://colab.research.google.com/github/inactdev/CS668/blob/master/Deep_Value_Funnel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download and setup the libraries

In [2]:
!pip install sec-edgar-downloader beautifulsoup4

import transformers
import pandas
import numpy
from bs4 import BeautifulSoup
import torch

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Download Filings to Google Drive

In [211]:
from sec_edgar_downloader import Downloader

filepath = "/content/drive/My Drive/Pace/SEC Filings"
dl = Downloader("Ari Perez", "ap79624n@pace.edu", filepath)  # Required for SEC compliance

poor_performing_equity_ids = ["AIG", "C", "WBA", "PARA", "APA", "CCL", "VTRS", "KEY"]
well_performing_equity_ids = ["NVDA", "AMD", "TPL", "FICO", "BLDR", "AXON", "AVGO", "ANET"]
avg_performing_equity_ids = ["JNJ", "PG", "KO", "PEP", "WMT", "HD", "LOW", "MCD", "UNH", "JPM", "BAC", "V", "MA", "XOM", "CVX", "PFE", "MRK", "INTC", "CSCO", "VZ"]

for equity_id in poor_performing_equity_ids:
  dl.get("10-K", equity_id, limit=10, before="2022-01-01") # Getting before 2022 so that we have 3 full years to check for doubles

for equity_id in well_performing_equity_ids:
  dl.get("10-K", equity_id, limit=10, before="2022-01-01") # Getting before 2022 so that we have 3 full years to check for doubles

for equity_id in avg_performing_equity_ids:
  dl.get("10-K", equity_id, limit=10, before="2022-01-01") # Getting before 2022 so that we have 3 full years to check for doubles

Pull filenames for SEC filings

In [212]:
import os

rootdir = '/content/drive/My Drive/Pace/SEC Filings/sec-edgar-filings'
filings = []

for subdir, dirs, files in os.walk(rootdir):
  for file in files:
    if "full-submission" in file:
      filings.append(os.path.join(subdir, file))

Preprocess Filings Methods

In [213]:
import re

def find_item_title(text):
    match = re.search(r'^item\b\s*[a-zA-Z0-9]+', text, re.IGNORECASE)
    if match:
        return match.group(0)
    return None

In [214]:
import re

def return_year_end(text):
    # Define the regex
    pattern = r"(?i)year ended\s+([A-Za-z]+\s+\d+,\s+\d{4})"

    # Match it
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return None


In [215]:
from bs4 import BeautifulSoup
import pdb

def extract_sections(file_name):
    """
    Downloads a 10-K SEC filing from the given URL, extracts the text, and splits it into sections
    based on the index (table of contents).

    Args:
        url (str): URL of the 10-K filing (e.g., Tesla or Microsoft filing URL).

    Returns:
        list: List of dictionaries, each with 'title' and 'content' keys, or None if processing fails.
    """


    # Step 1: Parse the HTML
    with open(file_name, "r") as f:
      soup = BeautifulSoup(f, "html.parser")

    year_end = return_year_end(soup.get_text())

    # Step 2: Find the table of contents rows
    toc_rows = soup.find_all(lambda tag: tag.name in ['tr'] and ('item' in tag.text.lower() or 'signatures' in tag.text.lower()))

    if not toc_rows:
        print("Table of contents rows not found in the filing.")
        return None

    # Step 3: Extract section links from the table of contents rows
    sections = []

    for row in toc_rows:
      row_text = row.get_text().strip()
      title = find_item_title(row_text)

      if not title:
        title = 'Signatures'

      link = row.find('a', href=lambda x: x and x.startswith('#'))

      if link is None:
        continue

      anchor = link['href'][1:]  # Remove '#' to get the anchor name
      sections.append({'title': title.upper(), 'anchor': anchor})

    if not sections:
        print("No sections found in the table of contents.")
        return None

    # Step 4: Find all anchor tags in the document
    anchor_tags = {sec['anchor']: soup.find(id=sec['anchor']) for sec in sections if soup.find(id=sec['anchor'])}

    # Step 5: Extract content for each section
    for i in range(len(sections)):
        anchor = sections[i]['anchor']

        if anchor not in anchor_tags:
            print(f"Warning: Anchor '{anchor}' not found in document for section '{sections[i]['title']}'.")
            sections[i]['content'] = ""
            continue

        current_anchor = anchor_tags[anchor]

        # Get the next anchor tag, or None if it's the last section
        next_anchor = None

        for j in range(i + 1, len(sections)):
            if sections[j]['anchor'] in anchor_tags:
                next_anchor = anchor_tags[sections[j]['anchor']]
                break

        content = []
        # Iterate through all elements following the current anchor
        for element in current_anchor.next_elements:
            # Stop if we reach the next anchor
            if next_anchor and element is next_anchor:
                break
            # Collect text from string nodes
            if isinstance(element, str):
                text = element.strip()
                if text:  # Only add non-empty strings
                    content.append(text)

        # Store the joined text in the section
        sections[i]['content'] = ' '.join(content)

    year_end = return_year_end(soup.get_text())

    return sections, year_end

In [144]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

def sentiment_analysis(symbol, year, sections):
  results=[]

  for section in sections:
    item = section['title']
    content = section['content']

    # # Only want the items in the filing
    if 'item' in item.lower():
      item_number = item.split(" ")[-1]
      inputs = tokenizer(content, return_tensors="pt", truncation=True, max_length=512)

      with torch.no_grad():
          outputs = model(**inputs)
          logits = outputs.logits
          sentiments = torch.softmax(logits, dim=1)
          sentiments_list = sentiments.tolist()[0]
          sentiment_labels = ["negative", "neutral", "positive"]
          predicted_sentiment = sentiment_labels[torch.argmax(sentiments, dim=1).item()]
          result = {"Symbol": symbol, "Year": year, "Section": item, "Positive": sentiments_list[0], "Neutral": sentiments_list[1], "Negative": sentiments_list[2]}
          # print(result)
          results.append(result)
    else:
      print(f"Not an item")

  return results

In [173]:
# Function to get the last closing price of a year
from datetime import datetime, timedelta
import yfinance as yf

date_filing_format = "%B %d, %Y"
date_finance_format = "%Y-%m-%d"
days_to_subtract = 10
days_to_add = 366

def get_year_end_price(symbol, year_end, years_from=0):
    start_datetime_object = datetime.strptime(year_end, date_filing_format)

    # # Fetch data
    if years_from == 0:
      end_datetime_object = start_datetime_object
      start_datetime_object = start_datetime_object - timedelta(days=days_to_subtract)
    else:
      end_datetime_object = start_datetime_object + timedelta(days=days_to_add * years_from)

    start_date = start_datetime_object.strftime(date_finance_format)
    end_date = end_datetime_object.strftime(date_finance_format)

    stock_data = yf.download(symbol, start=start_date, end=end_date, progress=False)
    if not stock_data.empty:
        # Return the last available closing price
        return stock_data['Close'].iloc[-1]
    return None

In [207]:
def doubled_check(start_price, end_price):
  if start_price is not None and end_price is not None:
    if (start_price * 2) <= end_price:
      return "Yes"
    else:
      return "No"
  else:
    return "N/A"

In [209]:
def get_stock_prices(symbol, year, year_end):
  current_price = get_year_end_price(symbol, year_end)

  year_1_price = get_year_end_price(symbol, year_end, 1)
  year_1_double = doubled_check(current_price.to_dict()[symbol], year_1_price.to_dict()[symbol])

  year_2_price = get_year_end_price(symbol, year_end, 2)
  year_2_double = doubled_check(current_price.to_dict()[symbol], year_2_price.to_dict()[symbol])

  year_3_price = get_year_end_price(symbol, year_end, 3)
  year_3_double = doubled_check(current_price.to_dict()[symbol], year_3_price.to_dict()[symbol])

  return {"Symbol": symbol, "Year": year, "doubled": year_1_double, "doubled_2": year_2_double, "doubled_3": year_3_double }

In [1]:
results = []
stock_prices = []

for file in filings:
  print(f"\nProcessing {file}")
  sections, year_end = extract_sections(file)

  year = file.split("/")[9].split("-")[1]
  symbol = file.split("/")[7]

  if sections:
      print(f"Found {len(sections)} sections for {symbol} - {year}")
      for sec in sections:
          print(f"\nSection: {sec['title']}")
          print(f"Content (last 100 chars): {sec['content'][-10:]}...")
  else:
      print("Failed to extract sections.")


  analysis = sentiment_analysis(symbol, year, sections)
  results += analysis
  stock_price = get_stock_prices(symbol, year, year_end)
  stock_prices.append(stock_price)

NameError: name 'filings' is not defined

In [141]:
df = pandas.DataFrame(results)
df.to_csv('sentiment_analysis.csv', index=False)
display(df)

Unnamed: 0,Symbol,Year,Section,Positive,Neutral,Negative
0,TSLA,25,ITEM 1,0.178942,0.008778,0.812280
1,TSLA,25,ITEM 1A,0.072955,0.266532,0.660512
2,TSLA,25,ITEM 1B,0.023510,0.344801,0.631689
3,TSLA,25,ITEM 1C,0.037303,0.029045,0.933652
4,TSLA,25,ITEM 2,0.023396,0.019662,0.956941
...,...,...,...,...,...,...
215,MSFT,20,ITEM 12,0.019879,0.029820,0.950301
216,MSFT,20,ITEM 13,0.022086,0.023367,0.954547
217,MSFT,20,ITEM 14,0.019256,0.039924,0.940820
218,MSFT,20,ITEM 15,0.017690,0.050330,0.931980


In [208]:
df = pandas.DataFrame(stock_prices)
df.to_csv('stock_prices.csv', index=False)
display(df)

Unnamed: 0,Symbol,Year,1 Year Double,2 Year Double,3 Year Double
0,TSLA,25,No,No,No
1,TSLA,24,No,No,No
2,TSLA,23,Yes,Yes,Yes
3,TSLA,22,No,No,No
4,TSLA,21,No,No,No
5,MSFT,24,No,No,No
6,MSFT,23,No,No,No
7,MSFT,22,No,No,No
8,MSFT,21,No,No,No
9,MSFT,20,No,No,No
