## import your modules needed

In [85]:
import requests as req
import time
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import pandas as pd

In [89]:
load_dotenv()
sec_api = os.getenv("SECIO_KEY")
google_api=os.getenv("GOOGLE_KEY")
email=os.getenv("EMAIL")


In [79]:
#company_tickers = ['UPST', 'KO', 'TSLA', 'INTC']
company_tickers = ['UPST']

## use edgar api to search company using their ticker to fetch FR document 

In [80]:
def get_comp_sec(ticker):
    """
    Get the company SECURITIES AND EXCHANGE COMMISSION reports for a given ticker symbol.
    
    Args:
        ticker (str): The ticker symbol of the company.
    
    Returns:
        object: filings of the company.
    """
    base_url = "https://api.sec-api.io"
    payload = {
                "query": f'formType:\"10-K\" AND ticker:{ticker} AND filedAt:[2020-01-01 TO 2025-01-31]',
                "from": "0",
                "size": "50",
                "sort": [{ "filedAt": { "order": "desc" }}]
            }
    response = req.post(base_url, json=payload, headers={'Authorization': sec_api})
    
    if response.status_code == 200:
        comp_tenk_filing = response.json()
        return comp_tenk_filing
    else:
        raise Exception(f'Error fetching Company 10 k filings. Status code: {response.status_code}')

In [81]:
def safe_get_filings(tkr):
    try:
        time.sleep(1)
        return get_comp_sec(tkr)
    except Exception as e:
        print(f"Error fetching {tkr}: {e}")
        return None

In [82]:
company_filings_info = [safe_get_filings(tkr) for tkr in company_tickers]

In [88]:
#intialize an empty list
data=[]

for comp in company_filings_info:
    if comp is not None:
        for filing in comp['filings']:
            data.append({
                "company": filing["companyName"],
                "filing_date": filing["filedAt"],
                "link": filing["linkToFilingDetails"]
            })
   
    else:
        print("No filings found.")

data_df=pd.DataFrame(data)

data_df.head()

Unnamed: 0,company,filing_date,link
0,"Upstart Holdings, Inc.",2024-02-15T16:16:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...
1,"Upstart Holdings, Inc.",2023-02-16T15:49:35-05:00,https://www.sec.gov/Archives/edgar/data/164763...
2,"Upstart Holdings, Inc.",2022-02-17T19:36:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...
3,"Upstart Holdings, Inc.",2021-03-18T16:42:45-04:00,https://www.sec.gov/Archives/edgar/data/164763...


## instantiate  and create a gemini ai object using the provided api key

In [90]:
#use a request to get the data
#url="https://www.sec.gov/Archives/edgar/data/1647639/000164763924000009/upst-20231231.htm"

# Create an empty column for filing text
data_df["filing_text"] = None

#define the header for the web scrape
headers={'User-Agent': email}

#make the request
for idx, row in data_df.iterrows():
    url = row["link"]
    response = req.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    filing_text = soup.get_text()

    # Assign the filing text to the appropriate row
    data_df.at[idx, "filing_text"] = filing_text



In [93]:
data_df

Unnamed: 0,company,filing_date,link,filing_text
0,"Upstart Holdings, Inc.",2024-02-15T16:16:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...,\n\nupst-20231231false2023FY0001647639P1Mhttp:...
1,"Upstart Holdings, Inc.",2023-02-16T15:49:35-05:00,https://www.sec.gov/Archives/edgar/data/164763...,\nupst-20221231false2022FY0001647639http://fas...
2,"Upstart Holdings, Inc.",2022-02-17T19:36:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...,\nupst-20211231false2021FY000164763912.50.0035...
3,"Upstart Holdings, Inc.",2021-03-18T16:42:45-04:00,https://www.sec.gov/Archives/edgar/data/164763...,\n10-K\n1\nupstrtholdingsinc202010-k.htm\n10-K...


In [None]:
#this may be interesting at some point but not being used.

#import google.generativeai as genai


#genai.configure(api_key=google_api)
#model=genai.GenerativeModel("models/gemini-2.0-flash")

#response = model.generate_content("summarize the content at this url including revenue trends and risk assessment:\n\n" + filing_text)

#print(response.text)

In [99]:
#sentiment analysis
from transformers import pipeline, logging
import numpy as np

# Silence transformers logging (optional)
logging.set_verbosity_error()

# Load FinBERT sentiment analysis pipeline
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert"
)

#define function to get overall sentiment
def overall_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return {
            "overall_sentiment": "error",
            "average_score": None
        }
    
    # Break the text into chunks (basic method)
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]

    #make empty list for sentiment score
    sentiment_score = []

    # Analyze sentiment
    for chunk in chunks:
        result = sentiment_analyzer(chunk[:512])
        sentiment_score.append(result[0]["score"])
    
    # Calculate the average score
    average_score = np.mean(sentiment_score)
    
    # Determine the overall sentiment based on the average score
    if average_score >= 0.75:
        overall_sentiment = 'positive'
    elif average_score <= 0.25:
        overall_sentiment = 'negative'
    else:
        overall_sentiment = 'neutral'
    
    return {
           "overall_sentiment": overall_sentiment,
           "average_score": average_score
    }

#add to data_df dataframe
sentiment_results = data_df["filing_text"].apply(overall_sentiment)
data_df["sentiment"] = sentiment_results.apply(lambda x: x["overall_sentiment"])
data_df["sentiment_score"] = sentiment_results.apply(lambda x: x["average_score"])
    



In [100]:
data_df

Unnamed: 0,company,filing_date,link,filing_text,sentiment,sentiment_score
0,"Upstart Holdings, Inc.",2024-02-15T16:16:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...,\n\nupst-20231231false2023FY0001647639P1Mhttp:...,positive,0.840972
1,"Upstart Holdings, Inc.",2023-02-16T15:49:35-05:00,https://www.sec.gov/Archives/edgar/data/164763...,\nupst-20221231false2022FY0001647639http://fas...,positive,0.837433
2,"Upstart Holdings, Inc.",2022-02-17T19:36:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...,\nupst-20211231false2021FY000164763912.50.0035...,positive,0.835659
3,"Upstart Holdings, Inc.",2021-03-18T16:42:45-04:00,https://www.sec.gov/Archives/edgar/data/164763...,\n10-K\n1\nupstrtholdingsinc202010-k.htm\n10-K...,positive,0.842054


## create a LLM that model that iterate through the list of pdf files to analyze, syntesize, and summarize the pdf pages content