## import your modules needed

In [1]:
import requests as req
import time
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline, logging
import numpy as np
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
load_dotenv()
SEC_API_KEY = os.getenv('SEC_API_KEY')
OPENAI_KEY = os.getenv('OPENAI_API_KEY')
EMAIL=os.getenv("EMAIL")


In [3]:
#company_tickers = ['UPST', 'KO', 'TSLA', 'INTC']
company_tickers = ['UPST']

## use edgar api to search company using their ticker to fetch FR document 

In [4]:
def get_comp_sec(ticker):
    """
    Get the company SECURITIES AND EXCHANGE COMMISSION reports for a given ticker symbol.
    
    Args:
        ticker (str): The ticker symbol of the company.
    
    Returns:
        object: filings of the company.
    """
    base_url = "https://api.sec-api.io"
    payload = {
                "query": f'formType:\"10-K\" AND ticker:{ticker} AND filedAt:[2020-01-01 TO 2025-01-31]',
                "from": "0",
                "size": "50",
                "sort": [{ "filedAt": { "order": "desc" }}]
            }
    response = req.post(base_url, json=payload, headers={'Authorization': SEC_API_KEY})
    
    if response.status_code == 200:
        comp_tenk_filing = response.json()
        return comp_tenk_filing
    else:
        raise Exception(f'Error fetching Company 10 k filings. Status code: {response.status_code}')

In [5]:
def safe_get_filings(tkr):
    try:
        time.sleep(1)
        return get_comp_sec(tkr)
    except Exception as e:
        print(f"Error fetching {tkr}: {e}")
        return None

In [6]:
company_filings_info = [safe_get_filings(tkr) for tkr in company_tickers]
print(company_filings_info)

[{'total': {'value': 4, 'relation': 'eq'}, 'query': {'from': 0, 'size': 50}, 'filings': [{'ticker': 'UPST', 'formType': '10-K', 'accessionNo': '0001647639-24-000009', 'cik': '1647639', 'companyNameLong': 'Upstart Holdings, Inc. (Filer)', 'companyName': 'Upstart Holdings, Inc.', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/1647639/000164763924000009/upst-20231231.htm', 'description': 'Form 10-K - Annual report [Section 13 and 15(d), not S-K Item 405]', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/1647639/000164763924000009/0001647639-24-000009.txt', 'filedAt': '2024-02-15T16:16:15-05:00', 'documentFormatFiles': [{'sequence': '1', 'size': '3097961', 'documentUrl': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/1647639/000164763924000009/upst-20231231.htm', 'description': '10-K', 'type': '10-K'}, {'sequence': '2', 'size': '4470', 'documentUrl': 'https://www.sec.gov/Archives/edgar/data/1647639/000164763924000009/exhibit211subsidiariesofth.htm', 'description': 

In [None]:
# Extract 

In [None]:
#intialize an empty list
data=[]
for comp in company_filings_info:
    if comp is not None:
        for filing in comp['filings']:
            data.append({
                "company": filing["companyName"],
                "filing_date": filing["filedAt"],
                "link": filing["linkToFilingDetails"],
                "linkToTxt": filing["linkToTxt"]
            })
   
    else:
        print("No filings found.")


Total filings: 4


https://www.sec.gov/Archives/edgar/data/1647639/000164763924000009/0001647639-24-000009.txt


### Dataframe of financial report for companies

In [9]:
financial_report_df = pd.DataFrame(data)
financial_report_df.head()

Unnamed: 0,company,filing_date,link,linkToTxt
0,"Upstart Holdings, Inc.",2024-02-15T16:16:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...,https://www.sec.gov/Archives/edgar/data/164763...
1,"Upstart Holdings, Inc.",2023-02-16T15:49:35-05:00,https://www.sec.gov/Archives/edgar/data/164763...,https://www.sec.gov/Archives/edgar/data/164763...
2,"Upstart Holdings, Inc.",2022-02-17T19:36:15-05:00,https://www.sec.gov/Archives/edgar/data/164763...,https://www.sec.gov/Archives/edgar/data/164763...
3,"Upstart Holdings, Inc.",2021-03-18T16:42:45-04:00,https://www.sec.gov/Archives/edgar/data/164763...,https://www.sec.gov/Archives/edgar/data/164763...


## instantiate  and create a open ai object using the provided api key and prompt engineer to summarize a given financial report

In [None]:

llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0.9)
prompt_template = PromptTemplate(
    template="Give me an example of a meal that could be made using the following ingredients: {ingredients}",
    input_variables=["ingredients"]
)

meal_chain = LLMChain(
    llm=llm, 
    prompt=prompt_template, 
    verbose=True
)

### Get sentiment analyses of financial report using ProsusAI/finbert

In [None]:
def beautiful_soup(url):
    """
    Get the soup object from the url.
    
    Args:
        url (str): The url of the filing.
    
    Returns:
        object: soup object.
    """
    headers = {
    'User-Agent': EMAIL
    }
    response = req.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        return soup.get_text()
    else:
        raise Exception(f'Error fetching filing. Status code: {response.status_code}')

In [None]:
#use a request to get the data
#url="https://www.sec.gov/Archives/edgar/data/1647639/000164763924000009/upst-20231231.htm"

# Create an empty column for filing text
financial_report_df["filing_text"] = None
#make the request
for idx, row in financial_report_df.iterrows():
    url = row["link"]
    filing_text = beautiful_soup(url)
    # Assign the filing text to the appropriate row
    financial_report_df.at[idx, "filing_text"] = filing_text



In [None]:
financial_report_df

In [None]:
#sentiment analysis


# Silence transformers logging (optional)
logging.set_verbosity_error()

# Load FinBERT sentiment analysis pipeline
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert"
)

#define function to get overall sentiment
def overall_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return {
            "overall_sentiment": "error",
            "average_score": None
        }
    
    # Break the text into chunks (basic method)
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]

    #make empty list for sentiment score
    sentiment_score = []

    # Analyze sentiment
    for chunk in chunks:
        result = sentiment_analyzer(chunk[:512])
        sentiment_score.append(result[0]["score"])
    
    # Calculate the average score
    average_score = np.mean(sentiment_score)
    
    # Determine the overall sentiment based on the average score
    if average_score >= 0.75:
        overall_sentiment = 'positive'
    elif average_score <= 0.25:
        overall_sentiment = 'negative'
    else:
        overall_sentiment = 'neutral'
    
    return {
           "overall_sentiment": overall_sentiment,
           "average_score": average_score
    }

#add to data_df dataframe
sentiment_results = financial_report_df["filing_text"].apply(overall_sentiment)
financial_report_df["sentiment"] = sentiment_results.apply(lambda x: x["overall_sentiment"])
financial_report_df["sentiment_score"] = sentiment_results.apply(lambda x: x["average_score"])
    



In [None]:
financial_report_df

## create a LLM that model that iterate through the list of pdf files to analyze, syntesize, and summarize the pdf pages content