## import your modules needed

In [1]:
import requests as req
import time
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline, logging
import numpy as np
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
load_dotenv()
SEC_API_KEY = os.getenv('SEC_API_KEY')
OPENAI_KEY = os.getenv('OPENAI_API_KEY')
FMP_KEY = os.getenv('FMP_API_KEY')
EMAIL=os.getenv("EMAIL")

sleep_time = 0.5


In [3]:
#company_tickers = ['UPST', 'KO', 'TSLA', 'INTC']
company_tickers = ['INTC']

#generate a code that get company ticker by provided name
def get_ticker_by_company_name(company_name):
    url = "https://financialmodelingprep.com/api/v3/search"
    params = {
        "query": company_name,
        "limit": 1,
        "exchange": "NASDAQ",  # You can use "NYSE" or leave it blank for all
        "apikey": FMP_KEY
    }

    try:
        response = req.get(url, params=params)
        response.raise_for_status()
        results = response.json()
        if results:
            return results[0]["symbol"]
        else:
            print(f"No ticker found for: {company_name}")
            return None
    except Exception as e:
        print(f"Error fetching ticker: {e}")
        return None

In [13]:
company_names = input("Enter the company name, use , to separate multiple inputs: ")
print(f"Company name: {company_names}")
company_names = company_names.split(",")
print(f"Company names: {company_names}")
company_tickers = [get_ticker_by_company_name(name.strip()) for name in company_names]
print(f"Ticker for '{company_names}' is: {company_tickers}")

Company name: apple, tesla, upstart
Company names: ['apple', ' tesla', ' upstart']
Ticker for '['apple', ' tesla', ' upstart']' is: ['AAPL', 'TSLA', 'UPST']


## use edgar api to search company using their ticker to fetch FR document 

In [14]:
def get_comp_sec(ticker):
    """
    Get the company SECURITIES AND EXCHANGE COMMISSION reports for a given ticker symbol.
    
    Args:
        ticker (str): The ticker symbol of the company.
    
    Returns:
        object: filings of the company.
    """
    base_url = "https://api.sec-api.io"
    payload = {
                "query": f'formType:\"10-K\" AND ticker:{ticker} AND filedAt:[2020-01-01 TO 2025-01-31]',
                "from": "0",
                "size": "50",
                "sort": [{ "filedAt": { "order": "desc" }}]
            }
    response = req.post(base_url, json=payload, headers={'Authorization': SEC_API_KEY})
    
    if response.status_code == 200:
        comp_tenk_filing = response.json()
        return comp_tenk_filing
    else:
        raise Exception(f'Error fetching Company 10 k filings. Status code: {response.status_code}')

In [15]:
def safe_get_filings(tkr):
    try:
        time.sleep(sleep_time + random.uniform(0, 0.5))
        return get_comp_sec(tkr)
    except Exception as e:
        print(f"Error fetching {tkr}: {e}")
        return None

In [16]:
company_filings_info = [safe_get_filings(tkr) for tkr in company_tickers]
print(company_filings_info)

[{'total': {'value': 5, 'relation': 'eq'}, 'query': {'from': 0, 'size': 50}, 'filings': [{'ticker': 'AAPL', 'formType': '10-K', 'accessionNo': '0000320193-24-000123', 'cik': '320193', 'companyNameLong': 'Apple Inc. (Filer)', 'companyName': 'Apple Inc.', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm', 'description': 'Form 10-K - Annual report [Section 13 and 15(d), not S-K Item 405]', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt', 'filedAt': '2024-11-01T06:01:36-04:00', 'documentFormatFiles': [{'sequence': '1', 'size': '1503780', 'documentUrl': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm', 'description': '10-K', 'type': '10-K'}, {'sequence': '2', 'size': '120785', 'documentUrl': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/a10-kexhibit4109282024.htm', 'description': 'EX-4.1', 'type': 'EX-4.1'}, {'

In [17]:
#intialize an empty list
data=[]
for comp in company_filings_info:
    if comp is not None:
        for filing in comp['filings']:
            data.append({
                "company": filing["companyName"],
                "filing_date": filing["filedAt"],
                "link": filing["linkToFilingDetails"],
                "linkToTxt": filing["linkToTxt"]
            })
   
    else:
        print("No filings found.")


In [None]:
# Save the filing text to a file
# Create a directory to save the filing text files
def create_directory(company, date, filing_text):
    if not os.path.exists("filing_texts"):
        os.makedirs("filing_texts")
    if filing_text:
        with open(f"filing_texts/{company}_{date}.txt", "w") as f:
            f.write(filing_text)
   

In [12]:
def beautiful_soup(url):
    """
    Get the soup object from the url.
    
    Args:
        url (str): The url of the filing.
    
    Returns:
        object: soup object.
    """
    headers = {
    'User-Agent': EMAIL
    }
    response = req.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        return soup.get_text()
    else:
        raise Exception(f'Error fetching filing. Status code: {response.status_code}')

### Dataframe of financial report for companies

In [21]:
financial_report_df = pd.DataFrame(data)
financial_report_df.head()

Unnamed: 0,company,filing_date,link,linkToTxt
0,Apple Inc.,2024-11-01T06:01:36-04:00,https://www.sec.gov/Archives/edgar/data/320193...,https://www.sec.gov/Archives/edgar/data/320193...
1,Apple Inc.,2023-11-02T18:08:27-04:00,https://www.sec.gov/Archives/edgar/data/320193...,https://www.sec.gov/Archives/edgar/data/320193...
2,Apple Inc.,2022-10-27T18:01:14-04:00,https://www.sec.gov/Archives/edgar/data/320193...,https://www.sec.gov/Archives/edgar/data/320193...
3,Apple Inc.,2021-10-28T18:04:28-04:00,https://www.sec.gov/Archives/edgar/data/320193...,https://www.sec.gov/Archives/edgar/data/320193...
4,Apple Inc.,2020-10-29T18:06:25-04:00,https://www.sec.gov/Archives/edgar/data/320193...,https://www.sec.gov/Archives/edgar/data/320193...


## instantiate  and create a open ai object using the provided api key and prompt engineer to summarize a given financial report

In [11]:

llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0.9)
prompt_template = PromptTemplate(
    template="Give me an example of a meal that could be made using the following ingredients: {ingredients}",
    input_variables=["ingredients"]
)

meal_chain = LLMChain(
    llm=llm, 
    prompt=prompt_template, 
    verbose=True
)

  meal_chain = LLMChain(


### Get sentiment analyses of financial report using ProsusAI/finbert

In [None]:
#use a request to get the data
# Create an empty column for filing text
def build_financial_report_df(filing_text):
    financial_report_df["filing_text"] = None
#make the request
    for idx, row in financial_report_df.iterrows():
        url = row["link"]
        print(filing_text)
        # Assign the filing text to the appropriate row
        financial_report_df.at[idx, "filing_text"] = filing_text



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(response.content, 'lxml')



intc-20241228000005086312-28December 28, 20242024FYfalse0.0010.0015050——0.0010.00110,00010,0004,3304,2284,3304,228http://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#PropertyPlantAndEquipmentAndFinanceLeaseRightOfUseAssetAfterAccumulatedDepreciationAndAmortizationhttp://fasb.org/us-gaap/2024#PropertyPlantAndEquipmentAndFinanceLeaseRightOfUseAssetAfterAccumulatedDepreciationAndAmortizationhttp://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#CostOfGoodsAndServicesSoldhttp://fasb.org/us-gaap/2024#OtherAssetsCurrenthttp://fasb.org/us-gaap/2024#OtherAssetsCurrenthttp://fasb.org/us-gaap/2024#OtherAssetsNoncurrenthttp://fasb.org/us-gaap/2024#OtherAssetsNoncurrenthttp://fasb.org/us-gaap/2024#AccruedIncomeTaxesCurre

In [None]:
financial_report_df

In [None]:
#sentiment analysis


# Silence transformers logging (optional)
logging.set_verbosity_error()

# Load FinBERT sentiment analysis pipeline
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert"
)

#define function to get overall sentiment
def overall_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return {
            "overall_sentiment": "error",
            "average_score": None
        }
    
    # Break the text into chunks (basic method)
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]

    #make empty list for sentiment score
    sentiment_score = []

    # Analyze sentiment
    for chunk in chunks:
        result = sentiment_analyzer(chunk[:512])
        sentiment_score.append(result[0]["score"])
    
    # Calculate the average score
    average_score = np.mean(sentiment_score)
    
    # Determine the overall sentiment based on the average score
    if average_score >= 0.75:
        overall_sentiment = 'positive'
    elif average_score <= 0.25:
        overall_sentiment = 'negative'
    else:
        overall_sentiment = 'neutral'
    
    return {
           "overall_sentiment": overall_sentiment,
           "average_score": average_score
    }

#add to data_df dataframe
sentiment_results = financial_report_df["filing_text"].apply(overall_sentiment)
financial_report_df["sentiment"] = sentiment_results.apply(lambda x: x["overall_sentiment"])
financial_report_df["sentiment_score"] = sentiment_results.apply(lambda x: x["average_score"])
    



In [None]:
financial_report_df

## create a LLM that model that iterate through the list of pdf files to analyze, syntesize, and summarize the pdf pages content

In [42]:
def summarize_financial_report(text, num_sentences=5):
    """
    Summarizes a financial report using a frequency-based approach with NLTK.

    Args:
        text (str): The financial report text.
        num_sentences (int): The desired number of sentences in the summary.

    Returns:
        str: The generated summary.
    """
    financial_keywords = ["revenue", "profit", "income", "loss", "assets", "liabilities",
                            "equity", "cash flow", "margin", "earnings", "sales", "billion",
                            "million", "thousand", "%", "increase", "decrease", "growth"]

    # 1. Tokenization
    words = word_tokenize(text.lower())

    # 2. Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped_words = [w.translate(table) for w in words]

    # 3. Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in stripped_words if w.isalpha() and w not in stop_words]

    # 4. Word frequency calculation
    word_frequency = {}
    for word in filtered_words:
        word_frequency[word] = word_frequency.get(word, 0) + 1

    # Calculate weighted frequencies (optional, can help emphasize important words)
    max_frequency = max(word_frequency.values()) if word_frequency else 1
    weighted_frequency = {word: freq / max_frequency for word, freq in word_frequency.items()}

    # 5. Sentence tokenization
    sentences = sent_tokenize(text)

    # 6. Sentence scoring
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            word = word.translate(table)
            if word in weighted_frequency:
                score_multiplier = 1.5 if word in financial_keywords else 1.0
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weighted_frequency[word] * score_multiplier

    # 7. Summary generation
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    summary_sentences = sorted_sentences[:num_sentences]
    summary = " ".join(summary_sentences)

    return summary

In [48]:
def save_summary_to_file(company, date, summary):
    if not os.path.exists("summaries"):
        os.makedirs("summaries")
    with open(f"summaries/{company}_{date}_summary.txt", "w") as f:
        f.write(summary)

In [None]:
for link_text_url in data:
    link = link_text_url['link']
    filing_text = beautiful_soup(link)
    date = link_text_url['filing_date']
    company = link_text_url['company']
    create_directory(company, date, filing_text)
    save_summary_to_file(company, date, summarize_financial_report(filing_text, num_sentences=3))
    print(f"Filing text for {company} on {date} saved.")
    build_financial_report_df(filing_text)
    # Append the filing text to the data list
    data.append({
        "company": company,
        "filing_date": date,
        "filing_text": filing_text
    })
print(data)



https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
[{'company': 'Apple Inc.', 'filing_date': '2024-11-01T06:01:36-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt'}, {'company': 'Apple Inc.', 'filing_date': '2023-11-02T18:08:27-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0000320193-23-000106.txt'}, {'company': 'Apple Inc.', 'filing_date': '2022-10-27T18:01:14-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/aapl-20220924.htm', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt'}, {'company': 'Apple Inc.', 'filing_date': '2021-10-28T18:04:28-04:00', 'link': 'https://www.sec.g