## import your modules needed

In [None]:
import requests as req
import time
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline, logging
import numpy as np
from langchain_openai import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain.chains import LLMChain
import streamlit as st
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
load_dotenv()
SEC_API_KEY = os.getenv('SEC_API_KEY')
OPENAI_KEY = os.getenv('OPENAI_API_KEY')
FMP_KEY = os.getenv('FMP_API_KEY')
EMAIL=os.getenv("EMAIL")

sleep_time = 0.5


### instantiate an OpanAI LLM object

In [31]:
# llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0.9)
llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0.9)

#### prompt the user to get company name to retrieve ticker

In [4]:
#Get company ticker by the provided name
def get_ticker_by_company_name(company_name):
    url = "https://financialmodelingprep.com/api/v3/search"
    params = {
        "query": company_name,
        "limit": 1,
        "exchange": "NASDAQ",
        "apikey": FMP_KEY
    }

    try:
        response = req.get(url, params=params)
        response.raise_for_status()
        results = response.json()
        if results:
            return results[0]["symbol"]
        else:
            print(f"No ticker found for: {company_name}, using default tickers.")
            default_tickers = ['UPST', 'KO', 'TSLA', 'INTC']
            return default_tickers
    except Exception as e:
        print(f"Error fetching ticker: {e}")
        return None

In [5]:
company_names = input("Enter the company name, use , to separate multiple inputs: ")
print(f"Company name: {company_names}")
company_names = company_names.split(",")
print(f"Company names: {company_names}")
company_tickers = [get_ticker_by_company_name(name.strip()) for name in company_names]
print(f"Ticker for '{company_names}' is: {company_tickers}")

Company name: meta
Company names: ['meta']
Ticker for '['meta']' is: ['META']


## use edgar api to search company using their ticker to fetch FR document 

In [6]:
def get_comp_sec(ticker):
    """
    Get the company SECURITIES AND EXCHANGE COMMISSION reports for a given ticker symbol.
    
    Args:
        ticker (str): The ticker symbol of the company.
    
    Returns:
        object: filings of the company.
    """
    base_url = "https://api.sec-api.io"
    payload = {
                "query": f'formType:\"10-K\" AND ticker:{ticker} AND filedAt:[2020-01-01 TO 2025-01-31]',
                "from": "0",
                "size": "50",
                "sort": [{ "filedAt": { "order": "desc" }}]
            }
    response = req.post(base_url, json=payload, headers={'Authorization': SEC_API_KEY})
    
    if response.status_code == 200:
        comp_tenk_filing = response.json()
        return comp_tenk_filing
    else:
        raise Exception(f'Error fetching Company 10 k filings. Status code: {response.status_code}')

In [7]:
def safe_get_filings(tkr):
    try:
        time.sleep(sleep_time + random.uniform(0, 0.5))
        return get_comp_sec(tkr)
    except Exception as e:
        print(f"Error fetching {tkr}: {e}")
        return None

In [None]:
company_filings_info = [safe_get_filings(tkr) for tkr in company_tickers]
print(company_filings_info)

In [21]:
company_info = []
for comp in company_filings_info:
    if comp is not None:
        for filing in comp['filings']:
            company_info.append({
                "company": filing["companyName"],
                "filing_date": filing["filedAt"],
                "link": filing["linkToFilingDetails"],
                "text": filing["linkToTxt"],
            })
    else:
        print("No filings found.")
print(company_info)

[{'company': 'Meta Platforms, Inc.', 'filing_date': '2025-01-29T20:00:50-05:00', 'link': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/meta-20241231.htm', 'text': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680125000017/0001326801-25-000017.txt'}, {'company': 'Meta Platforms, Inc.', 'filing_date': '2024-02-01T19:39:02-05:00', 'link': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680124000012/meta-20231231.htm', 'text': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680124000012/0001326801-24-000012.txt'}, {'company': 'Meta Platforms, Inc.', 'filing_date': '2023-02-01T20:26:31-05:00', 'link': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680123000013/meta-20221231.htm', 'text': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680123000013/0001326801-23-000013.txt'}, {'company': 'Meta Platforms, Inc.', 'filing_date': '2022-02-02T21:11:04-05:00', 'link': 'https://www.sec.gov/Archives/edgar/data/1326801/000132680122000018/fb-

In [40]:
def beautiful_soup(url, company, date):
    """
    Get the soup object from the url.
    
    Args:
        url (str): The url of the filing.
    
    Returns:
        object: soup object.
    """
    headers = {
    'User-Agent': EMAIL
    }
    response = req.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'xml')
        # write the soup to a file
        date = date.replace("-", "_")
        company = company.replace(" ", "_")
        if not os.path.exists("financialreport"):
            os.makedirs("financialreport")
        with open(f'financialreport/{company}_{date}_raw.txt', 'w') as f:
            f.write(soup.get_text())
        return soup.get_text()
    else:
        raise Exception(f'Error fetching filing. Status code: {response.status_code}')

### Get sentiment analyses of financial report using ProsusAI/finbert

In [12]:
#sentiment analysis

#define function to get overall sentiment
def overall_sentiment(financial_report_df, text):
    # Silence transformers logging (optional)
    logging.set_verbosity_error()

    # Load FinBERT sentiment analysis pipeline
    sentiment_analyzer = pipeline(
        "sentiment-analysis",
        model="ProsusAI/finbert"
    )
    if not isinstance(text, str) or len(text.strip()) == 0:
        return {
            "overall_sentiment": "error",
            "average_score": None
        }
    
    # Break the text into chunks (basic method)
    
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]

    #make empty list for sentiment score
    sentiment_score = []

    # Analyze sentiment
    for chunk in chunks:
        result = sentiment_analyzer(chunk[:512])
        sentiment_score.append(result[0]["score"])
    
    # Calculate the average score
    average_score = np.mean(sentiment_score)
    
    # Determine the overall sentiment based on the average score
    if average_score >= 0.75:
        overall_sentiment = 'positive'
    elif average_score <= 0.25:
        overall_sentiment = 'negative'
    else:
        overall_sentiment = 'neutral'
    return {
           "overall_sentiment": overall_sentiment,
           "average_score": average_score
    }

#add to data_df dataframe
sentiment_results = financial_report_df["filing_text"].apply(overall_sentiment)
financial_report_df["sentiment"] = sentiment_results.apply(lambda x: x["overall_sentiment"])
financial_report_df["sentiment_score"] = sentiment_results.apply(lambda x: x["average_score"])
    



NameError: name 'financial_report_df' is not defined

In [13]:
# define dataframe
# Create an empty column for filing text
def build_financial_report_df(data):
    financial_report_df = pd.DataFrame(data=data)
    overall_sentiment(financial_report_df)

## create a LLM that model that iterate through the list of pdf files to analyze, syntesize, and summarize the pdf pages content

In [14]:
### create a dataframe to count how many times a financial_keywords appears in the filing and add to the dataframe
def display_financial_keywords_count_df(text, keywords):
    """
    Count the occurrences of financial keywords in the text.
    
    Args:
        text (str): The text to analyze.
        keywords (list): A list of financial keywords to count.
    
    Returns:
        dict: A dictionary with keyword counts.
    """
    word_tokens = word_tokenize(text.lower())
    keyword_counts = {keyword: word_tokens.count(keyword.lower()) for keyword in keywords}
    financial_keyword_df = pd.DataFrame(keyword_counts.items(), columns=['Keyword', 'Count'])
    display(financial_keyword_df)
    

In [16]:
def summarize_financial_report(text, num_sentences=5):
    """
    Summarizes a financial report using a frequency-based approach with NLTK.

    Args:
        text (str): The financial report text.
        num_sentences (int): The desired number of sentences in the summary.

    Returns:
        str: The generated summary.
    """
    financial_keywords = ["revenue", "profit", "income", "loss", "assets", "liabilities",
                            "equity", "cash flow", "margin", "earnings", "sales", "billion",
                            "million", "thousand", "%", "increase", "decrease", "growth"]

    # 1. Tokenization
    words = word_tokenize(text.lower())

    # 2. Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped_words = [w.translate(table) for w in words]

    # 3. Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in stripped_words if w.isalpha() and w not in stop_words]

    # 4. Word frequency calculation
    word_frequency = {}
    for word in filtered_words:
        word_frequency[word] = word_frequency.get(word, 0) + 1

    # Calculate weighted frequencies (optional, can help emphasize important words)
    max_frequency = max(word_frequency.values()) if word_frequency else 1
    weighted_frequency = {word: freq / max_frequency for word, freq in word_frequency.items()}

    # 5. Sentence tokenization
    sentences = sent_tokenize(text)

    # 6. Sentence scoring
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            word = word.translate(table)
            if word in weighted_frequency:
                score_multiplier = 1.5 if word in financial_keywords else 1.0
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weighted_frequency[word] * score_multiplier

    # 7. Summary generation
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    summary_sentences = sorted_sentences[:num_sentences]
    summary = " ".join(summary_sentences)
    display_financial_keywords_count_df(summary, financial_keywords)

    return summary

In [None]:
def save_summary_to_file(company, date, summary):
    if not os.path.exists("summaries"):
        os.makedirs("summaries")
    with open(f"summaries/{company}_{date}_summary.txt", "w") as f:
        f.write(summary)

In [None]:
detail_company_data = []
for info in company_info:
    link = info['link']
    date = info['filing_date']
    company = info['company']
    filing_text = beautiful_soup(link, company, date)
    save_summary_to_file(company, date, summarize_financial_report(filing_text, num_sentences=3))
    print(f"Filing text for {company} on {date} saved.")
    # Append the filing text to the data list
    detail_company_data.append({
        "company": company,
        "filing_date": date,
        "filing_text": filing_text
    })
print(detail_company_data)
# build_financial_report_df(detail_company_data)




[{'company': 'Meta Platforms, Inc.', 'filing_date': '2025-01-29T20:00:50-05:00', 'filing_text': '\nmeta-20241231false2024FY0001326801P4YP1YP5Yhttp://fasb.org/us-gaap/2024#AccruedLiabilitiesCurrenthttp://fasb.org/us-gaap/2024#OtherLiabilitiesNoncurrent353450iso4217:USDxbrli:sharesiso4217:USDxbrli:sharesmeta:segmentmeta:unitxbrli:puremeta:employeemeta:objectormeta:appealiso4217:EURmeta:classActionmeta:votemeta:shareBasedCompensationPlanmeta:notice00013268012024-01-012024-12-3100013268012024-06-300001326801us-gaap:CommonClassAMember2025-01-240001326801us-gaap:CommonClassBMember2025-01-2400013268012024-12-3100013268012023-12-310001326801us-gaap:CommonClassAMember2023-12-310001326801us-gaap:CommonClassAMember2024-12-310001326801us-gaap:CommonClassBMember2024-12-310001326801us-gaap:CommonClassBMember2023-12-3100013268012023-01-012023-12-3100013268012022-01-012022-12-310001326801us-gaap:CommonStockMember2021-12-310001326801us-gaap:AdditionalPaidInCapitalMember2021-12-310001326801us-gaap:Accum

### run LLM app using OpenAi Langchain model

In [2]:
!streamlit run OpenaiLangchain.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.1.229:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
  financial_chain = LLMChain(llm=llm, prompt=prompt_template, verbose=True)
^C
[34m  Stopping...[0m
