## import your modules needed

In [52]:
import requests as req
import time
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline, logging
import numpy as np
from langchain_openai import OpenAI
import random
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [53]:
load_dotenv()
SEC_API_KEY = os.getenv('SEC_API_KEY')
OPENAI_KEY = os.getenv('OPENAI_API_KEY')
FMP_KEY = os.getenv('FMP_API_KEY')
EMAIL=os.getenv("EMAIL")

sleep_time = 0.5


### instantiate an OpanAI LLM object

In [54]:
llm = OpenAI(openai_api_key=OPENAI_KEY, temperature=0.9)

### Instantiate ProsusAI/Finbert model object

In [55]:
# Silence transformers logging (optional)
logging.set_verbosity_error()

# Load FinBERT sentiment analysis pipeline
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert"
)

#### prompt the user to get company name to retrieve ticker

In [56]:
#Get company ticker by the provided name
def get_ticker_by_company_name(company_name):
    url = "https://financialmodelingprep.com/api/v3/search"
    params = {
        "query": company_name,
        "limit": 1,
        "exchange": "NASDAQ",
        "apikey": FMP_KEY
    }

    try:
        response = req.get(url, params=params)
        response.raise_for_status()
        results = response.json()
        if results:
            return results[0]["symbol"]
        else:
            print(f"No ticker found for: {company_name}, using default tickers.")
            default_tickers = ['UPST', 'KO', 'TSLA', 'INTC']
            return default_tickers
    except Exception as e:
        print(f"Error fetching ticker: {e}")
        return None

In [57]:
company_names = input("Enter the company name, use , to separate multiple inputs: ")
print(f"Company name: {company_names}")
company_names = company_names.split(",")
print(f"Company names: {company_names}")
company_tickers = [get_ticker_by_company_name(name.strip()) for name in company_names]
print(f"Ticker for '{company_names}' is: {company_tickers}")

Company name: apple
Company names: ['apple']
Ticker for '['apple']' is: ['AAPL']


## use edgar api to search company using their ticker to fetch FR document 

In [58]:
def get_comp_sec(ticker):
    """
    Get the company SECURITIES AND EXCHANGE COMMISSION reports for a given ticker symbol.
    
    Args:
        ticker (str): The ticker symbol of the company.
    
    Returns:
        object: filings of the company.
    """
    base_url = "https://api.sec-api.io"
    payload = {
                "query": f'formType:\"10-K\" AND ticker:{ticker} AND filedAt:[2020-01-01 TO 2025-01-31]',
                "from": "0",
                "size": "50",
                "sort": [{ "filedAt": { "order": "desc" }}]
            }
    response = req.post(base_url, json=payload, headers={'Authorization': SEC_API_KEY})
    
    if response.status_code == 200:
        comp_tenk_filing = response.json()
        return comp_tenk_filing
    else:
        raise Exception(f'Error fetching Company 10 k filings. Status code: {response.status_code}')

In [59]:
def safe_get_filings(tkr):
    try:
        time.sleep(sleep_time + random.uniform(0, 0.5))
        return get_comp_sec(tkr)
    except Exception as e:
        print(f"Error fetching {tkr}: {e}")
        return None

In [60]:
company_filings_info = [safe_get_filings(tkr) for tkr in company_tickers]
print(company_filings_info)

[{'total': {'value': 5, 'relation': 'eq'}, 'query': {'from': 0, 'size': 50}, 'filings': [{'ticker': 'AAPL', 'formType': '10-K', 'accessionNo': '0000320193-24-000123', 'cik': '320193', 'companyNameLong': 'Apple Inc. (Filer)', 'companyName': 'Apple Inc.', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm', 'description': 'Form 10-K - Annual report [Section 13 and 15(d), not S-K Item 405]', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt', 'filedAt': '2024-11-01T06:01:36-04:00', 'documentFormatFiles': [{'sequence': '1', 'size': '1503780', 'documentUrl': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm', 'description': '10-K', 'type': '10-K'}, {'sequence': '2', 'size': '120785', 'documentUrl': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/a10-kexhibit4109282024.htm', 'description': 'EX-4.1', 'type': 'EX-4.1'}, {'

In [61]:
company_info = []
for comp in company_filings_info:
    if comp is not None:
        for filing in comp['filings']:
            company_info.append({
                "company": filing["companyName"],
                "filing_date": filing["filedAt"],
                "link": filing["linkToFilingDetails"],
                "text": filing["linkToTxt"],
            })
    else:
        print("No filings found.")
print(company_info)

[{'company': 'Apple Inc.', 'filing_date': '2024-11-01T06:01:36-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm', 'text': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0000320193-24-000123.txt'}, {'company': 'Apple Inc.', 'filing_date': '2023-11-02T18:08:27-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930.htm', 'text': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0000320193-23-000106.txt'}, {'company': 'Apple Inc.', 'filing_date': '2022-10-27T18:01:14-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/aapl-20220924.htm', 'text': 'https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/0000320193-22-000108.txt'}, {'company': 'Apple Inc.', 'filing_date': '2021-10-28T18:04:28-04:00', 'link': 'https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm', 'text': 'https://www.sec.gov/A

In [62]:
def beautiful_soup(url, company, date):
    """
    Get the soup object from the url.
    
    Args:
        url (str): The url of the filing.
    
    Returns:
        object: soup object.
    """
    headers = {
    'User-Agent': EMAIL
    }
    response = req.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'xml')
        # write the soup to a file
        date = date.replace("-", "_")
        company = company.replace(" ", "_")
        if not os.path.exists("financialreport"):
            os.makedirs("financialreport")
        with open(f'financialreport/{company}_{date}_raw.txt', 'w') as f:
            f.write(soup.get_text())
        return soup.get_text()
    else:
        raise Exception(f'Error fetching filing. Status code: {response.status_code}')

### Get sentiment analyses of financial report using ProsusAI/finbert

In [63]:

#define function to get overall sentiment
def overall_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return {
            "overall_sentiment": "error",
            "average_score": None
        }
    
    # Break the text into chunks (basic method)
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]

    #make empty list for sentiment score
    sentiment_score = []

    # Analyze sentiment
    for chunk in chunks:
        result = sentiment_analyzer(chunk[:512])
        sentiment_score.append(result[0]["score"])
    
    # Calculate the average score
    average_score = np.mean(sentiment_score)
    
    # Determine the overall sentiment based on the average score
    if average_score >= 0.75:
        overall_sentiment = 'positive'
    elif average_score <= 0.25:
        overall_sentiment = 'negative'
    else:
        overall_sentiment = 'neutral'
    
    return {
           "overall_sentiment": overall_sentiment,
           "average_score": average_score
    }

## create a LLM that model that iterate through the list of pdf files to analyze, syntesize, and summarize the pdf pages content

In [64]:
### create a dataframe to count how many times a financial_keywords appears in the filing and add to the dataframe
def display_financial_keywords_count_df(text, keywords):
    """
    Count the occurrences of financial keywords in the text.
    
    Args:
        text (str): The text to analyze.
        keywords (list): A list of financial keywords to count.
    
    Returns:
        dict: A dictionary with keyword counts.
    """
    word_tokens = word_tokenize(text.lower())
    keyword_counts = {keyword: word_tokens.count(keyword.lower()) for keyword in keywords}
    financial_keyword_df = pd.DataFrame(keyword_counts.items(), columns=['Keyword', 'Count'])
    display(financial_keyword_df)
    

In [65]:
def summarize_financial_report(text, num_sentences=5):
    """
    Summarizes a financial report using a frequency-based approach with NLTK.

    Args:
        text (str): The financial report text.
        num_sentences (int): The desired number of sentences in the summary.

    Returns:
        str: The generated summary.
    """
    financial_keywords = ["revenue", "profit", "income", "loss", "assets", "liabilities",
                            "equity", "cash flow", "margin", "earnings", "sales", "billion",
                            "million", "thousand", "%", "increase", "decrease", "growth"]

    # 1. Tokenization
    words = word_tokenize(text.lower())

    # 2. Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped_words = [w.translate(table) for w in words]

    # 3. Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in stripped_words if w.isalpha() and w not in stop_words]

    # 4. Word frequency calculation
    word_frequency = {}
    for word in filtered_words:
        word_frequency[word] = word_frequency.get(word, 0) + 1

    # Calculate weighted frequencies (optional, can help emphasize important words)
    max_frequency = max(word_frequency.values()) if word_frequency else 1
    weighted_frequency = {word: freq / max_frequency for word, freq in word_frequency.items()}

    # 5. Sentence tokenization
    sentences = sent_tokenize(text)

    # 6. Sentence scoring
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            word = word.translate(table)
            if word in weighted_frequency:
                score_multiplier = 1.5 if word in financial_keywords else 1.0
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weighted_frequency[word] * score_multiplier

    # 7. Summary generation
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    summary_sentences = sorted_sentences[:num_sentences]
    summary = " ".join(summary_sentences)
    display_financial_keywords_count_df(summary, financial_keywords)

    return summary

In [66]:
def save_summary_to_file(company, date, summary):
    if not os.path.exists("summaries"):
        os.makedirs("summaries")
    with open(f"summaries/{company}_{date}_summary.txt", "w") as f:
        f.write(summary)

In [67]:
detail_company_data = []
for info in company_info:
    link = info['link']
    date = info['filing_date']
    company = info['company']
    filing_text = beautiful_soup(link, company, date)
    save_summary_to_file(company, date, summarize_financial_report(filing_text, num_sentences=3))
    print(f"Filing text for {company} on {date} saved.")
    # Append the filing text to the data list
    detail_company_data.append({
        "company": company,
        "filing_date": date,
        "filing_text": filing_text
    })
print(overall_sentiment(detail_company_data[0]['filing_text']))
# Create a DataFrame from the data list and overall sentiment
data_df = pd.DataFrame(detail_company_data)
sentiment_results = data_df["filing_text"].apply(overall_sentiment)
data_df["sentiment"] = sentiment_results.apply(lambda x: x["overall_sentiment"]) 
data_df["sentiment_score"] = sentiment_results.apply(lambda x: x["average_score"])
data_df = data_df.drop(columns=["filing_text"])


Unnamed: 0,Keyword,Count
0,revenue,0
1,profit,0
2,income,21
3,loss,5
4,assets,5
5,liabilities,5
6,equity,10
7,cash flow,0
8,margin,10
9,earnings,4


Filing text for Apple Inc. on 2024-11-01T06:01:36-04:00 saved.


Unnamed: 0,Keyword,Count
0,revenue,0
1,profit,0
2,income,18
3,loss,5
4,assets,5
5,liabilities,5
6,equity,10
7,cash flow,0
8,margin,10
9,earnings,4


Filing text for Apple Inc. on 2023-11-02T18:08:27-04:00 saved.


Unnamed: 0,Keyword,Count
0,revenue,0
1,profit,0
2,income,12
3,loss,5
4,assets,5
5,liabilities,5
6,equity,10
7,cash flow,0
8,margin,0
9,earnings,2


Filing text for Apple Inc. on 2022-10-27T18:01:14-04:00 saved.


Unnamed: 0,Keyword,Count
0,revenue,0
1,profit,0
2,income,13
3,loss,5
4,assets,5
5,liabilities,5
6,equity,11
7,cash flow,0
8,margin,0
9,earnings,3


Filing text for Apple Inc. on 2021-10-28T18:04:28-04:00 saved.


Unnamed: 0,Keyword,Count
0,revenue,0
1,profit,0
2,income,14
3,loss,5
4,assets,5
5,liabilities,5
6,equity,11
7,cash flow,0
8,margin,0
9,earnings,3


Filing text for Apple Inc. on 2020-10-29T18:06:25-04:00 saved.
{'overall_sentiment': 'positive', 'average_score': np.float64(0.8449823115434911)}


## Display Sentiment analysis dataframe

In [68]:
display(data_df)

Unnamed: 0,company,filing_date,sentiment,sentiment_score
0,Apple Inc.,2024-11-01T06:01:36-04:00,positive,0.844982
1,Apple Inc.,2023-11-02T18:08:27-04:00,positive,0.849689
2,Apple Inc.,2022-10-27T18:01:14-04:00,positive,0.851534
3,Apple Inc.,2021-10-28T18:04:28-04:00,positive,0.865713
4,Apple Inc.,2020-10-29T18:06:25-04:00,positive,0.866818


### run LLM app using OpenAi Langchain model

In [None]:
!streamlit run OpenaiLangchain.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.1.229:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
  financial_chain = LLMChain(llm=llm, prompt=prompt_template, verbose=True)
  response = financial_chain.run(


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are a financial analyst assistant.
    Based on the report type "Financial Forecast", generate a financial summary 
    that includes recent earnings, revenue growth, CEO sentiment, and 5-year 
    profit/revenue forecasts for the company: apple.

    If the exact ticker is not provided, infer it automatically. Respond professionally.
    [0m

[1m> Finished chain.[0m
