In [74]:
# import libraries
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient


In [83]:
# Load environment variables
load_dotenv()

# change this
endpoint = os.getenv("AZURE_COGNITIVE_ENDPOINT")
key = os.getenv("AZURE_COGNITIVE_KEY")
gpt4_endpoint = os.getenv("OPENAI_ENDPOINT")
gpt4_key = os.getenv("OPENAI_KEY")

# Step 1: Preprocessing Large 10K/10Q documents, get Balance Sheets

Edgar Tool

In [None]:
from edgar import *
# Tell the SEC who you are
set_identity("Jessie Zeng lingruiz@andrew.cmu.edu")

In [13]:
def edgarTool_get_filing(ticker, form_type, num_filings):
    # Get the latest 10-K filing for Apple
    filing = Company(ticker).get_filings(form=form_type).latest(num_filings)
    return filing

In [18]:
# get filing by looking up the ticker and form type
filing = edgarTool_get_filing("PLYA", "10-K", 1)

In [20]:
from edgar.financials import Financials
financials = Financials(filing.xbrl())
financials.get_balance_sheet()
# financials.get_income_statement()
# financials.get_cash_flow_statement()

                                            [1;38;5;38mPlaya Hotels & Resorts N.V.[0m                                            
                                            [1mConsolidated Balance Sheets[0m                                            
                                                                                                                   
 [1m [0m[1m                                                                         [0m[1m [0m [1m            [0m[1m [0m [1m      2023[0m[1m [0m [1m      2022[0m[1m [0m 
 ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── 
  [1;38;5;32mASSETS                                                                   [0m                                        
  [2;38;5;250m  Cash and cash equivalents †                                            [0m  [2;38;5;249mthousands   [0m     272,520     283,945  
  [2;38;5;250m  Trade and other receivabl

In [None]:
# another way to get balance sheet
tenk = Company("AAPL").get_filings(form="10-K").latest(1).obj()
financials = tenk.financials
financials.get_balance_sheet

<bound method Financials.get_balance_sheet of ╭───────────────────────────────── [1;38;5;39mApple Inc.[0m financials[1;32m period ended 2023-09-30[0m ─────────────────────────────────╮
│ ╭───┬───────────────────────────────────────────────╮                                                           │
│ │[1m [0m[1m [0m[1m [0m│[1m [0m[1mStandard Financial Statements                [0m[1m [0m│                                                           │
│ ├───┼───────────────────────────────────────────────┤                                                           │
│ │ 1 │ Cover Page                                    │                                                           │
│ │ 2 │ Consolidated Balance Sheets                   │                                                           │
│ │ 3 │ Income Statements                             │                                                           │
│ │ 4 │ Consolidated Statement of Cash Flows          │            

# Step 1.1: azure Document Intelligence

Input: a pdf n

Output: location of identified text, table and figures with corresponding text

In [16]:
# sample document
base_path = os.path.abspath(os.path.join(os.getcwd(), '../data'))

formPath = os.path.join(base_path, "10k/plya_consolidated_balance_sheets.pdf")
print(formPath)

/Users/lingruiz/Documents/capstone/data/10k/plya_consolidated_balance_sheets.pdf


In [20]:
def open_pdf(path):
    with open(path, "rb") as f:
        return f.read()


def get_result(formPath, client):
    # read and analyze the document
    poller = client.begin_analyze_document("prebuilt-layout", open_pdf(formPath))
    result = poller.result()
    return result

In [21]:
# create a client
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [22]:
# run the service, save result
result = get_result(formPath, document_analysis_client)


## Explore result structure

In [42]:
# tables are embedded in result.tables, to query the tables, do the following: 
# result.tables returns a list of tables in the pdf document, 
# access row count with row_count, and column count with column_count


for index, table in enumerate(result.tables):
    print(f"table #{index}: \nrow count:{table.row_count}")
    print(f"column count:{table.column_count} \n")

table #0: 
row count:32
column count:3 

table #1: 
row count:32
column count:4 



## Convert result to df/csv

In [45]:
def convert_azdoc_todf(tables):
    tablesCollected = []
    for table_idx, table in enumerate(tables):
        # Initialize an empty matrix
        matrix = [["" for _ in range(table.column_count)] for _ in range(table.row_count)]
        tablesCollected.append(matrix)
        for cell in table.cells:
            row_index = cell.row_index
            column_index = cell.column_index

            if row_index < table.row_count and column_index < table.column_count:
                matrix[row_index][column_index] = cell.content
    
    return tablesCollected

In [93]:
convert_azdoc_todf(result.tables)

[[['', 'As of December 31,', ''],
  ['', '2023', '2022'],
  ['ASSETS', '', ''],
  ['Cash and cash equivalents', '$ 272,520', '$ 283,945'],
  ['Trade and other receivables, net', '74,762', '62,946'],
  ['Insurance recoverable', '9,821', '34,191'],
  ['Accounts receivable from related parties', '5,861', '8,806'],
  ['Inventories', '19,963', '20,046'],
  ['Prepayments and other assets', '54,294', '44,177'],
  ['Property and equipment, net', '1,415,572', '1,536,567'],
  ['Derivative financial instruments', '2,966', '3,510'],
  ['Goodwill, net\n:unselected:', '60,642', '61,654'],
  ['Other intangible assets\n:unselected:', '4,357', '6,556'],
  ['Deferred tax assets', '12,967', '7,422'],
  ['Total assets', '$ 1,933,725', '$ 2,069,820'],
  ["LIABILITIES AND SHAREHOLDERS' EQUITY", '', ''],
  ['Trade and other payables', '$ 196,432', '$ 231,652'],
  ['Payables to related parties', '10,743', '6,852'],
  ['Income tax payable', '11,592', '990'],
  ['Debt', '1,061,376', '1,065,453'],
  ['Other liab

In [90]:
def convert_azdoc_tostring(tables):
    result = ""
    for table_idx, table in enumerate(tables):
        result += f"Table {table_idx + 1}:\n"
        matrix = [["" for _ in range(table.column_count)] for _ in range(table.row_count)]
        
        for cell in table.cells:
            row_index = cell.row_index
            column_index = cell.column_index

            if row_index < table.row_count and column_index < table.column_count:
                matrix[row_index][column_index] = cell.content
        
        # Convert each row in the matrix to a string
        for row in matrix:
            row_str = "\t".join(row)  # Separate columns by tabs for readability
            result += row_str + "\n"
        
        result += "\n"  # Separate tables by a blank line
        
    return result

def convert_azdoc_to_markdown(tables):
    markdown_tables = ""
    for table_idx, table in enumerate(tables):
        # Initialize an empty matrix for the table
        matrix = [["" for _ in range(table.column_count)] for _ in range(table.row_count)]
        
        # Fill in the matrix with cell content
        for cell in table.cells:
            row_index = cell.row_index
            column_index = cell.column_index
            if row_index < table.row_count and column_index < table.column_count:
                matrix[row_index][column_index] = cell.content
        
        # Convert matrix to Markdown format
        markdown_table = "| " + " | ".join(matrix[0]) + " |\n"  # Header row
        markdown_table += "| " + " | ".join(['---' for _ in matrix[0]]) + " |\n"  # Divider row
        
        for row in matrix[1:]:  # Data rows
            markdown_table += "| " + " | ".join(row) + " |\n"
        
        # Add each table to the result with a separating line between tables
        markdown_tables += f"### Table {table_idx + 1}\n" + markdown_table + "\n\n"
    
    return markdown_tables



In [91]:
table_md = convert_azdoc_to_markdown(result.tables)

# Step 2: Credit Score Calculations

In [None]:
# TODO: find some numeric driven calculations to perform credit scoring related calculations, 
# before feeding the results into LLM

# Step 3: Credit Score Analysis

### OpenAI GPT-4

In [84]:
gpt4_endpoint

'https://ai-lingruiz7765ai414752438778.openai.azure.com'

In [80]:
from openai import AzureOpenAI
    
client = AzureOpenAI(
    api_key=gpt4_key,  
    api_version="2024-08-01-preview",
    azure_endpoint = gpt4_endpoint
    )

In [87]:
SYSTEM_PROMPT = "given the balance sheet above, calculate below ratios: Current Ratio = Current Assets / Current Liabilities and compute  a credit score to evaluate the financial status of the company"

In [None]:
# Define deployment name for the model
# if needed, add to session chat history to prior prompt the model
# example use: 
# {"role": "user", "content": "Does Azure OpenAI support customer managed keys?"},
#  # {"role": "assistant", "content": "Yes, customer managed keys are supported by Azure OpenAI."},
# exceeding rate limit will result in 429 error
deployment_name = 'gpt-4'

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": table_md}
    ]
)

print(response.choices[0].message.content)


### FinGPT 

In [95]:
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizerFast
from peft import PeftModel  # 0.5.0

# Load Models
base_model = "NousResearch/Llama-2-13b-hf" 
peft_model = "FinGPT/fingpt-sentiment_llama2-13b_lora"
tokenizer = LlamaTokenizerFast.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = LlamaForCausalLM.from_pretrained(base_model, trust_remote_code=True, device_map = "cuda:0", load_in_8bit = True,)
model = PeftModel.from_pretrained(model, peft_model)
model = model.eval()

# Make prompts
prompt = [
'''Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}
Input: FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing its growth strategy by increasingly focusing on technologically more demanding HDI printed circuit boards PCBs .
Answer: ''',
'''Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}
Input: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Answer: ''',
'''Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}
Input: A tinyurl link takes users to a scamming site promising that users can earn thousands of dollars by becoming a Google ( NASDAQ : GOOG ) Cash advertiser .
Answer: ''',
]

# Generate results
tokens = tokenizer(prompt, return_tensors='pt', padding=True, max_length=512)
res = model.generate(**tokens, max_length=512)
res_sentences = [tokenizer.decode(i) for i in res]
out_text = [o.split("Answer: ")[1] for o in res_sentences]

# show results
for sentiment in out_text:
    print(sentiment)

# Output:    
# positive
# neutral
# negative

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


RuntimeError: Failed to import transformers.models.llama.tokenization_llama_fast because of the following error (look up to see its traceback):
tokenizers>=0.13.3 is required for a normal functioning of this module, but found tokenizers==0.13.2.

### Claude Financial Analyst