In [19]:
import asyncio
from datetime import datetime
import enum
from instructor import OpenAISchema
from instructor.dsl import MultiTask
import json
from pydantic import BaseModel, Field
from typing import Optional, List, Any
from typing_extensions import Annotated
import openai
import instructor

import os

# Azure OpenAI connection settings.
# The API key is read from the environment so that no credential is stored in
# the notebook source, its saved outputs, or version-control history.
# NOTE(review): the key that used to be hardcoded on this line must be treated
# as leaked and rotated.
openai.api_base = "https://futu-002-caeast-001.openai.azure.com/"
openai.api_key = os.environ["OPENAI_API_KEY"]  # raises KeyError early if the key is not configured
openai.api_version = "2023-07-01-preview"
openai.api_type = "azure"

import sys

# Make the local project checkout importable for the `filingsgpt` import below.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path every time.
PROJECT_ROOT = "/data/home/jarvixwang/project/filings_gpt"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from filingsgpt.utils import pdf_loader

In [22]:

class RowOfTable(OpenAISchema):
    # Schema for a single extracted table row: the row's index plus the row
    # contents serialised as one JSON string whose keys are taken from the
    # table's columns (see the row_json description).
    #
    # NOTE(review): documented with `#` comments on purpose. Class docstrings
    # and Field descriptions presumably end up in the function schema sent to
    # the model, so adding or editing them would change the prompt -- confirm
    # against instructor/pydantic before converting this to a docstring.
    # NOTE(review): both fields are annotated as non-optional (int / str) yet
    # default to None; consider Optional[...] if a missing value is expected.
    # NOTE(review): the row_json description refers to `row_values`, which only
    # exists below as commented-out code.
    row_index: int = Field(default=None, description="The index of the row in the table")
    # row_text: str = Field(default=None, description="The full text of the row contents, including column names and values for each row.")
    #row_columns: List[str] = Field(default=None, description="The columns involved in the current row")
    # row_data: List[Property] = Field(default=None, description="""The column keys and their corresponding values of the current row""")
    # row_values: List[Any] = Field(default=None, 
    #                               description="""The values of the current row. It can be number, string, date, etc.""")
    row_json: Annotated[str, Field(default=None, description="For each value in row_values, find a correct key from the table columns to relate to the value, drop the columns without any value if necessary, to form a complete json string to include all values in current row.")]
    
    
    
class Table(OpenAISchema):
    # Schema for one extracted table: an optional title, the raw text of the
    # table, and the list of extracted rows (RowOfTable).
    #
    # NOTE(review): kept as `#` comments rather than a docstring because a class
    # docstring presumably becomes part of the schema sent to the model --
    # confirm before changing.
    # NOTE(review): table_text and rows are annotated as non-optional but
    # default to None, so code iterating `rows` should be prepared for None.
    title: Optional[str] = Field(default=None, description="Title of the table")
    table_text: str = Field(default=None, description="The full text of the table contents, including column names and values for each row.")
    rows: List[RowOfTable] = Field(default=None, description="The rows of the table")
    
class Form144FilingDatabase(OpenAISchema):
    """
    A set of correct named and defined tables appearing in a SEC Form 144 filing.
    Each table has multiple rows. Each row has multiple field and its corresponding value.
    Each row is a json string.
    """

    # Top-level schema requested from the model in get_data(); `tables` is its
    # only field and is required (Field(...)).
    # NOTE(review): the docstring above is left untouched because it presumably
    # serves as the function description sent to the model -- rewording it
    # would change the prompt. Confirm against instructor before editing.
    tables: List[Table] = Field(
        ...,
        description="List of tables in the database",
    )
# MultiTables = MultiTask(Table)

def get_data(doc_text: str) -> Form144FilingDatabase:
    """Extract all tables of a filing from its plain text via a forced function call.

    The filing text is sent to the ``gpt-4`` engine together with a fixed
    system prompt, and the model is forced to answer through the
    ``Form144FilingDatabase`` function schema.

    Args:
        doc_text: Text of the filing to extract tables from.

    Returns:
        The ``Form144FilingDatabase`` built from the model's response.
    """
    # TODO: the system message should be generated by promptgpt
    schema = Form144FilingDatabase.openai_schema
    instructions = """
You are an expert in SEC filings, with substantial experience and knowledge about the US SEC's filing process.
* Experience: 10+ years of experience working for the US SEC, with a focus on extracting and analyzing data from company filings.
* Roles and Companies: Official Officer at the US Securities and Exchange Commission.
* Education: Masters in Finance from the Wharton School of the University of Pennsylvania.
* Skills: Data extraction, financial analysis, SEC filing review, JSON formatting, and table data extraction.
The workflow for extracting data from SEC filings is as follows:
1. Identify Text Fragments for Each Table:
- Look for specific markers or patterns that denote the start and end of each table.
- Common markers may include section headings like "Form type or form number: [Table Title]" or specific field names.
- Pay attention to structural elements that separate tables.
2. Extract Table Titles:
- Once a table's text fragment is identified, extract the title by locating the relevant section heading.
- The title is typically the same as the section heading.
3. Extract Rows Data:
- Identify and extract data from each row within the table.
- Match fields (e.g., "Field Name: Value") and capture both field names and their corresponding values.
- Pay attention to different data types, such as dates, numerical values, and text.
- Ensure that the data is extracted accurately and in the correct order.
- Focus on values of the row, if the number of values is less than the number of fields, you should double check the context and drop the fields without any value if necessary.
4. Format Data as JSON:
- Organize the extracted row data into a structured JSON format.
- Each field name becomes a key, and its value becomes the corresponding value in the JSON structure.
5. Compile All Tables' Data
6. Ensure Completeness:
- Verify that all rows within a table are accurately extracted.
- Ensure that the JSON formatting is consistent and free of errors.
7. Review for Accuracy:
- Cross-check the data extracted against the original SEC filing to ensure accuracy.
    """
    chat_messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": f"Given a filing text: {doc_text}"},
    ]
    # Naming the function in `function_call` forces the model to reply through the schema.
    response = openai.ChatCompletion.create(
        engine="gpt-4",
        temperature=0.0,
        functions=[schema],
        function_call={"name": schema["name"]},
        messages=chat_messages,
    )
    return Form144FilingDatabase.from_response(response)

# Evaluation sample: Tesla (TSLA) Form 144, proposed sale of securities.
pdf_filepath = (
    "/data/home/jarvixwang/project/filings_gpt/data/20231020评测集/"
    "股票交易_144_美股_TSLA_特斯拉：拟议出售证券_英文_0/primary_doc.xml.pdf"
)
# Alternative sample (NVDA Form 4); uncomment to switch inputs.
# pdf_filepath = "/data/home/jarvixwang/project/filings_gpt/data/20231020评测集/股票交易_4_美股_NVDA_英伟达：持股变动声明-高管 Shoquist Debora_英文_15/wk-form4_1694652514.xml.pdf"
doc_text = pdf_loader.get_pdf_text(pdf_filepath)

# Run the table extraction on the loaded filing text.
tables = get_data(doc_text)

In [24]:
# Print every extracted table: its title, one line per row, then a separator.
for table in tables.tables:
    print(f"table: {table.title}")
    # `rows` defaults to None in the Table schema, so a table returned without
    # rows is treated as empty instead of raising TypeError on iteration.
    for row in table.rows or []:
        print(f"row: {row.row_index}, json: {row.row_json}")
    print("----------")

table: 144: Filer Information
row: 1, json: {"Filer CIK": "0001771364", "Filer CCC": "XXXXXXX", "Is this a LIVE or TEST Filing?": "LIVE"}
----------
table: 144: Issuer Information
row: 1, json: {"Name of Issuer": "Tesla, Inc.", "SEC File Number": "001-34756", "Address of Issuer": "1 TESLA ROAD AUSTIN TEXAS 78725", "Phone": "(512) 516-8177", "Name of Person for Whose Account the Securities are To Be Sold": "ZACHARY KIRKHORN"}
----------
table: 144: Securities Information
row: 1, json: {"Title of the Class of Securities To Be Sold": "Common", "Name and Address of the Broker": "Morgan Stanley Smith Barney LLC Executive Financial Services 1 New York Plaza 38th Floor New York NY 10004", "Number of Shares or Other Units To Be Sold": "3752", "Aggregate Market Value": "919240.00", "Number of Shares or Other Units Outstanding": "3173994467", "Approximate Date of Sale": "09/05/202 3", "Name the Securities Exchange": "NASDAQ"}
----------
table: 144: Securities To Be Sold
row: 1, json: {"Title of 

In [21]:

# class Property(OpenAISchema):
#     key: str = Field(description="Key of the property, which should existed in the column names of the table")
#     key_existed: bool = Field(description="Determine whether this property appears in the current row")
#     value_starting_from: int = Field(description="Value starting position in given text")
#     value: Optional[Any] = Field(default=None, 
#                                  description="""Value of the property, it may be blank if the value is not existed in the table. 
#                                  You should combine the columns of the entire table and surrounding values to determine which key the current value belongs to. 
#                                  If not `key_existed`, the value keeps None.
#                                  One value starting from the same position in given text can only corresponds to one property.
#                                  The relative order of key appearing in the text is consistent with the relative position order of value, that is to say key1=value1, key2=value2, key3=value3, then key1 is definitely not equal to value3, because value3 appears after key2.
#                                  """)
    


# Multi-row variant of RowOfTable built with instructor's MultiTask. It is used
# below for its `openai_schema`, its `from_response` parser, and the `.tasks`
# list of the parsed result.
MultiRows = MultiTask(RowOfTable)

def get_rows_of_table(table_text: str) -> MultiRows:
    """Extract the rows of a single table from its text via a forced function call.

    Args:
        table_text: Text of one table (column names followed by row values).

    Returns:
        The ``MultiRows`` object built from the model's response.
    """
    # TODO: the system message should be generated by promptgpt
    schema = MultiRows.openai_schema
    system_message = {
        "role": "system",
        "content": "You are a SEC filing expert. You are also an official who has been working for the US SEC for many years. Your daily job is to extract all data from company announcements. You are working to correctly extract all rows data from the given table text existed in SEC filings.",
    }
    user_message = {"role": "user", "content": f"Extract from {table_text}"}
    # Naming the function in `function_call` forces the model to reply through the schema.
    response = openai.ChatCompletion.create(
        messages=[system_message, user_message],
        engine="gpt-4",
        temperature=0.0,
        functions=[schema],
        function_call={"name": schema["name"]},
    )
    return MultiRows.from_response(response)

# Sample inputs for get_rows_of_table(): one string per table, each starting
# with a "table: <title>" line followed by the column names and the row values.
# NOTE(review): these look like fragments of the Tesla Form 144 filing processed
# earlier in the notebook -- confirm the provenance.
table_texts = ["""
table: Issuer Information
Name of Issuer Tesla, Inc.
SEC File Number 001-34756
Address of Issuer 1 TESLA ROAD AUSTIN TEXAS 78725
Phone (512) 516-8177
Name of Person for Whose Account the Securities are To Be Sold ZACHARY KIRKHORN
Relationship to Issuer Officer
""",
"""
table: Securities Information
Title of the Class of Securities To Be Sold
Name and Address of the Broker
Number of Shares or Other Units To Be Sold
Aggregate Market Value
Number of Shares or Other Units Outstanding
Approximate Date of Sale
Name the Securities Exchange
Common Morgan Stanley Smith Barney LLC Executive Financial Services 1 New York Plaza 38th Floor New York NY 10004
3752
919240.00
3173994467
09/05/2023
NASDAQ
""",
"""
table: Securities To Be Sold
Title of the Class
Date you Acquired
Nature of Acquisition Transaction
Name of Person from Whom Acquired
Is this a Gift?
Date Donor Acquired
Amount of Securities Acquired
Date of Payment
Nature of Payment
* Common 12/05/2020 Restricted Stock Issuer 1599 12/05/2020 Not Applicable
Common 06/05/2022 Restricted Stock Issuer 1754 06/05/2022 Not Applicable
Common 06/05/2021 Restricted Stock Issuer 399 06/05/2021 Not Applicable
""",
"""
table: Securities Sold During The Past 3 Months
Name and Address of Seller
Title of Securities Sold
Date of Sale
Amount of Securities Sold
Gross Proceeds
ZACHARY KIRKHORN 1 TESLA ROAD AUSTIN TX 78725 Common 08/04/2023 3750 978750.00
ZACHARY KIRKHORN 1 TESLA ROAD AUSTIN TX 78725 Common 07/05/2023 3750 1045125.00
ZACHARY KIRKHORN 1 TESLA ROAD AUSTIN TX 78725 Common 06/06/2023 7403 1595449.00
"""
]

# Extract the rows of the second-to-last sample ("Securities To Be Sold") and
# print each row's index and JSON payload, followed by a separator line.
rows = get_rows_of_table(table_text=table_texts[-2])
for row in rows.tasks:
    print(f"row: {row.row_index}")
    print(f"row_json: {row.row_json}")
    print("----------")

row: 1
row_json: {"Title of the Class": "Common", "Date you Acquired": "12/05/2020", "Nature of Acquisition Transaction": "Restricted Stock", "Name of Person from Whom Acquired": "Issuer", "Is this a Gift?": "No", "Date Donor Acquired": "12/05/2020", "Amount of Securities Acquired": "1599", "Date of Payment": "12/05/2020", "Nature of Payment": "Not Applicable"}
----------
row: 2
row_json: {"Title of the Class": "Common", "Date you Acquired": "06/05/2022", "Nature of Acquisition Transaction": "Restricted Stock", "Name of Person from Whom Acquired": "Issuer", "Is this a Gift?": "No", "Date Donor Acquired": "06/05/2022", "Amount of Securities Acquired": "1754", "Date of Payment": "06/05/2022", "Nature of Payment": "Not Applicable"}
----------
row: 3
row_json: {"Title of the Class": "Common", "Date you Acquired": "06/05/2021", "Nature of Acquisition Transaction": "Restricted Stock", "Name of Person from Whom Acquired": "Issuer", "Is this a Gift?": "No", "Date Donor Acquired": "06/05/2021",

In [None]:
# from instructor.dsl import llm_validator
# llm_validator()