# Imports

In [148]:
import os

import pandas as pd

from collections import Counter

from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

import utility.utility as util
import utility.prompts as prompts

# Below import and instructions simply for display
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# resets import once changes have been applied
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Directory and File Paths

In [23]:
path_data = os.path.join(os.getcwd(), "data")

path_stmts = os.path.join(path_data, "predict")

file_excel = os.path.join(path_data, "sample_28feb.xlsx")

# Load Environment Variables

In [24]:
_ = load_dotenv(find_dotenv())

# Load Data

In [187]:
df_meta = pd.read_excel(file_excel, sheet_name="manual cases") #pd.read_csv(file_meta)
df_data = pd.read_excel(file_excel, sheet_name="Sheet1") #pd.read_csv(file_data)

# Preliminary Adjustments

In [188]:
df_meta.dropna(axis=1, inplace=True, thresh = int(df_meta.shape[0]*.1))
df_meta.drop(columns=["checked by", "firm", "path_txt", "path_doc", "manual"], inplace=True)
df_meta["filename"] = df_meta["filename"].astype("str") + ".txt"
df_meta["filepath"] = df_meta["filename"].apply(lambda x: os.path.join(path_stmts, x))

In [189]:
df_data.drop(columns=["path_txt", "path_doc","selected","easy",], inplace=True)
df_data.dropna(axis=1, inplace=True, thresh = int(df_data.shape[0]*.1))
df_data["filename"] = df_data["filename"].astype("str") + ".txt"

# Split "Train" "Test"

In [190]:
test = df_meta.iloc[:10,:].copy()
df_meta = df_meta.iloc[10:,:].copy()

# Extract most common terms:

In [177]:
min_count = 7
terms_auditor = util.concat_terms(util.det_commonly_used_terms(df_meta["terms_audit"], min_count=min_count), ", ")
terms_notes = util.concat_terms(util.det_commonly_used_terms(df_meta["terms_notes"], min_count=min_count), ", ")
terms_acc_stds = util.concat_terms(util.det_commonly_used_terms(df_meta["term"], min_count = 1), " - ")

section_terms_auditor = prompts.common_terms_section_auditor.format(terms_auditor = terms_auditor)
section_terms_notes = prompts.common_terms_section_notes.format(terms_notes = terms_notes)
section_terms_acc_stds = prompts.common_terms_accounting_stds.format(acc_std = terms_acc_stds)

# Construct Base Prompt

In [329]:
length = 0

System Context

In [330]:
system = prompts.system_context_full_task + \
         section_terms_auditor + \
         section_terms_notes + \
         section_terms_acc_stds + \
         prompts.standard_answer
length += util.count_tokens(system)

User_Assitant Context

In [332]:
user_assistant = []
for id in test.filename.unique():
    user_content = ""
    assistant_content = ""
    tmp = test[test.filename == id]
    for entity in tmp.source.unique():
        user_content += str(tmp[tmp.source == entity]["paragraph (context)"].values) + " ... "
        assistant_content += f"{entity}: {tmp[tmp.source == entity]['term'].values}" + '\n'

    user_assistant.append((user_content, assistant_content))
for ua in user_assistant:
    length += util.count_tokens(ua[0])

Prompt

In [333]:
i = 4

In [334]:
text = util.parse_txt(df_meta["filepath"].iloc[i])
prompt = util.clean_text(text)
length += util.count_tokens(prompt)

Token Lenght of Prompt

In [335]:
length

15418

# Construct Messages

In [321]:
messages = util.create_messages_context_gpt(system, prompt, user_assistant)

# Initialize Client

In [322]:
client = OpenAI()

# Call API

In [323]:
response = util.get_completion(client, messages, model="gpt-4-0125-preview")

# Answer

In [324]:
response

ChatCompletion(id='chatcmpl-8yA0WLImbLETeKKLOMruC2EomEKRi', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Notes: 'Australian Accounting Standards, Australian Accounting Interpretations, other authoritative pronouncements of the Australian Accounting Standards Board and the Corporations Act'\nAudit: 'Australian Accounting Standards and International Financial Reporting Standards as disclosed in Note 2'", role='assistant', function_call=None, tool_calls=None))], created=1709348840, model='gpt-4-0125-preview', object='chat.completion', system_fingerprint='fp_70b2088885', usage=CompletionUsage(completion_tokens=46, prompt_tokens=15619, total_tokens=15665))

In [325]:
print(response.choices[0].message.content)

Notes: 'Australian Accounting Standards, Australian Accounting Interpretations, other authoritative pronouncements of the Australian Accounting Standards Board and the Corporations Act'
Audit: 'Australian Accounting Standards and International Financial Reporting Standards as disclosed in Note 2'


# Test

In [297]:
df_meta.iloc[i]["filename"]

'61102602.txt'

In [307]:
df_meta.iloc[i]["source"]

'notes'

In [326]:
df_meta.iloc[i+1]["term"]

' Australian Accounting Standards and the Corporations Regulations 2001|International Financial Reporting Standards'

In [284]:
df_meta.iloc[i]

cc_iso3                                                              CYP
year                                                                2007
filename                                                    61102602.txt
paragraph (context)    a summary of significant accounting policies a...
 sentence              a summary of significant accounting policies a...
term                   international financial reporting standards as...
classification                                                        eu
page (txt)                                                             7
note                                                                 NaN
source manual                                                      audit
source_cats_term       [consolidated cash flow statement for the year...
terms_notes            summary of significant accounting policies|exp...
terms_audit            audit evidence about the|we believe that the a...
source                                             