In [1]:
import openai
import pandas as pd
import tqdm

import picologging as logging

from pydantic import BaseModel, Field

from typing import Optional, List, Union, Literal
import json
import datetime
import pathlib

In [2]:
# Read the OpenAI API key from a local file kept outside the notebook.
# The content is stripped because token files commonly end with a newline,
# and that newline would otherwise become part of the key sent to the API.
with open("../tokens/openai_token.txt", encoding="utf-8") as f:
    openai_token = f.read().strip()

In [3]:
# Timestamp identifying this run; later cells reuse `now_str` for output paths.
now = datetime.datetime.now()
now_str = now.strftime("%Y%m%d_%H%M%S")

# Make sure both the shared log folder and this run's own folder exist.
log_root = pathlib.Path("../notebooks_logging/find_files")
run_dir = log_root / now_str
for directory in (log_root, run_dir):
    directory.mkdir(parents=True, exist_ok=True)

# Every record goes both to a per-run log file and to the notebook output.
file_handler = logging.FileHandler(f"../notebooks_logging/find_files/{now_str}.log")
stream_handler = logging.StreamHandler()

log_settings = dict(
    encoding='utf-8',
    format='%(asctime)s | %(levelname)s | %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%dT%H:%M:%S',
    handlers=[file_handler, stream_handler],
    force=True,
)
logging.basicConfig(**log_settings)

# Root logger plus a short alias used throughout the notebook.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
li = logger.info
li("start")

2025-02-24T22:04:27 | INFO | start


In [4]:
class Question(BaseModel):
    """A single question: its text and the kind of answer it expects."""
    # Free-form question text.
    text: str
    # Expected answer kind; restricted to these four literal values.
    kind: Literal["number", "name", "boolean", "names"]

class SourceReference(BaseModel):
    """Pointer to one page of one PDF, identified by the file's SHA1 hash."""
    pdf_sha1: str = Field(..., description="SHA1 hash of the PDF file")
    page_index: int = Field(..., description="Physical page number in the PDF file")

class Answer(BaseModel):
    """Answer to one question, optionally echoing the question and citing source pages."""
    # question_text and kind are optional and default to None.
    question_text: Optional[str] = Field(None, description="Text of the question")
    kind: Optional[Literal["number", "name", "boolean", "names"]] = Field(None, description="Kind of the question")
    # NOTE(review): the string "N/A" is already accepted by the `str` member of this
    # union; the Literal looks like documentation of the sentinel value — confirm.
    value: Union[float, str, bool, List[str], Literal["N/A"]] = Field(..., description="Answer to the question, according to the question schema")
    # Defaults to an empty list when no references are supplied.
    references: List[SourceReference] = Field([], description="References to the source material in the PDF file")

class AnswerSubmission(BaseModel):
    """Top-level submission payload: team email, submission name and the list of answers."""
    team_email: str = Field(..., description="Email that your team used to register for the challenge")
    submission_name: str = Field(..., description="Unique name of the submission (e.g. experiment name)")
    answers: List[Answer] = Field(..., description="List of answers to the questions")

In [5]:
# Load the question list, validate every entry against `Question`, and log a preview.
li("Questions:")
with open("../data_in/questions.json") as questions_file:
    questions = json.load(questions_file)
questions_typed = [Question(**raw_question) for raw_question in questions]
for preview_question in questions[:5]:
    li(preview_question)

# Load the subset metadata (one entry per document) and log a preview.
li("Subset:")
with open("../data_in/subset.json") as subset_file:
    subset = json.load(subset_file)
for preview_entry in subset[:5]:
    li(preview_entry)

2025-02-24T22:04:27 | INFO | Questions:
2025-02-24T22:04:27 | INFO | {'text': "According to the annual report, what is the Operating margin (%) for Altech Chemicals Ltd  (within the last period or at the end of the last period)? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-24T22:04:27 | INFO | {'text': "According to the annual report, what is the Operating margin (%) for Cofinimmo  (within the last period or at the end of the last period)? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-24T22:04:27 | INFO | {'text': 'Did Cofinimmo outline any new ESG initiatives in the annual report?', 'kind': 'boolean'}
2025-02-24T22:04:27 | INFO | {'text': "What is the total number of employees let go by Hagerty, Inc. according to the annual report? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-24T22:04:27 | INFO | {'text': "Which leadership **positions** changed at Renold plc in the reporting period? If data is not available, return 'N

In [6]:
# Collect the company name of every subset entry, in subset order, and log them all.
li("Company names:")
li("---")
subset_companies_list = [entry["company_name"] for entry in subset]
for company_name in subset_companies_list:
    li(company_name)
li("---")

2025-02-24T22:04:27 | INFO | Company names:
2025-02-24T22:04:27 | INFO | ---
2025-02-24T22:04:27 | INFO | KeyCorp
2025-02-24T22:04:27 | INFO | Summit Materials, Inc.
2025-02-24T22:04:27 | INFO | Gibson Energy Inc.
2025-02-24T22:04:27 | INFO | HCW Biologics Inc.
2025-02-24T22:04:27 | INFO | Essential Metals Limited
2025-02-24T22:04:27 | INFO | EVI Industries, Inc.
2025-02-24T22:04:27 | INFO | Koenig & Bauer AG
2025-02-24T22:04:27 | INFO | Microchip Technology Incorporated
2025-02-24T22:04:27 | INFO | Corbus Pharmaceuticals Holdings, Inc.
2025-02-24T22:04:27 | INFO | Audalia Resources Limited
2025-02-24T22:04:27 | INFO | Enerflex Ltd.
2025-02-24T22:04:27 | INFO | MFA Financial, Inc.
2025-02-24T22:04:27 | INFO | Strike Resources Limited
2025-02-24T22:04:27 | INFO | Altech Chemicals Ltd
2025-02-24T22:04:27 | INFO | HV Bancorp, Inc.
2025-02-24T22:04:27 | INFO | Zymeworks Inc.
2025-02-24T22:04:27 | INFO | iBio, Inc.
2025-02-24T22:04:27 | INFO | Winnebago Industries, Inc.
2025-02-24T22:04:27 

In [None]:
class CompanyAnswer(BaseModel):
    """Shape of the JSON the company-extraction prompt asks the model to return."""
    result: List[str] = Field(..., description="List of company names")
    result_prob: float = Field(..., description="Probability that the result is the right one")

client = openai.OpenAI(
    # This is the default and can be omitted
    api_key=openai_token,
)

# Sampling settings shared by both prompts: each prompt is sent N_SAMPLES times
# and the answers are reconciled by `_select_consensus`.
N_SAMPLES = 3
MODEL_NAME = "o1-mini"
ANSWER_DELIMITER = "@@@"


class ListContainsAnswer(BaseModel):
    """Shape of the JSON the "find this company in the list" prompt asks the model to return."""
    result_name: str = Field(..., description="Company name")
    result_list_position: Union[int, Literal["N/A"]] = Field(..., description="Position of the company in the list, or N/A if not found")
    result_prob: float = Field(..., description="Probability that the result is the right one")


# Stage 1 instructions: extract the company names mentioned in a question.
EXTRACT_COMPANIES_INSTRUCTIONS = """Find company names in this question. There could be zero or more company names
    ---
    Output format:
    The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
    The output is a JSON that corresponds to the following schema:
    {
        "type": "object",
        "properties": {
            "result": {
                "description": "One company name",
                "type": "array",
                "items": {
                    "type": "string"
                }
            },
            "result_prob": {
                "description": "Probability that the result is the right one. Float from 0 to 1.",
                "type": "number"
            }
        },
        "required": [
            "result",
            "result_prob",
        ]
    }
    """

# Stage 2 instructions: map one extracted name onto the known company list.
# The "required" section names `result_name`, matching the declared property
# (it previously said `result`, which does not exist in this schema).
FIND_IN_LIST_INSTRUCTIONS = "Find this company name in the following list. " + """
---
Output format:
The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
The output is a JSON that corresponds to the following schema:
{
    "type": "object",
    "properties": {
        "result_name": {
            "description": "The company name found in list. If not found return N/A",
            "type": "string",
        },
        "result_list_position": {
            "description": "The position of the found company in a list. If not found return N/A",
            "type": "string",
        },
        "result_prob": {
            "description": "Probability that the result is the right one. Float from 0 to 1.",
            "type": "number"
        }
    },
    "required": [
        "result_name",
        "result_list_position",
        "result_prob",
    ]
}
"""


def _build_extract_prompt(q_text):
    """Return the stage 1 prompt for one question text."""
    return EXTRACT_COMPANIES_INSTRUCTIONS + f"\n    ---\n    Question:\n    {q_text}\n    "


def _build_find_in_list_prompt(company, known_companies):
    """Return the stage 2 prompt asking for `company` among `known_companies`."""
    return (
        FIND_IN_LIST_INSTRUCTIONS
        + f"\n---\nCompany name to find:\n{company}\n---\nList of companies:\n"
        + "\n".join(known_companies)
    )


def _empty_company_answer():
    """Fallback for an unusable stage 1 answer: no companies, zero confidence."""
    return {"result": [], "result_prob": 0.0}


def _missing_list_match():
    """Fallback for an unusable stage 2 answer: nothing found, zero confidence."""
    return {"result_name": "N/A", "result_list_position": "N/A", "result_prob": 0.0}


def _parse_tagged_json(raw, model_cls):
    """Extract and validate the JSON wrapped in delimiters from a model reply.

    Returns the decoded dict when the reply contains exactly one delimited
    section that decodes as JSON and validates against `model_cls`;
    returns None (after logging a warning) otherwise.
    """
    # `raw` can be None when the API returns no text content.
    parts = (raw or "").split(ANSWER_DELIMITER)
    if len(parts) != 3:
        logger.warning(f"model reply is not wrapped in one pair of {ANSWER_DELIMITER}: {raw!r}")
        return None
    try:
        payload = json.loads(parts[1])
        # Validation only; a non-dict payload raises TypeError here.
        model_cls(**payload)
    except (TypeError, ValueError) as exc:
        # JSON decode errors and pydantic validation errors are both ValueErrors.
        logger.warning(f"model reply failed validation ({exc}): {parts[1]!r}")
        return None
    return payload


def _sample_answers(prompt, model_cls, make_fallback):
    """Send `prompt` N_SAMPLES times and return one answer dict per completion.

    Replies that cannot be parsed or validated are replaced by `make_fallback()`,
    so every returned dict has the keys the caller expects.
    """
    answers = []
    for _ in range(N_SAMPLES):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=MODEL_NAME,
        )
        parsed = _parse_tagged_json(chat_completion.choices[0].message.content, model_cls)
        answers.append(parsed if parsed is not None else make_fallback())
    return answers


def _select_consensus(candidates, key):
    """Pick one answer dict: highest `result_prob` first, then most votes.

    Votes are counted among the highest-probability answers by the string form
    of `candidates[i][key]`; on a tie the earliest answer wins. Missing,
    non-numeric or NaN probabilities count as 0.0. Returns None for no candidates.
    """
    if not candidates:
        return None

    def probability(candidate):
        try:
            value = float(candidate.get("result_prob"))
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if value != value else value  # NaN is the only value != itself

    scored = [(probability(candidate), str(candidate.get(key)), candidate) for candidate in candidates]
    top_probability = max(score for score, _, _ in scored)
    finalists = [(label, candidate) for score, label, candidate in scored if score == top_probability]

    votes = {}
    for label, _ in finalists:
        votes[label] = votes.get(label, 0) + 1
    # max() returns the first maximal key, i.e. the earliest answer on ties.
    winning_label = max(votes, key=votes.get)
    return next(candidate for label, candidate in finalists if label == winning_label)


# Known company names as a frame; the row index is the document id used downstream.
df_subset = pd.DataFrame({"subset_company": subset_companies_list})

progress_bar = tqdm.tqdm(questions)

temp_answers = []

for question in progress_bar:
    li(progress_bar)

    q_text = question["text"]
    q_kind = question["kind"]
    answer = {"question_text": q_text, "kind": q_kind}

    # Stage 1: which companies does the question mention?
    msg = _build_extract_prompt(q_text)
    li(f"msg=\n{msg}")
    model_answers = _sample_answers(msg, CompanyAnswer, _empty_company_answer)
    li(f"model_answers=\n{model_answers}")
    list_companies = _select_consensus(model_answers, "result")["result"]
    li(f"list_companies={list_companies}")

    # Stage 2: map every mentioned company onto a row of the subset list.
    subset_documents_to_look_through = []
    for company in list_companies:
        msg = _build_find_in_list_prompt(company, subset_companies_list)
        li(f"msg=\n{msg}")
        model_answers = _sample_answers(msg, ListContainsAnswer, _missing_list_match)
        li(f"model_answers=\n{model_answers}")
        company_name_in_list = str(_select_consensus(model_answers, "result_name")["result_name"])

        # Exact-name match; a name not in the list (including "N/A") adds nothing.
        subset_documents_to_look_through_i = df_subset[df_subset["subset_company"] == company_name_in_list].index.to_list()
        subset_documents_to_look_through += subset_documents_to_look_through_i

    li(f"subset_documents_to_look_through={subset_documents_to_look_through}")

    answer["documents_to_look_through"] = subset_documents_to_look_through
    temp_answers.append(answer)

  0%|          | 0/30 [00:00<?, ?it/s]2025-02-24T22:04:28 | INFO |   0%|          | 0/30 [00:00<?, ?it/s]
2025-02-24T22:04:28 | INFO | msg=
Find company names in this question. There could be zero or more company names
    ---
    Output format:
    The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
    The output is a JSON that corresponds to the following schema:
    {
        "type": "object",
        "properties": {
            "result": {
                "description": "One company name",
                "type": "array",
                "items": {
                    "type": "string"
                }
            },
            "result_prob": {
                "description": "Probability that the result is the right one. Float from 0 to 1.",
                "type": "number"
            }
        },
        "required": [
            "result",
            "result_prob",
   

In [10]:
# Persist the per-question results: a CSV copy in this run's log folder,
# plus CSV and parquet copies in the shared temp-data folder.
df_temp_answers = pd.DataFrame(temp_answers)
csv_targets = (
    f"../notebooks_logging/find_files/{now_str}/04 temp_answers.csv",
    "../data_temp/04 temp_answers.csv",
)
for csv_path in csv_targets:
    df_temp_answers.to_csv(csv_path, index=False)
df_temp_answers.to_parquet("../data_temp/04 temp_answers.parquet", index=False)
df_temp_answers

Unnamed: 0,question_text,kind,documents_to_look_through
0,"According to the annual report, what is the Op...",number,[13]
1,"According to the annual report, what is the Op...",number,[25]
2,Did Cofinimmo outline any new ESG initiatives ...,boolean,[25]
3,What is the total number of employees let go b...,number,[27]
4,Which leadership **positions** changed at Reno...,names,[33]
5,What was the Gross margin (%) for Charles & Co...,number,[32]
6,What was the Capital expenditures (in GBP) for...,number,[36]
7,What was the Capital expenditures (in USD) for...,number,[32]
8,What are the names of new products launched by...,names,[15]
9,"For Lipocine Inc., what was the value of Numbe...",number,[29]


In [12]:
# Flatten the per-question document lists into one list (duplicates kept, order preserved).
all_pdfs = [
    document_id
    for question_documents in df_temp_answers["documents_to_look_through"]
    for document_id in question_documents
]
li(all_pdfs)

2025-02-25T12:10:25 | INFO | [13, 25, 25, 27, 33, 32, 36, 32, 15, 29, 17, 29, 9, 29, 32, 10, 14, 21, 33, 29, 25, 19, 19, 34, 17, 37, 25, 26, 38, 26]


In [None]:
# Distinct document ids in ascending order; each keeps the index of its first occurrence.
unique_doc_ids = pd.Series(all_pdfs).drop_duplicates()
doc_ids = unique_doc_ids.sort_values()
li(doc_ids)

2025-02-25T12:18:52 | INFO | 12     9
15    10
0     13
16    14
8     15
10    17
21    19
17    21
1     25
27    26
3     27
9     29
5     32
4     33
23    34
6     36
25    37
28    38
dtype: int64


In [19]:
# Annotate the subset metadata with whether each document is referenced by any question.
df_subset2 = pd.DataFrame(subset)
# Document ids are row positions in `subset`, so the frame's own index is compared.
# This avoids depending on `df_subset`, a variable that only exists as a leftover
# of the company-lookup loop in an earlier cell.
df_subset2["is_in_questions"] = df_subset2.index.isin(doc_ids)

# Save CSV and parquet copies both in this run's log folder and in the shared temp folder.
run_csv_path = f"../notebooks_logging/find_files/{now_str}/04 subset.csv"
df_subset2.to_csv(run_csv_path, index=False)
df_subset2.to_parquet(f"../notebooks_logging/find_files/{now_str}/04 subset.parquet", index=False)
df_subset2.to_csv("../data_temp/04 subset.csv", index=False)
df_subset2.to_parquet("../data_temp/04 subset.parquet", index=False)
li(run_csv_path)
df_subset2

2025-02-25T12:21:19 | INFO | ../notebooks_logging/find_files/20250224_220427/04 subset.csv


Unnamed: 0,sha1,cur,company_name,major_industry,mentions_recent_mergers_and_acquisitions,has_leadership_changes,has_layoffs,has_executive_compensation,has_rnd_investment_numbers,has_new_product_launches,...,has_dividend_policy_changes,has_share_buyback_plans,has_capital_structure_changes,mentions_new_risk_factors,has_guidance_updates,has_regulatory_or_litigation_issues,has_strategic_restructuring,has_supply_chain_disruptions,has_esg_initiatives,is_in_questions
0,0a9e39e1d2e176f3a766a0e86af82772f1654a6e,USD,KeyCorp,Financial Services,True,True,False,True,True,True,...,True,True,False,True,True,True,False,False,True,False
1,20c4badd4303f7aba1a298d84be3722fa84e0c67,USD,"Summit Materials, Inc.",Financial Services,True,True,False,True,False,False,...,False,True,False,True,False,False,False,True,True,False
2,30b729c124a24ff21f37431dab6b58dfe7ba56fa,USD,Gibson Energy Inc.,Energy and Utilities,False,False,False,True,False,False,...,True,True,False,False,False,True,False,False,False,False
3,42f03832077a92b3b34855cb7f9ef93563143838,EUR,HCW Biologics Inc.,Healthcare,True,False,False,True,True,True,...,False,False,False,True,True,True,False,False,False,False
4,49a9bf0542f3e5ff0250064bfed4369ecf6c8a09,USD,Essential Metals Limited,Technology,True,False,False,True,True,False,...,False,False,False,True,False,False,False,False,True,False
5,54e625a049a1713eb5a338b64858c06d74e52489,USD,"EVI Industries, Inc.",Retail,True,False,False,True,False,False,...,False,False,False,True,False,False,False,True,False,False
6,56262704bde6b584dcebfa644faa23a953498a79,EUR,Koenig & Bauer AG,Technology,True,True,False,True,True,True,...,False,False,False,True,True,False,True,True,True,False
7,58b196e02c2d9749d968a29039e9c2b29d3d31e1,USD,Microchip Technology Incorporated,Technology,True,True,False,True,True,True,...,False,True,True,True,True,True,False,True,True,False
8,5ac3ccdec033f81fab4c2ed9ae86553f4904a450,EUR,"Corbus Pharmaceuticals Holdings, Inc.",Pharmaceuticals,True,False,False,True,True,True,...,False,False,False,True,True,True,False,False,False,False
9,601aba58deffc81230c837404aa883de0d1dde1c,USD,Audalia Resources Limited,Technology,False,True,False,True,True,True,...,False,False,False,True,False,False,False,False,False,True
