In [1]:
import openai
import pandas as pd
import tqdm

import picologging as logging

from pydantic import BaseModel, Field

from typing import Optional, List, Union, Literal, Tuple
import json
import datetime
import pathlib
import asyncio

In [2]:
# Read the OpenAI API key from a local file kept outside the notebook.
# Strip surrounding whitespace: text files usually end with a newline, and a
# key containing "\n" produces an invalid Authorization header.
with open("../tokens/openai_token.txt") as f:
    openai_token = f.read().strip()

In [3]:
# Timestamp identifying this run; it names both the log file and the artifact folder.
now = datetime.datetime.now()
now_str = now.strftime("%Y%m%d_%H%M%S")

# Make sure the shared log folder and this run's artifact sub-folder exist.
for run_dir in ("../notebooks_logging/find_files", f"../notebooks_logging/find_files/{now_str}"):
    pathlib.Path(run_dir).mkdir(parents=True, exist_ok=True)

# Every record goes both to the per-run log file and to the default stream handler.
file_handler = logging.FileHandler(f"../notebooks_logging/find_files/{now_str}.log")
stream_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[file_handler, stream_handler],
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S',
    encoding='utf-8',
    force=True,
)

# Root logger plus the short aliases used by every later cell.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
li, lw = logger.info, logger.warning

# Smoke-test both levels.
li("start")
lw("check warning")

2025-02-26T15:56:03 | INFO | start


In [4]:
# Schema of one entry of questions.json: the question text and the kind of
# answer it expects. Used below to validate the loaded questions.
class Question(BaseModel):
    text: str  # the question as written
    kind: Literal["number", "name", "boolean", "names"]  # expected answer type

# Pointer to the evidence for an answer: one page of one PDF, where the PDF is
# identified by its SHA1 hash.
class SourceReference(BaseModel):
    pdf_sha1: str = Field(..., description="SHA1 hash of the PDF file")
    page_index: int = Field(..., description="Physical page number in the PDF file")

# One answer of a submission. `question_text` and `kind` are optional and
# default to None; `value` is required; `references` defaults to an empty list.
class Answer(BaseModel):
    question_text: Optional[str] = Field(None, description="Text of the question")
    kind: Optional[Literal["number", "name", "boolean", "names"]] = Field(None, description="Kind of the question")
    # The accepted types mirror the four question kinds, plus the literal "N/A".
    value: Union[float, str, bool, List[str], Literal["N/A"]] = Field(..., description="Answer to the question, according to the question schema")
    references: List[SourceReference] = Field([], description="References to the source material in the PDF file")

# Top-level submission payload: team identification plus the list of answers.
# NOTE(review): not instantiated in the visible part of this notebook.
class AnswerSubmission(BaseModel):
    team_email: str = Field(..., description="Email that your team used to register for the challenge")
    submission_name: str = Field(..., description="Unique name of the submission (e.g. experiment name)")
    answers: List[Answer] = Field(..., description="List of answers to the questions")

In [5]:
# Load the challenge questions, validate their shape, and log a short preview.
li("Questions:")
with open("../data_in/questions.json") as questions_file:
    questions = json.load(questions_file)
# Building the typed objects fails early if an entry does not match the Question schema.
questions_typed = [Question(**raw_question) for raw_question in questions]
for raw_question in questions[:5]:
    li(raw_question)

# Load the document subset (one record per PDF) and log a short preview.
li("Subset:")
with open("../data_in/subset.json") as subset_file:
    subset = json.load(subset_file)
for subset_row in subset[:5]:
    li(subset_row)

2025-02-26T15:56:03 | INFO | Questions:
2025-02-26T15:56:03 | INFO | {'text': "According to the annual report, what is the Operating margin (%) for Altech Chemicals Ltd  (within the last period or at the end of the last period)? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-26T15:56:03 | INFO | {'text': "According to the annual report, what is the Operating margin (%) for Cofinimmo  (within the last period or at the end of the last period)? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-26T15:56:03 | INFO | {'text': 'Did Cofinimmo outline any new ESG initiatives in the annual report?', 'kind': 'boolean'}
2025-02-26T15:56:03 | INFO | {'text': "What is the total number of employees let go by Hagerty, Inc. according to the annual report? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-26T15:56:03 | INFO | {'text': "Which leadership **positions** changed at Renold plc in the reporting period? If data is not available, return 'N

In [6]:
# Log every company name of the subset, framed by separator lines.
li("Company names:")
li("---")
# Same order as the records in `subset`; duplicates (if any) are kept.
subset_companies_list = [subset_row["company_name"] for subset_row in subset]
for company_name in subset_companies_list:
    li(company_name)
li("---")

2025-02-26T15:56:03 | INFO | Company names:
2025-02-26T15:56:03 | INFO | ---
2025-02-26T15:56:03 | INFO | KeyCorp
2025-02-26T15:56:03 | INFO | Summit Materials, Inc.
2025-02-26T15:56:03 | INFO | Gibson Energy Inc.
2025-02-26T15:56:03 | INFO | HCW Biologics Inc.
2025-02-26T15:56:03 | INFO | Essential Metals Limited
2025-02-26T15:56:03 | INFO | EVI Industries, Inc.
2025-02-26T15:56:03 | INFO | Koenig & Bauer AG
2025-02-26T15:56:03 | INFO | Microchip Technology Incorporated
2025-02-26T15:56:03 | INFO | Corbus Pharmaceuticals Holdings, Inc.
2025-02-26T15:56:03 | INFO | Audalia Resources Limited
2025-02-26T15:56:03 | INFO | Enerflex Ltd.
2025-02-26T15:56:03 | INFO | MFA Financial, Inc.
2025-02-26T15:56:03 | INFO | Strike Resources Limited
2025-02-26T15:56:03 | INFO | Altech Chemicals Ltd
2025-02-26T15:56:03 | INFO | HV Bancorp, Inc.
2025-02-26T15:56:03 | INFO | Zymeworks Inc.
2025-02-26T15:56:03 | INFO | iBio, Inc.
2025-02-26T15:56:03 | INFO | Winnebago Industries, Inc.
2025-02-26T15:56:03 

In [7]:
# Async OpenAI client shared by the request cells below; it authenticates with
# the key loaded from the token file.
client = openai.AsyncOpenAI(api_key=openai_token)

In [8]:
import random

# Expected shape of the model's JSON reply to the company-extraction prompt.
# It is only used to validate the parsed reply in send_one_question.
class CompanyAnswer(BaseModel):
    result: List[str] = Field(..., description="List of company names")
    result_prob: float = Field(..., description="Probability that the result is the right one")



# Instruction text sent before each question. It asks for a JSON object wrapped
# in "@@@" markers and embeds the CompanyAnswer schema as text, so the copy
# inside the string must be kept in sync with the class above by hand.
# NOTE(review): "and empty array" is a typo for "an empty array"; it is left
# untouched here because the string is sent to the model verbatim.
prompt = """Find company names in this question. There could be zero or more company names.
---
Output format:
The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
The output is a JSON that corresponds to the following pydantic class:

class CompanyAnswer(BaseModel):
    result: List[str] = Field(..., description="List of company names")
    result_prob: float = Field(..., description="Probability that the result is the right one")

---
Question:
"""

async def send_one_question(question_i: int, question: str, prompt: str) -> Tuple[int, str, dict]:
    """Ask the model which company names appear in one question.

    Sends ``prompt + question`` as a single user message to the ``o1-mini``
    chat model through the module-level ``client``. The reply is accepted only
    if it contains exactly two ``@@@`` markers and the text between them is
    JSON that validates against ``CompanyAnswer``.

    Args:
        question_i: Index of the question; returned unchanged so the caller
            can match the answer to its slot.
        question: Question text appended to the prompt.
        prompt: Instruction text placed before the question.

    Returns:
        ``(question_i, question, ans_d)``. On success ``ans_d`` is the parsed
        JSON dict. On failure it is ``{"result": <text>, "result_error": True}``
        where ``<text>`` is the unusable reply (or a description of the
        transient API error); the caller uses ``"result_error"`` to retry.

    Raises:
        openai.OpenAIError: for non-transient API failures (for example bad
            credentials or an invalid request), which retrying cannot fix.
    """
    try:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt + question,
                }
            ],
            model="o1-mini",
        )
    except (openai.APIConnectionError, openai.RateLimitError, openai.InternalServerError) as exc:
        # Transient failures go through the caller's retry path instead of
        # raising out of the task and aborting the whole batch.
        return question_i, question, {
            "result": f"{type(exc).__name__}: {exc}",
            "result_error": True,
        }

    # The SDK types `content` as optional; treat a missing body as an empty reply
    # so it is reported as a format error rather than crashing on `.split`.
    ans = chat_completion.choices[0].message.content or ""
    ans_l_splitted = ans.split('@@@')
    if len(ans_l_splitted) != 3:
        # Markers missing or repeated: hand back the raw reply for inspection.
        return question_i, question, {"result": ans, "result_error": True}

    ans_splitted = ans_l_splitted[1]
    try:
        ans_d = json.loads(ans_splitted)
        # Validation only; the raw dict is what gets returned.
        CompanyAnswer(**ans_d)
    except Exception:
        # Invalid JSON, a non-object payload, or a schema mismatch.
        ans_d = {
            "result": ans_splitted,
            "result_error": True
        }
    return question_i, question, ans_d

tasks_done = ["0"]*len(questions)
tasks_tries = {}
tasks = set()
li(f"Sending for every question started")
for i, question_d in enumerate(questions):
    tasks_tries[i] = 1
    tasks.add(asyncio.create_task(send_one_question(i, question_d["text"], prompt)))

tasks_run = []
while tasks:
    done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    for completed in done:
        i, param1, ans_d = await completed
        if "result_error" in ans_d:
            
            if tasks_tries[i] < 5:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Retrying")
                tasks_tries[i] = tasks_tries[i] + 1
                tasks.add(asyncio.create_task(send_one_question(i, param1, prompt)))
            else:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Stop retrying. Returning the current answer with error.")
                tasks_done[i] = "2"
                tasks_run.append((i, param1, ans_d))
                li("".join(tasks_done))
        else:
            tasks_done[i] = "1"
            tasks_run.append((i, param1, ans_d))
            li("".join(tasks_done))

2025-02-26T15:56:03 | INFO | Sending for every question started
2025-02-26T15:56:06 | INFO | 000000000100000000000000000000
2025-02-26T15:56:07 | INFO | 000000000100000000100000000000
2025-02-26T15:56:07 | INFO | 000000001100000000100000000000
2025-02-26T15:56:08 | INFO | 000000001100001000100000000000
2025-02-26T15:56:08 | INFO | 000000001100001000100000001000
2025-02-26T15:56:08 | INFO | 000000011100001000100000001000
2025-02-26T15:56:08 | INFO | 000000011100001000100000001100
2025-02-26T15:56:08 | INFO | 000000011100001100100000001100
2025-02-26T15:56:08 | INFO | 000000011100001100100000001101
2025-02-26T15:56:08 | INFO | 000010011100001100100000001101
2025-02-26T15:56:09 | INFO | 000010011100001100100100001101
2025-02-26T15:56:09 | INFO | 000010011100001100110100001101
2025-02-26T15:56:09 | INFO | 100010011100001100110100001101
2025-02-26T15:56:09 | INFO | 110010011100001100110100001101
2025-02-26T15:56:10 | INFO | 110010011100001110110100001101
2025-02-26T15:56:10 | INFO | 1100101

In [9]:
# Results arrive in completion order; put them back into question order.
tasks_run_sorted = list(tasks_run)
tasks_run_sorted.sort(key=lambda record: record[0])
# Keep only the answer dict of each (index, question, answer) record.
temp_answers = [record[2] for record in tasks_run_sorted]
temp_answers

[{'result': ['Altech Chemicals Ltd'], 'result_prob': 0.99},
 {'result': ['Cofinimmo'], 'result_prob': 1.0},
 {'result': ['Cofinimmo'], 'result_prob': 1.0},
 {'result': ['Hagerty, Inc.'], 'result_prob': 1.0},
 {'result': ['Renold plc'], 'result_prob': 1.0},
 {'result': ['Charles & Colvard, Ltd.'], 'result_prob': 1.0},
 {'result': ['Harworth Group plc'], 'result_prob': 1.0},
 {'result': ['Charles & Colvard, Ltd.'], 'result_prob': 1.0},
 {'result': ['Zymeworks Inc.'], 'result_prob': 1.0},
 {'result': ['Lipocine Inc.'], 'result_prob': 1.0},
 {'result': ['Winnebago Industries, Inc.'], 'result_prob': 1.0},
 {'result': ['Lipocine Inc.'], 'result_prob': 0.99},
 {'result': ['Audalia Resources Limited'], 'result_prob': 0.99},
 {'result': ['Lipocine Inc.'], 'result_prob': 1.0},
 {'result': ['Charles & Colvard, Ltd.'], 'result_prob': 1.0},
 {'result': ['Enerflex Ltd.'], 'result_prob': 0.99},
 {'result': ['HV Bancorp, Inc.'], 'result_prob': 1.0},
 {'result': ['Alien Metals Limited'], 'result_prob':

In [10]:
# Company names per question, in question order.
# A failed extraction stores raw text (a str) under "result"; passing that on
# would make the next cell split it into single characters and treat each one
# as a company name. Such answers contribute no company names instead.
questions_companies = []
for answer_i, answer in enumerate(temp_answers):
    names = answer.get("result")
    if "result_error" in answer or not isinstance(names, list):
        lw(f"Answer {answer_i}: company extraction failed; using an empty company list")
        names = []
    questions_companies.append(names)
questions_companies

[['Altech Chemicals Ltd'],
 ['Cofinimmo'],
 ['Cofinimmo'],
 ['Hagerty, Inc.'],
 ['Renold plc'],
 ['Charles & Colvard, Ltd.'],
 ['Harworth Group plc'],
 ['Charles & Colvard, Ltd.'],
 ['Zymeworks Inc.'],
 ['Lipocine Inc.'],
 ['Winnebago Industries, Inc.'],
 ['Lipocine Inc.'],
 ['Audalia Resources Limited'],
 ['Lipocine Inc.'],
 ['Charles & Colvard, Ltd.'],
 ['Enerflex Ltd.'],
 ['HV Bancorp, Inc.'],
 ['Alien Metals Limited'],
 ['Renold plc'],
 ['Lipocine Inc.'],
 ['Cofinimmo'],
 ['Canadian Tire Corporation'],
 ['Canadian Tire Corporation'],
 ['LVMH'],
 ['Winnebago Industries, Inc.'],
 ['Johns Lyng Group Limited'],
 ['Cofinimmo'],
 ['Maxeon Solar Technologies, Ltd.'],
 ['Nevro Corp.'],
 ['Maxeon Solar Technologies, Ltd.']]

In [11]:
# Flatten the per-question name lists into one list.
questions_companies_extended = []
for names_of_question in questions_companies:
    questions_companies_extended.extend(names_of_question)

# Remove repeated names while keeping the order of first appearance.
unique_names = pd.Series(questions_companies_extended).drop_duplicates()
questions_companies_drop_duplicates = unique_names.to_list()

# One name per line, for a quick visual check.
questions_companies_drop_duplicates_str = "\n".join(questions_companies_drop_duplicates)
print(questions_companies_drop_duplicates_str)

Altech Chemicals Ltd
Cofinimmo
Hagerty, Inc.
Renold plc
Charles & Colvard, Ltd.
Harworth Group plc
Zymeworks Inc.
Lipocine Inc.
Winnebago Industries, Inc.
Audalia Resources Limited
Enerflex Ltd.
HV Bancorp, Inc.
Alien Metals Limited
Canadian Tire Corporation
LVMH
Johns Lyng Group Limited
Maxeon Solar Technologies, Ltd.
Nevro Corp.


In [12]:
# Expected shape of the model's JSON reply to the list-matching prompt.
# It is only used to validate the parsed reply in send_one_question.
# The field descriptions mirror the schema text embedded in `prompt1`;
# `result_list_position` previously carried a copy-pasted "Company name".
class ListContainsAnswer(BaseModel):
    result_name: str = Field(..., description="The company name found in list. If not found return N/A")
    result_list_position: Union[int, Literal["N/A"]] = Field(..., description="The position of the found company in a list. If not found return N/A")
    result_prob: float = Field(..., description="Probability that the result is the right one")

# The request text is assembled as: prompt1 + <company name> + prompt2.
# prompt1 carries the instructions and the ListContainsAnswer schema as text
# (keep it in sync with the class by hand).
# NOTE(review): the "empty array" sentence was copied from the extraction
# prompt and does not fit this schema, which has no array field and uses "N/A"
# for "not found". It also contains the typo "and empty array". The string is
# left untouched because it is sent to the model verbatim.
prompt1 = """Find this company name in the following list.
---
Output format:
The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
The output is a JSON that corresponds to the following pydantic class:

class ListContainsAnswer(BaseModel):
    result_name: str = Field(..., description="The company name found in list. If not found return N/A")
    result_list_position: Union[int, Literal["N/A"]] = Field(..., description="The position of the found company in a list. If not found return N/A")
    result_prob: float = Field(..., description="Probability that the result is the right one")

---
Company name to find:
"""

# prompt2 appends the candidate list: every company name of the subset, one per line.
prompt2 = """
---
List of companies:
""" + "\n".join(subset_companies_list)

async def send_one_question(company_i: int, company: str, prompt1: str, prompt2: str) -> Tuple[int, str, dict]:
    """Ask the model to locate one company name in the subset's company list.

    Sends ``prompt1 + company + prompt2`` as a single user message to the
    ``o1-mini`` chat model through the module-level ``client``. The reply is
    accepted only if it contains exactly two ``@@@`` markers and the text
    between them is JSON that validates against ``ListContainsAnswer``.

    Note: this definition replaces the earlier function of the same name that
    handled questions; the signatures differ.

    Args:
        company_i: Index of the company; returned unchanged so the caller can
            match the answer to its slot.
        company: Company name to look for.
        prompt1: Instruction text placed before the company name.
        prompt2: Text placed after the company name (the candidate list).

    Returns:
        ``(company_i, company, ans_d)``. On success ``ans_d`` is the parsed
        JSON dict. On failure it is
        ``{"result_name": <text>, "result_error": True}`` where ``<text>`` is
        the unusable reply (or a description of the transient API error); the
        caller uses ``"result_error"`` to retry.

    Raises:
        openai.OpenAIError: for non-transient API failures (for example bad
            credentials or an invalid request), which retrying cannot fix.
    """
    try:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt1 + company + prompt2,
                }
            ],
            model="o1-mini",
        )
    except (openai.APIConnectionError, openai.RateLimitError, openai.InternalServerError) as exc:
        # Transient failures go through the caller's retry path instead of
        # raising out of the task and aborting the whole batch.
        return company_i, company, {
            "result_name": f"{type(exc).__name__}: {exc}",
            "result_error": True,
        }

    # The SDK types `content` as optional; treat a missing body as an empty reply
    # so it is reported as a format error rather than crashing on `.split`.
    ans = chat_completion.choices[0].message.content or ""
    ans_l_splitted = ans.split('@@@')
    if len(ans_l_splitted) != 3:
        # Markers missing or repeated: hand back the raw reply for inspection.
        return company_i, company, {"result_name": ans, "result_error": True}

    ans_splitted = ans_l_splitted[1]
    try:
        ans_d = json.loads(ans_splitted)
        # Validation only; the raw dict is what gets returned.
        ListContainsAnswer(**ans_d)
    except Exception:
        # Invalid JSON, a non-object payload, or a schema mismatch.
        ans_d = {
            "result_name": ans_splitted,
            "result_error": True
        }
    return company_i, company, ans_d

tasks_done = ["0"]*len(questions_companies_drop_duplicates)
tasks_tries = {}
tasks = []
li(f"Sending for every company started")
for i, param1 in enumerate(questions_companies_drop_duplicates):
    tasks_tries[i] = 1
    tasks.append(asyncio.create_task(send_one_question(i, param1, prompt1, prompt2)))

tasks_run = []
while tasks:
    done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    for completed in done:
        i, param1, ans_d = await completed
        if "result_error" in ans_d:
            
            if tasks_tries[i] < 5:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Retrying")
                tasks_tries[i] = tasks_tries[i] + 1
                tasks.add(asyncio.create_task(send_one_question(i, param1, prompt1, prompt2)))
            else:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Stop retrying. Returning the current answer with error.")
                tasks_done[i] = "2"
                tasks_run.append((i, param1, ans_d))
                li("".join(tasks_done))
        else:
            tasks_done[i] = "1"
            tasks_run.append((i, param1, ans_d))
            li("".join(tasks_done))

2025-02-26T15:56:18 | INFO | Sending for every company started
2025-02-26T15:56:22 | INFO | 000000000000000100
2025-02-26T15:56:22 | INFO | 000100000000000100
2025-02-26T15:56:22 | INFO | 000100000000001100
2025-02-26T15:56:23 | INFO | 000100100000001100
2025-02-26T15:56:23 | INFO | 000100101000001100
2025-02-26T15:56:23 | INFO | 000110101000001100
2025-02-26T15:56:23 | INFO | 001110101000001100
2025-02-26T15:56:23 | INFO | 001110101000101100
2025-02-26T15:56:23 | INFO | 001110101001101100
2025-02-26T15:56:23 | INFO | 001110101001101101
2025-02-26T15:56:23 | INFO | 001110111001101101
2025-02-26T15:56:23 | INFO | 001111111001101101
2025-02-26T15:56:24 | INFO | 001111111001111101
2025-02-26T15:56:24 | INFO | 101111111001111101
2025-02-26T15:56:24 | INFO | 101111111101111101
2025-02-26T15:56:25 | INFO | 101111111111111101
2025-02-26T15:56:25 | INFO | 101111111111111111
2025-02-26T15:56:27 | INFO | 111111111111111111


In [13]:
# Results arrive in completion order; put them back into company order.
tasks_run_sorted = list(tasks_run)
tasks_run_sorted.sort(key=lambda record: record[0])

# Index the answers by the company name that was looked up.
temp_answers2 = {}
for _position, looked_up_name, match in tasks_run_sorted:
    temp_answers2[looked_up_name] = match
temp_answers2

{'Altech Chemicals Ltd': {'result_name': 'Altech Chemicals Ltd',
  'result_list_position': 14,
  'result_prob': 1.0},
 'Cofinimmo': {'result_name': 'Cofinimmo',
  'result_list_position': 26,
  'result_prob': 1.0},
 'Hagerty, Inc.': {'result_name': 'Hagerty, Inc.',
  'result_list_position': 28,
  'result_prob': 1.0},
 'Renold plc': {'result_name': 'Renold plc',
  'result_list_position': 34,
  'result_prob': 1.0},
 'Charles & Colvard, Ltd.': {'result_name': 'Charles & Colvard, Ltd.',
  'result_list_position': 33,
  'result_prob': 1.0},
 'Harworth Group plc': {'result_name': 'Harworth Group plc',
  'result_list_position': 37,
  'result_prob': 1.0},
 'Zymeworks Inc.': {'result_name': 'Zymeworks Inc.',
  'result_list_position': 16,
  'result_prob': 1.0},
 'Lipocine Inc.': {'result_name': 'Lipocine Inc.',
  'result_list_position': 30,
  'result_prob': 1.0},
 'Winnebago Industries, Inc.': {'result_name': 'Winnebago Industries, Inc.',
  'result_list_position': 18,
  'result_prob': 1.0},
 'Auda

In [14]:
# Map each name extracted from the questions to the name the model reported
# for it from the subset list.
map_dict = {}
for extracted_name, match in temp_answers2.items():
    map_dict[extracted_name] = match["result_name"]
map_dict

{'Altech Chemicals Ltd': 'Altech Chemicals Ltd',
 'Cofinimmo': 'Cofinimmo',
 'Hagerty, Inc.': 'Hagerty, Inc.',
 'Renold plc': 'Renold plc',
 'Charles & Colvard, Ltd.': 'Charles & Colvard, Ltd.',
 'Harworth Group plc': 'Harworth Group plc',
 'Zymeworks Inc.': 'Zymeworks Inc.',
 'Lipocine Inc.': 'Lipocine Inc.',
 'Winnebago Industries, Inc.': 'Winnebago Industries, Inc.',
 'Audalia Resources Limited': 'Audalia Resources Limited',
 'Enerflex Ltd.': 'Enerflex Ltd.',
 'HV Bancorp, Inc.': 'HV Bancorp, Inc.',
 'Alien Metals Limited': 'Alien Metals Limited',
 'Canadian Tire Corporation': 'Canadian Tire Corporation',
 'LVMH': 'LVMH',
 'Johns Lyng Group Limited': 'Johns Lyng Group Limited',
 'Maxeon Solar Technologies, Ltd.': 'Maxeon Solar Technologies, Ltd.',
 'Nevro Corp.': 'Nevro Corp.'}

In [15]:
# Tabular view of the subset records.
df_subset = pd.DataFrame(subset)

# Company name -> list of SHA1 hashes of its documents.
sha1_by_company = df_subset.groupby("company_name")["sha1"]
map_name2sha1 = sha1_by_company.apply(list).to_dict()
map_name2sha1

{'Advantage Solutions Inc.': ['ea4b61f74a604df117176e74b11f5780fb473a31'],
 'Alien Metals Limited': ['8dd1306c26c63913495fe81dde5180033b39fc44'],
 'Altech Chemicals Ltd': ['63688d5d0b4f12e9f847c5407439a1ec46047a4a'],
 'Ameresco, Inc.': ['f973dd219c534accb0d4e72d8e12f51284d48d10'],
 'Audalia Resources Limited': ['601aba58deffc81230c837404aa883de0d1dde1c'],
 'Canadian Tire Corporation': ['7c55d7900a241e732c145687598d43c915a678f9'],
 'CareTrust REIT, Inc.': ['cd9239df47b2f0addd9cdd50d0fab4494414bba5'],
 'Charles & Colvard, Ltd.': ['d3a834539b046a49708161a6c0d35aad29dd15ec'],
 'Cofinimmo': ['9cc771c2171bacc138cda4e7d68b8b427a514d81'],
 'Corbus Pharmaceuticals Holdings, Inc.': ['5ac3ccdec033f81fab4c2ed9ae86553f4904a450'],
 'Dunedin Enterprise Investment Trust PLC': ['9b3fa9062a804f0f5a5dbfa46309ed3354cd680b'],
 'EVI Industries, Inc.': ['54e625a049a1713eb5a338b64858c06d74e52489'],
 'Enerflex Ltd.': ['612279ce81b538da24834af47131cc73e8c01a80'],
 'Essential Metals Limited': ['49a9bf0542f3e5ff0

In [16]:
# One row per question: its text and kind, the company names extracted from it,
# the matching names from the subset, and the SHA1s of the documents to search.
df_temp_answers = pd.DataFrame(questions).rename(columns={"text": "q_text", "kind": "q_kind"})
df_temp_answers["company_names"] = questions_companies
df_temp_answers["company_names_subset"] = df_temp_answers["company_names"].apply(
    lambda names: [map_dict[name] for name in names]
)

# The matcher may answer "N/A" (the prompt allows it) or return a name that is
# not in the subset. Report such names once and give them no documents instead
# of failing with a KeyError.
names_without_documents = list(dict.fromkeys(
    name
    for names in df_temp_answers["company_names_subset"]
    for name in names
    if name not in map_name2sha1
))
if names_without_documents:
    lw(f"No subset document for matched company names: {names_without_documents}")
df_temp_answers["documents_to_look_through"] = df_temp_answers["company_names_subset"].apply(
    lambda names: [sha1 for name in names for sha1 in map_name2sha1.get(name, [])]
)

# Keep one copy with this run's logs and one in the shared temp folder, which
# is created here because nothing earlier in the notebook does so.
pathlib.Path("../data_temp").mkdir(parents=True, exist_ok=True)
df_temp_answers.to_csv(f"../notebooks_logging/find_files/{now_str}/04 temp_answers.csv", index=False)
df_temp_answers.to_csv("../data_temp/04 temp_answers.csv", index=False)
df_temp_answers.to_parquet("../data_temp/04 temp_answers.parquet", index=False)
df_temp_answers

Unnamed: 0,q_text,q_kind,company_names,company_names_subset,documents_to_look_through
0,"According to the annual report, what is the Op...",number,[Altech Chemicals Ltd],[Altech Chemicals Ltd],[63688d5d0b4f12e9f847c5407439a1ec46047a4a]
1,"According to the annual report, what is the Op...",number,[Cofinimmo],[Cofinimmo],[9cc771c2171bacc138cda4e7d68b8b427a514d81]
2,Did Cofinimmo outline any new ESG initiatives ...,boolean,[Cofinimmo],[Cofinimmo],[9cc771c2171bacc138cda4e7d68b8b427a514d81]
3,What is the total number of employees let go b...,number,"[Hagerty, Inc.]","[Hagerty, Inc.]",[a6f23184a87f3343f17e8e8ed08f604615cdefc1]
4,Which leadership **positions** changed at Reno...,names,[Renold plc],[Renold plc],[da8afefdc3840175c9f26a4dbbed05e250a342cc]
5,What was the Gross margin (%) for Charles & Co...,number,"[Charles & Colvard, Ltd.]","[Charles & Colvard, Ltd.]",[d3a834539b046a49708161a6c0d35aad29dd15ec]
6,What was the Capital expenditures (in GBP) for...,number,[Harworth Group plc],[Harworth Group plc],[ed55750eae9cb7d893e6484b92496639172717cd]
7,What was the Capital expenditures (in USD) for...,number,"[Charles & Colvard, Ltd.]","[Charles & Colvard, Ltd.]",[d3a834539b046a49708161a6c0d35aad29dd15ec]
8,What are the names of new products launched by...,names,[Zymeworks Inc.],[Zymeworks Inc.],[69b472e4c05986db72ff8e547bb220e2d222b7ef]
9,"For Lipocine Inc., what was the value of Numbe...",number,[Lipocine Inc.],[Lipocine Inc.],[c51f3c5aff7bea6fbc0bb7537838fa2f44f35c23]


In [17]:
# Record the question/company/document table in the run log at INFO level
# (it goes through both configured handlers).
li(df_temp_answers)

2025-02-26T15:56:27 | INFO |                                                q_text   q_kind  \
0   According to the annual report, what is the Op...   number   
1   According to the annual report, what is the Op...   number   
2   Did Cofinimmo outline any new ESG initiatives ...  boolean   
3   What is the total number of employees let go b...   number   
4   Which leadership **positions** changed at Reno...    names   
5   What was the Gross margin (%) for Charles & Co...   number   
6   What was the Capital expenditures (in GBP) for...   number   
7   What was the Capital expenditures (in USD) for...   number   
8   What are the names of new products launched by...    names   
9   For Lipocine Inc., what was the value of Numbe...   number   
10  According to the annual report, what is the To...   number   
11  For Lipocine Inc., what was the value of Value...   number   
12  According to the annual report, what is the Op...   number   
13  According to the annual report, what is the

In [18]:
# Flatten the per-question document lists; a SHA1 appears once per question
# that refers to it, so repeats are expected.
sha_pdfs_used = []
for documents_of_question in df_temp_answers["documents_to_look_through"].to_list():
    sha_pdfs_used.extend(documents_of_question)
li(f"sha_pdfs_used={sha_pdfs_used}")

2025-02-26T15:56:27 | INFO | sha_pdfs_used=['63688d5d0b4f12e9f847c5407439a1ec46047a4a', '9cc771c2171bacc138cda4e7d68b8b427a514d81', '9cc771c2171bacc138cda4e7d68b8b427a514d81', 'a6f23184a87f3343f17e8e8ed08f604615cdefc1', 'da8afefdc3840175c9f26a4dbbed05e250a342cc', 'd3a834539b046a49708161a6c0d35aad29dd15ec', 'ed55750eae9cb7d893e6484b92496639172717cd', 'd3a834539b046a49708161a6c0d35aad29dd15ec', '69b472e4c05986db72ff8e547bb220e2d222b7ef', 'c51f3c5aff7bea6fbc0bb7537838fa2f44f35c23', '7820b6e9487202b30f2883a6df91ae76f9461f2f', 'c51f3c5aff7bea6fbc0bb7537838fa2f44f35c23', '601aba58deffc81230c837404aa883de0d1dde1c', 'c51f3c5aff7bea6fbc0bb7537838fa2f44f35c23', 'd3a834539b046a49708161a6c0d35aad29dd15ec', '612279ce81b538da24834af47131cc73e8c01a80', '69a9dcb0bb6a46e2ff9f969d035e1774a2d49ef1', '8dd1306c26c63913495fe81dde5180033b39fc44', 'da8afefdc3840175c9f26a4dbbed05e250a342cc', 'c51f3c5aff7bea6fbc0bb7537838fa2f44f35c23', '9cc771c2171bacc138cda4e7d68b8b427a514d81', '7c55d7900a241e732c145687598d43c

In [20]:
# Flag the documents that at least one question refers to.
df_subset["is_in_questions"] = df_subset["sha1"].isin(sha_pdfs_used)

# Save CSV and Parquet copies next to this run's logs, then in the shared temp folder.
run_csv_path = f"../notebooks_logging/find_files/{now_str}/04 subset.csv"
df_subset.to_csv(run_csv_path, index=False)
df_subset.to_parquet(f"../notebooks_logging/find_files/{now_str}/04 subset.parquet", index=False)
df_subset.to_csv("../data_temp/04 subset.csv", index=False)
df_subset.to_parquet("../data_temp/04 subset.parquet", index=False)

# Leave a pointer to the per-run CSV in the log.
li(run_csv_path)
df_subset

2025-02-26T15:58:03 | INFO | ../notebooks_logging/find_files/20250226_155603/04 subset.csv


Unnamed: 0,sha1,cur,company_name,major_industry,mentions_recent_mergers_and_acquisitions,has_leadership_changes,has_layoffs,has_executive_compensation,has_rnd_investment_numbers,has_new_product_launches,...,has_dividend_policy_changes,has_share_buyback_plans,has_capital_structure_changes,mentions_new_risk_factors,has_guidance_updates,has_regulatory_or_litigation_issues,has_strategic_restructuring,has_supply_chain_disruptions,has_esg_initiatives,is_in_questions
0,0a9e39e1d2e176f3a766a0e86af82772f1654a6e,USD,KeyCorp,Financial Services,True,True,False,True,True,True,...,True,True,False,True,True,True,False,False,True,False
1,20c4badd4303f7aba1a298d84be3722fa84e0c67,USD,"Summit Materials, Inc.",Financial Services,True,True,False,True,False,False,...,False,True,False,True,False,False,False,True,True,False
2,30b729c124a24ff21f37431dab6b58dfe7ba56fa,USD,Gibson Energy Inc.,Energy and Utilities,False,False,False,True,False,False,...,True,True,False,False,False,True,False,False,False,False
3,42f03832077a92b3b34855cb7f9ef93563143838,EUR,HCW Biologics Inc.,Healthcare,True,False,False,True,True,True,...,False,False,False,True,True,True,False,False,False,False
4,49a9bf0542f3e5ff0250064bfed4369ecf6c8a09,USD,Essential Metals Limited,Technology,True,False,False,True,True,False,...,False,False,False,True,False,False,False,False,True,False
5,54e625a049a1713eb5a338b64858c06d74e52489,USD,"EVI Industries, Inc.",Retail,True,False,False,True,False,False,...,False,False,False,True,False,False,False,True,False,False
6,56262704bde6b584dcebfa644faa23a953498a79,EUR,Koenig & Bauer AG,Technology,True,True,False,True,True,True,...,False,False,False,True,True,False,True,True,True,False
7,58b196e02c2d9749d968a29039e9c2b29d3d31e1,USD,Microchip Technology Incorporated,Technology,True,True,False,True,True,True,...,False,True,True,True,True,True,False,True,True,False
8,5ac3ccdec033f81fab4c2ed9ae86553f4904a450,EUR,"Corbus Pharmaceuticals Holdings, Inc.",Pharmaceuticals,True,False,False,True,True,True,...,False,False,False,True,True,True,False,False,False,False
9,601aba58deffc81230c837404aa883de0d1dde1c,USD,Audalia Resources Limited,Technology,False,True,False,True,True,True,...,False,False,False,True,False,False,False,False,False,True
