In [1]:
import openai
import pandas as pd
import tqdm

import picologging as logging

from pydantic import BaseModel, Field

from typing import Optional, List, Union, Literal, Tuple
import json
import datetime
import pathlib
import asyncio

In [2]:
# Read the OpenAI API key from a local file kept next to the project.
# The value is passed verbatim as `api_key` when the client is created, so
# surrounding whitespace (e.g. the trailing newline most editors append) is
# stripped here to keep it out of the key.
with open("../tokens/openai_token.txt", encoding="utf-8") as f:
    openai_token = f.read().strip()

In [3]:
# Timestamp that names both this run's log file and its artefact directory.
now = datetime.datetime.now()
now_str = now.strftime("%Y%m%d_%H%M%S")

# Make sure the shared log directory and the per-run directory both exist.
for run_dir in (
    "../notebooks_logging/find_files",
    f"../notebooks_logging/find_files/{now_str}",
):
    pathlib.Path(run_dir).mkdir(parents=True, exist_ok=True)

# Every record goes to the run's log file and to the notebook output stream;
# both handlers share the format configured on the root logger below.
stream_handler = logging.StreamHandler()
file_handler = logging.FileHandler(f"../notebooks_logging/find_files/{now_str}.log")
logging.basicConfig(
    handlers=[file_handler, stream_handler],
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S',
    encoding='utf-8',
    force=True,
)

# Short aliases used throughout the notebook: li = info, lw = warning.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
li, lw = logger.info, logger.warning
li("start")
lw("check warning")

2025-02-27T15:37:21 | INFO | start


In [4]:
class Question(BaseModel):
    """One entry of the questions file: the question text and its kind."""
    # Full question text.
    text: str
    # One of the four question kinds; the same set is used by Answer.kind.
    kind: Literal["number", "name", "boolean", "names"]

class SourceReference(BaseModel):
    """Pointer to a page of a source PDF: the file's SHA1 hash plus a page index."""
    pdf_sha1: str = Field(..., description="SHA1 hash of the PDF file")
    page_index: int = Field(..., description="Physical page number in the PDF file")

class Answer(BaseModel):
    """Answer to a single question, optionally echoing the question and listing sources."""
    question_text: Optional[str] = Field(None, description="Text of the question")
    kind: Optional[Literal["number", "name", "boolean", "names"]] = Field(None, description="Kind of the question")
    # "N/A" is itself a str, so the Literal member overlaps with the plain str member of the union.
    value: Union[float, str, bool, List[str], Literal["N/A"]] = Field(..., description="Answer to the question, according to the question schema")
    # NOTE(review): the default is a list literal; pydantic is expected to copy field
    # defaults per instance — confirm, or switch to default_factory=list.
    references: List[SourceReference] = Field([], description="References to the source material in the PDF file")

class AnswerSubmission(BaseModel):
    """Top-level submission: team e-mail, submission name and the list of answers."""
    team_email: str = Field(..., description="Email that your team used to register for the challenge")
    submission_name: str = Field(..., description="Unique name of the submission (e.g. experiment name)")
    answers: List[Answer] = Field(..., description="List of answers to the questions")

In [5]:
# Load the questions, validate every entry against the Question model,
# and log the first few raw entries as a sanity check.
li("Questions:")
with open("../data_in/questions.json") as f:
    questions = json.load(f)
questions_typed = [Question(**raw_question) for raw_question in questions]
for sample_question in questions[:5]:
    li(sample_question)

# Load the subset description (one record per document) and log a sample.
li("Subset:")
with open("../data_in/subset.json") as f:
    subset = json.load(f)
for sample_record in subset[:5]:
    li(sample_record)

2025-02-27T15:37:23 | INFO | Questions:
2025-02-27T15:37:23 | INFO | {'text': "For Ziff Davis, Inc., what was the value of Cloud storage capacity (TB) at the end of the period listed in annual report? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-27T15:37:23 | INFO | {'text': 'Did Liberty Broadband Corporation announce a share buyback plan in the annual report? If there is no mention, return False.', 'kind': 'boolean'}
2025-02-27T15:37:23 | INFO | {'text': "What is the total number of employees let go by Pintec Technology Holdings Limited according to the annual report? If data is not available, return 'N/A'.", 'kind': 'number'}
2025-02-27T15:37:23 | INFO | {'text': "Which leadership positions changed at Westwater Resources, Inc. in the reporting period? If data is not available, return 'N/A'. Give me the title of the position.", 'kind': 'names'}
2025-02-27T15:37:23 | INFO | {'text': 'Did Brave Bison Group plc mention any mergers or acquisitions in the annual repo

In [6]:
# Collect the company name of every subset record, then log the full list
# between two separator lines.
li("Company names:")
li("---")
subset_companies_list = list(map(lambda record: record["company_name"], subset))
for company_name in subset_companies_list:
    li(company_name)
li("---")

2025-02-27T15:37:27 | INFO | Company names:
2025-02-27T15:37:27 | INFO | ---
2025-02-27T15:37:27 | INFO | ACRES Commercial Realty Corp.
2025-02-27T15:37:27 | INFO | Aptevo Therapeutics Inc.
2025-02-27T15:37:27 | INFO | Downer EDI Limited
2025-02-27T15:37:27 | INFO | Odyssey Gold Limited
2025-02-27T15:37:27 | INFO | NextNav Inc.
2025-02-27T15:37:27 | INFO | Peako Limited
2025-02-27T15:37:27 | INFO | Mosaic Brands Limited
2025-02-27T15:37:27 | INFO | Aurora Innovation, Inc.
2025-02-27T15:37:27 | INFO | Crombie REIT
2025-02-27T15:37:27 | INFO | Medallion Financial Corp.
2025-02-27T15:37:27 | INFO | Enact Holdings, Inc.
2025-02-27T15:37:27 | INFO | BetMakers Technology Group Ltd
2025-02-27T15:37:27 | INFO | OFX Group Limited
2025-02-27T15:37:27 | INFO | FNCB Bancorp, Inc.
2025-02-27T15:37:27 | INFO | Celldex Therapeutics, Inc.
2025-02-27T15:37:27 | INFO | SIG plc
2025-02-27T15:37:27 | INFO | Motability Operations Group plc
2025-02-27T15:37:27 | INFO | BCB Bancorp, Inc.
2025-02-27T15:37:27 

In [7]:
# Async OpenAI client shared by every request in this notebook; the API key
# is the one read from the token file and is passed explicitly.
client = openai.AsyncOpenAI(api_key=openai_token)

In [8]:
import random

class CompanyAnswer(BaseModel):
    """Schema the model's reply must satisfy when extracting company names from a question."""
    result: List[str] = Field(..., description="List of company names")
    result_prob: float = Field(..., description="Probability that the result is the right one")



# Instruction text for the company-name extraction request. The question text
# is appended directly after the trailing "Question:" line. The reply is
# expected to wrap a JSON object between two "@@@" markers; the schema shown to
# the model mirrors the CompanyAnswer class.
# NOTE(review): "should be and empty array" looks like a typo for "an empty
# array"; left untouched because the string is sent to the model as-is.
prompt = """Find company names in this question. There could be zero or more company names.
---
Output format:
The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
The output is a JSON that corresponds to the following pydantic class:

class CompanyAnswer(BaseModel):
    result: List[str] = Field(..., description="List of company names")
    result_prob: float = Field(..., description="Probability that the result is the right one")

---
Question:
"""

async def send_one_question(question_i: int, question: str, prompt: str) -> Tuple[int, str, dict]:
    """Ask the model which company names appear in one question.

    Sends ``prompt + question`` as a single user message to the "o1-mini"
    model and expects the reply to contain a JSON object wrapped between two
    ``@@@`` markers that validates against ``CompanyAnswer``.

    Args:
        question_i: Index of the question; returned unchanged so the caller
            can match the answer to its question.
        question: Question text appended to the prompt.
        prompt: Instruction text placed before the question.

    Returns:
        ``(question_i, question, ans_d)``. On success ``ans_d`` is the parsed
        JSON dict. On any failure (request error, reply without exactly two
        ``@@@`` markers, invalid JSON, schema mismatch) ``ans_d`` is
        ``{"result": <raw text or error repr>, "result_error": True}`` so the
        caller can decide whether to retry.

    Note:
        A function with the same name is defined again later in the notebook
        and replaces this one once that cell runs.
    """
    try:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt + question,
                }
            ],
            model="o1-mini",
        )
    except openai.OpenAIError as exc:
        # Report a failed request the same way as an unparsable reply, so the
        # caller's retry logic handles it instead of the whole batch aborting.
        return question_i, question, {"result": repr(exc), "result_error": True}

    # The message content can be missing; treat that as an empty reply rather
    # than crashing on None.split(...).
    ans = chat_completion.choices[0].message.content or ""

    # Exactly two markers split the reply into three pieces; the middle one
    # is the JSON payload.
    ans_l_splitted = ans.split('@@@')
    if len(ans_l_splitted) != 3:
        return question_i, question, {"result": ans, "result_error": True}

    ans_splitted = ans_l_splitted[1]
    try:
        ans_d = json.loads(ans_splitted)
        # Validation only: the raw dict (not the model instance) is returned.
        CompanyAnswer(**ans_d)
    except (ValueError, TypeError):
        # ValueError covers invalid JSON and pydantic validation errors;
        # TypeError covers a payload that is not a JSON object.
        ans_d = {
            "result": ans_splitted,
            "result_error": True
        }
    return question_i, question, ans_d

tasks_done = ["0"]*len(questions)
tasks_tries = {}
tasks = set()
li(f"Sending for every question started")
for i, question_d in enumerate(questions):
    tasks_tries[i] = 1
    tasks.add(asyncio.create_task(send_one_question(i, question_d["text"], prompt)))

tasks_run = []
while tasks:
    done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    for completed in done:
        i, param1, ans_d = await completed
        if "result_error" in ans_d:
            
            if tasks_tries[i] < 5:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Retrying")
                tasks_tries[i] = tasks_tries[i] + 1
                tasks.add(asyncio.create_task(send_one_question(i, param1, prompt)))
            else:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Stop retrying. Returning the current answer with error.")
                tasks_done[i] = "2"
                tasks_run.append((i, param1, ans_d))
                li("".join(tasks_done))
        else:
            tasks_done[i] = "1"
            tasks_run.append((i, param1, ans_d))
            li("".join(tasks_done))

2025-02-27T15:37:32 | INFO | Sending for every question started
2025-02-27T15:37:36 | INFO | 0000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
2025-02-27T15:37:36 | INFO | 0000000100000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000
2025-02-27T15:37:36 | INFO | 0000000100000000000000000000000000000000000000000000000000000100000000000000000000100000000000000000
2025-02-27T15:37:36 | INFO | 0000000100000000000000000000000000000000000000000000000000000100000001000000000000100000000000000000
2025-02-27T15:37:36 | INFO | 1000000100000000000000000000000000000000000000000000000000000100000001000000000000100000000000000000
2025-02-27T15:37:36 | INFO | 1000000100000000000000000000000000000000000000100000000000000100000001000000000000100000000000000000
2025-02-27T15:37:36 | INFO | 1000000100000010000000000000000000000000000000100000000000000100000001000000000000100000000000000000
2025-02-27T15:37:36 | INFO

In [9]:
# Tasks finish in arbitrary order; put the results back into question order
# and keep only the answer dict of each (index, question, answer) triple.
tasks_run_sorted = sorted(tasks_run, key=lambda item: item[0])
temp_answers = [item[2] for item in tasks_run_sorted]
temp_answers

[{'result': ['Ziff Davis, Inc.'], 'result_prob': 0.99},
 {'result': ['Liberty Broadband Corporation'], 'result_prob': 1.0},
 {'result': ['Pintec Technology Holdings Limited'], 'result_prob': 1.0},
 {'result': ['Westwater Resources, Inc.'], 'result_prob': 1.0},
 {'result': ['Brave Bison Group plc'], 'result_prob': 1.0},
 {'result': ['Sonic Automotive, Inc.'], 'result_prob': 1.0},
 {'result': ['Poste Italiane'], 'result_prob': 1.0},
 {'result': ['MGM Resorts International'], 'result_prob': 1.0},
 {'result': ['INMUNE BIO INC.'], 'result_prob': 1.0},
 {'result': ['BetMakers Technology Group Ltd'], 'result_prob': 0.99},
 {'result': ['Franklin Covey Co.'], 'result_prob': 1.0},
 {'result': ['Downer EDI Limited'], 'result_prob': 1.0},
 {'result': ['Armadale Capital Plc'], 'result_prob': 1.0},
 {'result': ['AA Limited'], 'result_prob': 1.0},
 {'result': ['Franklin Covey Co.'], 'result_prob': 0.98},
 {'result': ['Ocugen, Inc.'], 'result_prob': 0.99},
 {'result': ['Bionano Genomics, Inc.'], 'resu

In [10]:
# Per-question list of extracted company names.
# Answers that still carry "result_error" after all retries hold the raw model
# reply (a string) in "result"; using that string as a list of names would
# make the later flattening step iterate over its characters. Such questions
# get an empty company list instead.
questions_companies = [
    [] if answer.get("result_error") else answer["result"]
    for answer in temp_answers
]
questions_companies

[['Ziff Davis, Inc.'],
 ['Liberty Broadband Corporation'],
 ['Pintec Technology Holdings Limited'],
 ['Westwater Resources, Inc.'],
 ['Brave Bison Group plc'],
 ['Sonic Automotive, Inc.'],
 ['Poste Italiane'],
 ['MGM Resorts International'],
 ['INMUNE BIO INC.'],
 ['BetMakers Technology Group Ltd'],
 ['Franklin Covey Co.'],
 ['Downer EDI Limited'],
 ['Armadale Capital Plc'],
 ['AA Limited'],
 ['Franklin Covey Co.'],
 ['Ocugen, Inc.'],
 ['Bionano Genomics, Inc.'],
 ['Seiko Epson Corporation'],
 ['MGM Resorts International'],
 ['NZME Limited'],
 ['Incyte Corporation'],
 ['Aurora Innovation, Inc.'],
 ['Datalogic',
  'Terns Pharmaceuticals, Inc.',
  'Incyte Corporation',
  'INMUNE BIO INC.',
  'Duni Group'],
 ['Downer EDI Limited'],
 ['Atreca, Inc.', 'Poste Italiane', 'Datalogic', 'NuCana plc', 'RWE AG'],
 ['Elixir Energy Limited'],
 ['Kiniksa Pharmaceuticals, Ltd.'],
 ['CoreCard Corporation'],
 ['HCA Healthcare, Inc.'],
 ['Datalogic'],
 ['Incitec Pivot Limited'],
 ['Franklin Covey Co.'],


In [11]:
# Flatten the per-question name lists into one list ...
questions_companies_extended = []
for names_in_question in questions_companies:
    questions_companies_extended.extend(names_in_question)
# ... and drop repeats while keeping first-occurrence order.
questions_companies_drop_duplicates = list(dict.fromkeys(questions_companies_extended))
questions_companies_drop_duplicates_str = "\n".join(questions_companies_drop_duplicates)
print(questions_companies_drop_duplicates_str)

Ziff Davis, Inc.
Liberty Broadband Corporation
Pintec Technology Holdings Limited
Westwater Resources, Inc.
Brave Bison Group plc
Sonic Automotive, Inc.
Poste Italiane
MGM Resorts International
INMUNE BIO INC.
BetMakers Technology Group Ltd
Franklin Covey Co.
Downer EDI Limited
Armadale Capital Plc
AA Limited
Ocugen, Inc.
Bionano Genomics, Inc.
Seiko Epson Corporation
NZME Limited
Incyte Corporation
Aurora Innovation, Inc.
Datalogic
Terns Pharmaceuticals, Inc.
Duni Group
Atreca, Inc.
NuCana plc
RWE AG
Elixir Energy Limited
Kiniksa Pharmaceuticals, Ltd.
CoreCard Corporation
HCA Healthcare, Inc.
Incitec Pivot Limited
Wheeler Real Estate Investment Trust, Inc.
archTIS Limited
Guaranty Bancshares, Inc.
Peako Limited
Medallion Financial Corp.
KP Tissue Inc.
Blue Apron Holdings, Inc.
Ritchie Bros. Auctioneers Incorporated
Albany International Corp.
ACRES Commercial Realty Corp.
1-800-FLOWERS.COM, INC.
Origin Bancorp, Inc.
Commerzbank
Rectifier Technologies Ltd
Playtech plc
SThree plc
SIG plc

In [12]:
class ListContainsAnswer(BaseModel):
    """Schema the model's reply must satisfy when looking a company name up in the subset list."""
    # Descriptions match the schema text shown to the model in the lookup prompt;
    # the position field previously carried a copy-pasted "Company name" description.
    result_name: str = Field(..., description="The company name found in list. If not found return N/A")
    result_list_position: Union[int, Literal["N/A"]] = Field(..., description="The position of the found company in a list. If not found return N/A")
    result_prob: float = Field(..., description="Probability that the result is the right one")

# First half of the lookup request: instructions and output schema. The
# company name to look up is appended directly after the trailing
# "Company name to find:" line.
# NOTE(review): the "result should be and empty array" sentence is identical to
# the one in the extraction prompt and does not match this schema, which has
# no array field — presumably a copy-paste leftover; left untouched because the
# string is sent to the model as-is.
prompt1 = """Find this company name in the following list.
---
Output format:
The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be and empty array.
The output is a JSON that corresponds to the following pydantic class:

class ListContainsAnswer(BaseModel):
    result_name: str = Field(..., description="The company name found in list. If not found return N/A")
    result_list_position: Union[int, Literal["N/A"]] = Field(..., description="The position of the found company in a list. If not found return N/A")
    result_prob: float = Field(..., description="Probability that the result is the right one")

---
Company name to find:
"""

# Second half of the lookup request: the subset's company names, one per line,
# placed after the company name being looked up.
prompt2 = """
---
List of companies:
""" + "\n".join(subset_companies_list)

async def send_one_question(company_i: int, company: str, prompt1: str, prompt2: str) -> Tuple[int, str, dict]:
    """Ask the model to locate one company name in the subset's company list.

    Sends ``prompt1 + company + prompt2`` as a single user message to the
    "o1-mini" model and expects the reply to contain a JSON object wrapped
    between two ``@@@`` markers that validates against ``ListContainsAnswer``.

    Args:
        company_i: Index of the company; returned unchanged so the caller can
            match the answer to its request.
        company: Company name to look up.
        prompt1: Instruction text placed before the company name.
        prompt2: Text placed after the company name (the list to search).

    Returns:
        ``(company_i, company, ans_d)``. On success ``ans_d`` is the parsed
        JSON dict. On any failure (request error, reply without exactly two
        ``@@@`` markers, invalid JSON, schema mismatch) ``ans_d`` is
        ``{"result_name": <raw text or error repr>, "result_error": True}`` so
        the caller can decide whether to retry.

    Note:
        This redefines the earlier ``send_one_question`` of the notebook,
        which takes a different argument list.
    """
    try:
        chat_completion = await client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt1 + company + prompt2,
                }
            ],
            model="o1-mini",
        )
    except openai.OpenAIError as exc:
        # Report a failed request the same way as an unparsable reply, so the
        # caller's retry logic handles it instead of the whole batch aborting.
        return company_i, company, {"result_name": repr(exc), "result_error": True}

    # The message content can be missing; treat that as an empty reply rather
    # than crashing on None.split(...).
    ans = chat_completion.choices[0].message.content or ""

    # Exactly two markers split the reply into three pieces; the middle one
    # is the JSON payload.
    ans_l_splitted = ans.split('@@@')
    if len(ans_l_splitted) != 3:
        return company_i, company, {"result_name": ans, "result_error": True}

    ans_splitted = ans_l_splitted[1]
    try:
        ans_d = json.loads(ans_splitted)
        # Validation only: the raw dict (not the model instance) is returned.
        ListContainsAnswer(**ans_d)
    except (ValueError, TypeError):
        # ValueError covers invalid JSON and pydantic validation errors;
        # TypeError covers a payload that is not a JSON object.
        ans_d = {
            "result_name": ans_splitted,
            "result_error": True
        }
    return company_i, company, ans_d

tasks_done = ["0"]*len(questions_companies_drop_duplicates)
tasks_tries = {}
tasks = []
li(f"Sending for every company started")
for i, param1 in enumerate(questions_companies_drop_duplicates):
    tasks_tries[i] = 1
    tasks.append(asyncio.create_task(send_one_question(i, param1, prompt1, prompt2)))

tasks_run = []
while tasks:
    done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    for completed in done:
        i, param1, ans_d = await completed
        if "result_error" in ans_d:
            
            if tasks_tries[i] < 5:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Retrying")
                tasks_tries[i] = tasks_tries[i] + 1
                tasks.add(asyncio.create_task(send_one_question(i, param1, prompt1, prompt2)))
            else:
                lw(f"Task {i} failed. Total retries: {tasks_tries[i]}. Stop retrying. Returning the current answer with error.")
                tasks_done[i] = "2"
                tasks_run.append((i, param1, ans_d))
                li("".join(tasks_done))
        else:
            tasks_done[i] = "1"
            tasks_run.append((i, param1, ans_d))
            li("".join(tasks_done))

2025-02-27T15:38:34 | INFO | Sending for every company started
2025-02-27T15:38:39 | INFO | 000000000000000000010000000000000000000000000000000000000000
2025-02-27T15:38:39 | INFO | 000000000000000000010000000000000000000000000000000000100000
2025-02-27T15:38:40 | INFO | 000000000000000000010000000000000000000000000000000100100000
2025-02-27T15:38:40 | INFO | 000000000000000000010000000000000000000000000000000100101000
2025-02-27T15:38:40 | INFO | 000000000001000000010000000000000000000000000000000100101000
2025-02-27T15:38:41 | INFO | 000000000001000000010000000000000000000000100000000100101000
2025-02-27T15:38:41 | INFO | 000000000001000000010000000000000010000000100000000100101000
2025-02-27T15:38:41 | INFO | 000000000001000000010000000000000010000000100000000110101000
2025-02-27T15:38:42 | INFO | 000000000001000000010000000000000010000000100000000110101010
2025-02-27T15:38:42 | INFO | 000010000001000000010000000000000010000000100000000110101010
2025-02-27T15:38:42 | INFO | 00001000

In [13]:
# Restore request order, then index each lookup answer by the company name
# that was asked for.
tasks_run_sorted = sorted(tasks_run, key=lambda item: item[0])
temp_answers2 = dict((item[1], item[2]) for item in tasks_run_sorted)
temp_answers2

{'Ziff Davis, Inc.': {'result_name': 'Ziff Davis, Inc.',
  'result_list_position': 94,
  'result_prob': 1.0},
 'Liberty Broadband Corporation': {'result_name': 'Liberty Broadband Corporation',
  'result_list_position': 27,
  'result_prob': 1.0},
 'Pintec Technology Holdings Limited': {'result_name': 'Pintec Technology Holdings Limited',
  'result_list_position': 62,
  'result_prob': 1.0},
 'Westwater Resources, Inc.': {'result_name': 'Westwater Resources, Inc.',
  'result_list_position': 56,
  'result_prob': 1.0},
 'Brave Bison Group plc': {'result_name': 'Brave Bison Group plc',
  'result_list_position': 81,
  'result_prob': 1.0},
 'Sonic Automotive, Inc.': {'result_name': 'Sonic Automotive, Inc.',
  'result_list_position': 42,
  'result_prob': 1.0},
 'Poste Italiane': {'result_name': 'Poste Italiane',
  'result_list_position': 73,
  'result_prob': 1.0},
 'MGM Resorts International': {'result_name': 'MGM Resorts International',
  'result_list_position': 83,
  'result_prob': 1.0},
 'IN

In [14]:
# Name as extracted from a question -> name the lookup step reported for it.
map_dict = {
    asked_name: lookup["result_name"]
    for asked_name, lookup in temp_answers2.items()
}
map_dict

{'Ziff Davis, Inc.': 'Ziff Davis, Inc.',
 'Liberty Broadband Corporation': 'Liberty Broadband Corporation',
 'Pintec Technology Holdings Limited': 'Pintec Technology Holdings Limited',
 'Westwater Resources, Inc.': 'Westwater Resources, Inc.',
 'Brave Bison Group plc': 'Brave Bison Group plc',
 'Sonic Automotive, Inc.': 'Sonic Automotive, Inc.',
 'Poste Italiane': 'Poste Italiane',
 'MGM Resorts International': 'MGM Resorts International',
 'INMUNE BIO INC.': 'INMUNE BIO INC.',
 'BetMakers Technology Group Ltd': 'BetMakers Technology Group Ltd',
 'Franklin Covey Co.': 'Franklin Covey Co.',
 'Downer EDI Limited': 'Downer EDI Limited',
 'Armadale Capital Plc': 'Armadale Capital Plc',
 'AA Limited': 'AA Limited',
 'Ocugen, Inc.': 'Ocugen, Inc.',
 'Bionano Genomics, Inc.': 'Bionano Genomics, Inc.',
 'Seiko Epson Corporation': 'Seiko Epson Corporation',
 'NZME Limited': 'NZME Limited',
 'Incyte Corporation': 'Incyte Corporation',
 'Aurora Innovation, Inc.': 'Aurora Innovation, Inc.',
 'Data

In [15]:
# Subset records as a frame, plus a lookup from company name to the list of
# sha1 values recorded for that company.
df_subset = pd.DataFrame(subset)
map_name2sha1 = (
    df_subset
    .groupby("company_name")["sha1"]
    .apply(list)
    .to_dict()
)
map_name2sha1

{'1-800-FLOWERS.COM, INC.': ['30f64d1043f4cb425eb636763580ae27094ffef1'],
 'AA Limited': ['aa781901e117281bfee6f8e4bea6fc9c9bada62e'],
 'ACRES Commercial Realty Corp.': ['0279901b645e568591ad95dac2c2bf939ef0c00d'],
 'ARCA Biopharma, Inc.': ['a533b44fe091a8aa62d1fa4f0a3262b311df8bd9'],
 'Aeeris Ltd': ['f6a1077a46cb36761604f8f4840698e22837f5f7'],
 'Albany International Corp.': ['da663e46fbf02ec8a90b3f3c1079ef4c9f7907e1'],
 'Alcoa Corporation': ['9af0a7f8db3954c3e74851c8c1807a09ece8ef00'],
 'Aptevo Therapeutics Inc.': ['0981826b4b43a88920f3e01c71ae73539bab84cc'],
 'Aptiv PLC': ['957f31e694198a1695d14aa512a972ab6f3c55b5'],
 'Arcadia Minerals Limited': ['b33bd2220c0fc0939ef08b1bdd8d2ebea158aa59'],
 'Armadale Capital Plc': ['a85dba6c75031912d56a811637f803ba4ddeb257'],
 'AstraZeneca': ['5bb5f71d1457f3272adab97d64a79c300da75cea'],
 'Atreca, Inc.': ['5f226fe96206888930e3baaf0bff70d4b0a1db40'],
 'Aurora Innovation, Inc.': ['13999998018cc53440310d94a26d1e8957e2277f'],
 'BCB Bancorp, Inc.': ['2fd7

In [17]:
pathlib.Path("../data_temp/").mkdir(exist_ok=True, parents=True)

# One row per question: text, kind and the company names extracted from it.
df_temp_answers = pd.DataFrame(questions).rename(columns={"text": "q_text", "kind": "q_kind"})
df_temp_answers["company_names"] = questions_companies

# The lookup step may report "N/A" (or any other name that is not a subset
# company). Indexing map_dict / map_name2sha1 with such a name would raise
# KeyError and lose the whole table, so those names are logged and skipped.
unmatched_names = sorted({
    name
    for names in questions_companies
    for name in names
    if map_dict.get(name) not in map_name2sha1
})
if unmatched_names:
    lw(f"Company names without a matching subset document (skipped): {unmatched_names}")

# Question names translated to subset names, keeping only the resolvable ones.
df_temp_answers["company_names_subset"] = df_temp_answers["company_names"].apply(
    lambda names: [map_dict[name] for name in names if map_dict.get(name) in map_name2sha1]
)
# Every sha1 recorded for the matched companies of each question.
df_temp_answers["documents_to_look_through"] = df_temp_answers["company_names_subset"].apply(
    lambda names: [sha1 for name in names for sha1 in map_name2sha1[name]]
)

# Keep a copy with this run's artefacts and a "latest" copy in data_temp.
df_temp_answers.to_csv(f"../notebooks_logging/find_files/{now_str}/04 temp_answers.csv", index=False)
df_temp_answers.to_csv("../data_temp/04 temp_answers.csv", index=False)
df_temp_answers.to_parquet("../data_temp/04 temp_answers.parquet", index=False)
df_temp_answers

Unnamed: 0,q_text,q_kind,company_names,company_names_subset,documents_to_look_through
0,"For Ziff Davis, Inc., what was the value of Cl...",number,"[Ziff Davis, Inc.]","[Ziff Davis, Inc.]",[ecabab4934d4b80570c4bb3b8e35b7476694b3fb]
1,Did Liberty Broadband Corporation announce a s...,boolean,[Liberty Broadband Corporation],[Liberty Broadband Corporation],[446545ae548543d8744f8d885ff75face3424ba4]
2,What is the total number of employees let go b...,number,[Pintec Technology Holdings Limited],[Pintec Technology Holdings Limited],[9e794a58e511f6a6a9a13b201d652deff9f9f69a]
3,Which leadership positions changed at Westwate...,names,"[Westwater Resources, Inc.]","[Westwater Resources, Inc.]",[92d9de8e4db96e0b95a484afcd1c54c6beb62c03]
4,Did Brave Bison Group plc mention any mergers ...,boolean,[Brave Bison Group plc],[Brave Bison Group plc],[ddd10e4612006205c4b1ba050a11648071e6e429]
...,...,...,...,...,...
95,"According to the annual report, what is the Ca...",number,[James Halstead plc],[James Halstead plc],[71d137454a1524843e1f49b34603438510232919]
96,What was the value of End-of-year tech staff h...,number,[archTIS Limited],[archTIS Limited],[c06d5ad4b6408fec26675d30b37a6042c007095a]
97,"For Westwater Resources, Inc., what was the va...",number,"[Westwater Resources, Inc.]","[Westwater Resources, Inc.]",[92d9de8e4db96e0b95a484afcd1c54c6beb62c03]
98,Which leadership positions changed at Origin B...,names,"[Origin Bancorp, Inc.]","[Origin Bancorp, Inc.]",[3f36d4f26ada778d89cf5a7344be0b9e9a5223a3]


In [18]:
# Send the frame's text rendering through the logger so it also ends up in the run's log file.
li(df_temp_answers)

2025-02-27T15:40:15 | INFO |                                                q_text   q_kind  \
0   For Ziff Davis, Inc., what was the value of Cl...   number   
1   Did Liberty Broadband Corporation announce a s...  boolean   
2   What is the total number of employees let go b...   number   
3   Which leadership positions changed at Westwate...    names   
4   Did Brave Bison Group plc mention any mergers ...  boolean   
..                                                ...      ...   
95  According to the annual report, what is the Ca...   number   
96  What was the value of End-of-year tech staff h...   number   
97  For Westwater Resources, Inc., what was the va...   number   
98  Which leadership positions changed at Origin B...    names   
99  What was the Gross margin (%) for Ritchie Bros...   number   

                               company_names  \
0                         [Ziff Davis, Inc.]   
1            [Liberty Broadband Corporation]   
2       [Pintec Technology Holding

In [19]:
# Flat list of every sha1 referenced by at least one question
# (repeats are kept when several questions point at the same document).
sha_pdfs_used = []
for question_docs in df_temp_answers["documents_to_look_through"]:
    sha_pdfs_used.extend(question_docs)
li(f"sha_pdfs_used={sha_pdfs_used}")

2025-02-27T15:40:16 | INFO | sha_pdfs_used=['ecabab4934d4b80570c4bb3b8e35b7476694b3fb', '446545ae548543d8744f8d885ff75face3424ba4', '9e794a58e511f6a6a9a13b201d652deff9f9f69a', '92d9de8e4db96e0b95a484afcd1c54c6beb62c03', 'ddd10e4612006205c4b1ba050a11648071e6e429', '682de8e45fd9688f3452bc0e18257132a8f3cff6', 'c74139ce26a6f803725f5074a8a0f539abb99c09', 'e117005fc313bf0d49429d34bc8e1ef64de54898', '553afbf09b6d83166b17acb02431c6cf38e4defc', '1af8f906e34af6e0acfe4f73e37093bbe34700f3', 'e30ece688caf7602b734bbbcf39559b4acdb2739', '0a61a353b1ea9fd9b8f63b60239634ca3007d58f', 'a85dba6c75031912d56a811637f803ba4ddeb257', 'aa781901e117281bfee6f8e4bea6fc9c9bada62e', 'e30ece688caf7602b734bbbcf39559b4acdb2739', '36dd058d3237202cbb94139611c8b8a35ff8c158', '5a24fa827d172a7669eca206b2a5f47c2b19b48d', '6d76ccb75bbf1b27ca60b8419c5343ac050cebb0', 'e117005fc313bf0d49429d34bc8e1ef64de54898', 'c7475e1d98f9a46a4652e503881d4a67232b41d3', '4d3e52b69b4b5366e54ce87cf641b01b1419bdee', '13999998018cc53440310d94a26d1e8

In [20]:
# Flag the subset documents that at least one question refers to.
df_subset["is_in_questions"] = df_subset["sha1"].isin(sha_pdfs_used)

# Write CSV and parquet copies, first next to this run's log, then to data_temp.
for out_dir in (f"../notebooks_logging/find_files/{now_str}", "../data_temp"):
    df_subset.to_csv(f"{out_dir}/04 subset.csv", index=False)
    df_subset.to_parquet(f"{out_dir}/04 subset.parquet", index=False)
li(f"../notebooks_logging/find_files/{now_str}/04 subset.csv")
df_subset

2025-02-27T15:40:18 | INFO | ../notebooks_logging/find_files/20250227_153721/04 subset.csv


Unnamed: 0,sha1,cur,company_name,major_industry,mentions_recent_mergers_and_acquisitions,has_leadership_changes,has_layoffs,has_executive_compensation,has_rnd_investment_numbers,has_new_product_launches,...,has_dividend_policy_changes,has_share_buyback_plans,has_capital_structure_changes,mentions_new_risk_factors,has_guidance_updates,has_regulatory_or_litigation_issues,has_strategic_restructuring,has_supply_chain_disruptions,has_esg_initiatives,is_in_questions
0,0279901b645e568591ad95dac2c2bf939ef0c00d,USD,ACRES Commercial Realty Corp.,Financial Services,False,False,False,True,False,False,...,False,True,False,True,True,False,False,False,True,True
1,0981826b4b43a88920f3e01c71ae73539bab84cc,USD,Aptevo Therapeutics Inc.,Healthcare,True,False,False,True,True,True,...,False,False,False,True,False,True,False,True,True,True
2,0a61a353b1ea9fd9b8f63b60239634ca3007d58f,USD,Downer EDI Limited,Transport & Logistics,True,True,True,True,False,False,...,True,True,True,True,True,True,True,False,True,True
3,0c0faea14d108e1617f2d6d2a7c1aae04eb88fe0,USD,Odyssey Gold Limited,Pharmaceuticals,True,True,False,True,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0f111d244aee3d976684995a222fa177a64571c4,USD,NextNav Inc.,Technology,True,False,False,True,True,True,...,False,False,False,True,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,f6a1077a46cb36761604f8f4840698e22837f5f7,USD,Aeeris Ltd,Technology,False,True,False,True,False,True,...,False,False,True,True,False,False,False,False,False,False
96,f7324f3868005b3aae6aafd30441c9bb6c8823e8,USD,Jefferies Financial Group Inc.,Financial Services,True,False,True,True,False,False,...,True,True,True,True,False,True,True,False,True,False
97,f85f0ccbbc59f1423d52d44e5ab70e3a01164499,USD,Integra LifeSciences Holdings Corporation,Healthcare,True,False,False,True,True,True,...,False,False,False,True,False,True,True,True,True,False
98,f879b3a802ccd6e8e6ca0a07ed8464318b7c0724,USD,Elixir Energy Limited,Energy and Utilities,True,False,False,True,True,True,...,False,False,True,True,False,False,False,True,True,True
