In [1]:
import openai
import pandas as pd
import tqdm

from pydantic import BaseModel, Field
from typing import Optional, List, Union, Literal

import json

In [2]:
# Read the OpenAI API key from a local file that is kept out of the notebook.
# Strip surrounding whitespace: token files usually end with a newline, which
# would otherwise become part of the key sent to the API.
with open("../tokens/openai_token.txt") as f:
    openai_token = f.read().strip()

In [3]:
class Question(BaseModel):
    """A single question entry, built from one item of the questions JSON file."""
    # Natural-language text of the question.
    text: str
    # Kind of the question; restricted to one of four fixed labels.
    kind: Literal["number", "name", "boolean", "names"]

class SourceReference(BaseModel):
    """Pointer to one page of a source PDF, identified by file hash and page index."""
    # Both fields are required (`...` marks them as having no default).
    pdf_sha1: str = Field(..., description="SHA1 hash of the PDF file")
    page_index: int = Field(..., description="Physical page number in the PDF file")

class Answer(BaseModel):
    """Answer to one question, optionally echoing the question and citing source pages."""
    # Optional echo of the question this answer belongs to.
    question_text: Optional[str] = Field(None, description="Text of the question")
    # Optional echo of the question kind; same labels as `Question.kind`.
    kind: Optional[Literal["number", "name", "boolean", "names"]] = Field(None, description="Kind of the question")
    # The only required field.
    # NOTE(review): `str` already admits "N/A", so the trailing Literal is informational;
    # how bool/number inputs are coerced across this Union depends on the pydantic
    # version's Union handling - confirm before relying on the resulting type.
    value: Union[float, str, bool, List[str], Literal["N/A"]] = Field(..., description="Answer to the question, according to the question schema")
    # Defaults to an empty list when no references are supplied.
    references: List[SourceReference] = Field([], description="References to the source material in the PDF file")

class AnswerSubmission(BaseModel):
    """Top-level submission payload: team identity, submission name and the answers."""
    # All three fields are required.
    team_email: str = Field(..., description="Email that your team used to register for the challenge")
    submission_name: str = Field(..., description="Unique name of the submission (e.g. experiment name)")
    answers: List[Answer] = Field(..., description="List of answers to the questions")

In [4]:
# Questions: parse the raw JSON, then validate every entry against the Question model.
with open("../data_in/questions.json") as questions_file:
    questions = json.load(questions_file)
questions_typed = [Question(**raw_question) for raw_question in questions]
print(questions_typed[:1])

# Subset: per-document metadata records; show the first one as a sanity check.
with open("../data_in/subset.json") as subset_file:
    subset = json.load(subset_file)
print(subset[0])

[Question(text="According to the annual report, what is the Operating margin (%) for Altech Chemicals Ltd  (within the last period or at the end of the last period)? If data is not available, return 'N/A'.", kind='number')]
{'sha1': '0a9e39e1d2e176f3a766a0e86af82772f1654a6e', 'cur': 'USD', 'company_name': 'KeyCorp', 'major_industry': 'Financial Services', 'mentions_recent_mergers_and_acquisitions': True, 'has_leadership_changes': True, 'has_layoffs': False, 'has_executive_compensation': True, 'has_rnd_investment_numbers': True, 'has_new_product_launches': True, 'has_capital_expenditures': True, 'has_financial_performance_indicators': True, 'has_dividend_policy_changes': True, 'has_share_buyback_plans': True, 'has_capital_structure_changes': False, 'mentions_new_risk_factors': True, 'has_guidance_updates': True, 'has_regulatory_or_litigation_issues': True, 'has_strategic_restructuring': False, 'has_supply_chain_disruptions': False, 'has_esg_initiatives': True}


In [5]:
# Company names taken from the subset records, in the same order as `subset`.
subset_companies_list = [
    document_meta["company_name"]
    for document_meta in subset
]
subset_companies_list

['KeyCorp',
 'Summit Materials, Inc.',
 'Gibson Energy Inc.',
 'HCW Biologics Inc.',
 'Essential Metals Limited',
 'EVI Industries, Inc.',
 'Koenig & Bauer AG',
 'Microchip Technology Incorporated',
 'Corbus Pharmaceuticals Holdings, Inc.',
 'Audalia Resources Limited',
 'Enerflex Ltd.',
 'MFA Financial, Inc.',
 'Strike Resources Limited',
 'Altech Chemicals Ltd',
 'HV Bancorp, Inc.',
 'Zymeworks Inc.',
 'iBio, Inc.',
 'Winnebago Industries, Inc.',
 'Terns Pharmaceuticals, Inc.',
 'Canadian Tire Corporation',
 'Prodigy Gold NL',
 'Alien Metals Limited',
 'Urban Logistics REIT plc',
 'Xero Limited',
 'Dunedin Enterprise Investment Trust PLC',
 'Cofinimmo',
 'Maxeon Solar Technologies, Ltd.',
 'Hagerty, Inc.',
 'Universal Electronics Inc.',
 'Lipocine Inc.',
 'ICICI Bank',
 'CareTrust REIT, Inc.',
 'Charles & Colvard, Ltd.',
 'Renold plc',
 'LVMH',
 'Advantage Solutions Inc.',
 'Harworth Group plc',
 'Johns Lyng Group Limited',
 'Nevro Corp.',
 'Ameresco, Inc.']

In [6]:
class CompanyAnswer(BaseModel):
    """Expected JSON reply when asking the model to extract company names from a question."""
    # Both fields are required; the model is used below only to validate parsed replies.
    result: List[str] = Field(..., description="List of company names")
    result_prob: float = Field(..., description="Probability that the result is the right one")

# OpenAI client authenticated with the token loaded from disk earlier in the notebook.
client = openai.OpenAI(api_key=openai_token)

# Build the extraction prompt for the first question.
question = questions[0]
# Send only the question text: interpolating the whole dict would put its Python
# repr (including the unrelated `kind` field) into the prompt.
question_text = question["text"]

msg = "Find company names in this question. There could be zero or more company names"

# The reply must be wrapped in @@@ markers so the JSON payload can be cut out of
# any surrounding text. The schema mirrors the CompanyAnswer model and is kept
# valid JSON (no trailing commas).
msg += """
---
Output format:
The output should start with @@@ and end with @@@. If the result is empty it should still correspond to the JSON schema and the result should be an empty array.
The output is a JSON that corresponds to the following schema:
{
    "type": "object",
    "properties": {
        "result": {
            "description": "List of company names",
            "type": "array",
            "items": {
                "type": "string"
            }
        },
        "result_prob": {
            "description": "Probability that the result is the right one. Float from 0 to 1.",
            "type": "number"
        }
    },
    "required": [
        "result",
        "result_prob"
    ]
}
"""

msg += f"""
---
Question:
{question_text}
"""

# Ask the model the same question three times; the answers are aggregated by
# vote in the next cell.
model_answers = []
for i in range(3):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": msg,
            }
        ],
        model="o1-mini",
    )
    # `content` can be None (e.g. a refusal); treat that as an empty reply.
    ans = chat_completion.choices[0].message.content or ""

    # A well-formed reply is exactly `@@@ <json> @@@`, i.e. three parts after splitting.
    ans_d = None
    ans_l_splitted = ans.split('@@@')
    if len(ans_l_splitted) == 3:
        try:
            candidate = json.loads(ans_l_splitted[1])
            # Validation only: raises if the payload does not match the schema.
            CompanyAnswer(**candidate)
            ans_d = candidate
        except (TypeError, ValueError):
            # json.JSONDecodeError and pydantic's ValidationError are ValueErrors;
            # TypeError covers a payload that is not a JSON object.
            # The failure is reported and replaced by a fallback just below.
            pass

    if ans_d is None:
        # Keep the fallback schema-shaped (a list, zero confidence) so a failed
        # attempt can never be mistaken for a list of company names downstream.
        print(f"Attempt {i}: could not parse model reply: {ans!r}")
        ans_d = {
            "result": [],
            "result_prob": 0.0
        }

    model_answers.append(ans_d)
model_answers

[{'result': ['Altech Chemicals Ltd'], 'result_prob': 1.0},
 {'result': ['Altech Chemicals Ltd'], 'result_prob': 1.0},
 {'result': ['Altech Chemicals Ltd'], 'result_prob': 1.0}]

In [7]:
# Pick the winning answer: among the answers with the highest reported
# probability, take the one that was returned most often.
df_result = pd.DataFrame(model_answers)
# Unparseable probabilities count as zero confidence. Filling the column itself
# (not just the max) guarantees at least one row matches `max_prob` below.
df_result["result_prob"] = pd.to_numeric(df_result["result_prob"], errors="coerce").fillna(0.0)
max_prob = df_result["result_prob"].max()
# Lists are not hashable, so group on their string form.
df_result["result_str"] = df_result["result"].apply(str)
# Select a single column before transforming so the result is always a Series,
# regardless of how many other columns the frame has.
df_result["result_votes"] = (
    df_result.groupby(["result_str", "result_prob"])["result"].transform("size")
)
df_result_max_prob = df_result[df_result["result_prob"] == max_prob]
max_votes = df_result_max_prob["result_votes"].max()
list_companies = df_result_max_prob.loc[
    df_result_max_prob["result_votes"] == max_votes, "result"
].iloc[0]
# A non-list winner (e.g. raw text from a failed parse) must not be iterated
# character by character downstream; treat it as "no companies found".
if not isinstance(list_companies, list):
    list_companies = []
list_companies

['Altech Chemicals Ltd']

In [None]:
# Positions (row indices of the company list) of the documents matched so far.
# Reset at the top of the cell so re-running it does not accumulate duplicates.
subset_documents_to_look_through = []

class ListContainsAnswer(BaseModel):
    """Expected JSON reply when asking the model to locate a company in the known list."""
    result_name: str = Field(..., description="Company name as found in the list, or N/A if not found")
    result_list_position: Union[int, Literal["N/A"]] = Field(..., description="Position of the company in the list, or N/A if not found")
    result_prob: float = Field(..., description="Probability that the result is the right one")

# For every company extracted from the question, ask the model which entry of
# the known company list it corresponds to, then record that entry's position.

# Neither the client nor the lookup frame depends on the company: build them once.
client = openai.OpenAI(api_key=openai_token)
df_subset = pd.DataFrame({"subset_company": subset_companies_list})

for company in list_companies:
    msg = "Find this company name in the following list. "

    # The reply must be wrapped in @@@ markers so the JSON payload can be cut out
    # of any surrounding text. The schema mirrors ListContainsAnswer.
    msg += """
    ---
    Output format:
    The output should start with @@@ and end with @@@. If the company is not found the output should still correspond to the JSON schema.
    The output is a JSON that corresponds to the following schema:
    {
        "type": "object",
        "properties": {
            "result_name": {
                "description": "The company name found in list. If not found return N/A",
                "type": "string"
            },
            "result_list_position": {
                "description": "The position of the found company in a list. If not found return N/A",
                "type": "string"
            },
            "result_prob": {
                "description": "Probability that the result is the right one. Float from 0 to 1.",
                "type": "number"
            }
        },
        "required": [
            "result_name",
            "result_list_position",
            "result_prob"
        ]
    }
    """

    msg += f"""
    ---
    Company name to find:
    {company}
    ---
    List of companies:
    """

    msg += "\n".join(subset_companies_list)

    # Sample the model three times; the answers are aggregated by vote below.
    model_answers = []
    for i in range(3):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": msg,
                }
            ],
            model="o1-mini",
        )
        # `content` can be None (e.g. a refusal); treat that as an empty reply.
        ans = chat_completion.choices[0].message.content or ""

        # A well-formed reply is exactly `@@@ <json> @@@`, i.e. three parts after splitting.
        ans_d = None
        ans_l_splitted = ans.split('@@@')
        if len(ans_l_splitted) == 3:
            try:
                candidate = json.loads(ans_l_splitted[1])
                # Validation only: raises if the payload does not match the schema.
                ListContainsAnswer(**candidate)
                ans_d = candidate
            except (TypeError, ValueError):
                # json.JSONDecodeError and pydantic's ValidationError are ValueErrors;
                # TypeError covers a payload that is not a JSON object.
                # The failure is reported and replaced by a fallback just below.
                pass

        if ans_d is None:
            # The fallback uses the same keys as a valid reply so the aggregation
            # below always finds `result_name`; "N/A" matches no list entry.
            print(f"{company!r}, attempt {i}: could not parse model reply: {ans!r}")
            ans_d = {
                "result_name": "N/A",
                "result_list_position": "N/A",
                "result_prob": 0.0
            }

        model_answers.append(ans_d)
    print(model_answers)

    # Among the answers with the highest reported probability, keep the name
    # that was returned most often.
    df_result = pd.DataFrame(model_answers)
    # Unparseable probabilities count as zero confidence; filling the column
    # itself guarantees at least one row matches `max_prob`.
    df_result["result_prob"] = pd.to_numeric(df_result["result_prob"], errors="coerce").fillna(0.0)
    max_prob = df_result["result_prob"].max()
    df_result["result_str"] = df_result["result_name"].apply(str)
    df_result["result_votes"] = (
        df_result.groupby(["result_str", "result_prob"])["result_name"].transform("size")
    )
    df_result_max_prob = df_result[df_result["result_prob"] == max_prob]
    max_votes = df_result_max_prob["result_votes"].max()
    company_name_in_list = df_result_max_prob.loc[
        df_result_max_prob["result_votes"] == max_votes, "result_str"
    ].iloc[0]

    # Exact string match against the list; the row index is the list position.
    subset_documents_to_look_through_i = df_subset.index[
        df_subset["subset_company"] == company_name_in_list
    ].to_list()
    subset_documents_to_look_through.extend(subset_documents_to_look_through_i)

print(subset_documents_to_look_through)


  0%|          | 0/1 [00:00<?, ?it/s]

[{'result_name': 'Altech Chemicals Ltd', 'result_list_position': '14', 'result_prob': 1.0}, {'result_name': 'Altech Chemicals Ltd', 'result_list_position': '14', 'result_prob': 1.0}, {'result_name': 'Altech Chemicals Ltd', 'result_list_position': '14', 'result_prob': 1.0}]
[13]
