In [None]:
import picologging as logging
from pydantic import BaseModel, Field
import pandas as pd
import httpx

import pathlib
import datetime
import json
from typing import Optional, List, Union, Literal
import io

In [None]:
# Logging setup for this notebook run. `now_str` is the run's timestamp; it names
# the log file here and is reused further down as the pre-submission output folder.
now = datetime.datetime.now()
now_str = now.strftime("%Y%m%d_%H%M%S")
# parents=True also creates ../notebooks_logging/07_collect_result itself, so a
# separate mkdir for the parent directory is not needed.
pathlib.Path(f"../notebooks_logging/07_collect_result/{now_str}").mkdir(parents=True, exist_ok=True)

# The encoding is set on the handler itself: in the standard library's
# logging.basicConfig the `encoding` argument is only used for a handler that
# basicConfig creates from `filename`, not for ready-made handlers.
# NOTE(review): picologging is assumed to mirror that behaviour - confirm.
file_handler = logging.FileHandler(
    f"../notebooks_logging/07_collect_result/{now_str}.log",
    encoding="utf-8",
)
stream_handler = logging.StreamHandler()
logging.basicConfig(
    encoding='utf-8',
    format='%(asctime)s | %(levelname)s | %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%dT%H:%M:%S',
    handlers=[
        file_handler,
        stream_handler,
    ],
    force=True
)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Short aliases used throughout the notebook.
li = logger.info
lw = logger.warning
# One record per level to confirm both handlers are receiving output.
li("start")
lw("check warning")

In [None]:
# Per-question result files to merge into the submission.
# - sorted(): glob order is filesystem-dependent, and a later cell lets the last
#   non-"N/A" file for a question overwrite its value, so a fixed order keeps
#   reruns reproducible.
# - is_file(): sub-directories (e.g. editor checkpoint folders) would make
#   read_text() fail in the cells that consume this list.
files4questions = sorted(
    path
    for path in pathlib.Path("../data_temp/06_ask_questions").glob("*")
    if path.is_file()
)
# Joined outside the f-string: reusing the same quote character and a backslash
# inside an f-string expression only parses on Python 3.12+.
files4questions_listing = "\n".join(str(x) for x in files4questions)
li(f"files4questions=\n{files4questions_listing}")

In [None]:
# Load the question table and expose its "documents_to_look_through" column
# under the name the rest of the notebook reads, "ref_sha1".
df_temp_answers04 = (
    pd.read_parquet("../data_temp/04 temp_answers.parquet")
    .rename(columns={"documents_to_look_through": "ref_sha1"})
)
df_temp_answers04

In [None]:
# Load the raw question list for inspection. The encoding is pinned to UTF-8
# (the JSON interchange encoding) so the read does not depend on the platform's
# default locale encoding.
with open("../data_in/questions.json", encoding="utf-8") as f:
    questions = json.load(f)
questions

In [None]:
class Question(BaseModel):
    """A question: its text and the kind of answer it expects."""
    # Free-form question text.
    text: str
    # Expected answer kind; restricted to these four literals.
    kind: Literal["number", "name", "boolean", "names"]

class SourceReference(BaseModel):
    """Pointer to one page of one PDF that supports an answer.

    Both fields are required.
    """
    pdf_sha1: str = Field(..., description="SHA1 hash of the PDF file")
    page_index: int = Field(..., description="Physical page number in the PDF file")

class Answer(BaseModel):
    """One answer entry of a submission.

    Only `value` is required: `question_text` and `kind` default to None and
    `references` defaults to an empty list.
    """
    question_text: Optional[str] = Field(None, description="Text of the question")
    kind: Optional[Literal["number", "name", "boolean", "names"]] = Field(None, description="Kind of the question")
    # "N/A" is the marker this notebook uses for "no answer available".
    # NOTE(review): `str` already admits "N/A", so the Literal member looks
    # documentary only. How float/bool members of the Union are resolved depends
    # on the pydantic version in use - confirm.
    value: Union[float, str, bool, List[str], Literal["N/A"]] = Field(..., description="Answer to the question, according to the question schema")
    references: List[SourceReference] = Field([], description="References to the source material in the PDF file")

class AnswerSubmission(BaseModel):
    """Top-level submission payload: team identity plus the list of answers.

    All three fields are required. The notebook instantiates this model from the
    assembled `res` dict purely to validate its shape.
    """
    team_email: str = Field(..., description="Email that your team used to register for the challenge")
    submission_name: str = Field(..., description="Unique name of the submission (e.g. experiment name)")
    answers: List[Answer] = Field(..., description="List of answers to the questions")

In [None]:
res = {
    "team_email": "keiv.fly@gmail.com",
    "submission_name": "Sergey Nikonov main v2",
}
answers = []
for i_row, row in df_temp_answers04.iterrows():
    ans = {}
    q_text = row["q_text"]
    ans["question_text"] = q_text
    q_kind = row["q_kind"]
    ans["kind"] = q_kind

    match q_kind:
        case "number":
            ans_value = "N/A"           
        case "name":
            ans_value = "N/A"
        case "boolean":
            ans_value = False
        case "names":
            ans_value = "N/A"

    ans["value"] = ans_value

    ans["references"] = []
    ans["_company_names"] = list(row["company_names"])
    ans["_is_multiple_company_names"] = len(row["company_names"]) > 1
    ans["_ref_sha1"] = list(row["ref_sha1"])
    ans["_q_id"] = i_row

    answers.append(ans)
    
res["answers"] = answers
_ = AnswerSubmission(**res)

In [None]:
# Merge the per-document results into the submission skeleton. Every result
# whose value is not "N/A" overwrites the answer's value (so the last such file
# for a question wins) and contributes one source reference.
for file in files4questions:
    q_sha1_res = json.loads(file.read_text())
    if q_sha1_res["value"] == "N/A":
        continue

    answer = res["answers"][q_sha1_res["question_id"]]
    answer["value"] = q_sha1_res["value"]

    reference = {
        "pdf_sha1": q_sha1_res["ref_sha1_i"],
        "page_index": q_sha1_res["ref_sha1_i_page"],
    }
    # Skip references that are already present so that re-running this cell
    # does not pile up duplicates in the shared `res` structure.
    if reference not in answer["references"]:
        answer["references"].append(reference)

res
    

In [None]:
# Resolve questions that compare several companies. For each such question the
# per-document results are ranked by their numeric value and the answer becomes
# the "company_name" of the top-ranked row; every row with a numeric value is
# kept as a reference. The full ranking is also saved as a CSV per question.
li("Working with questions with multiple companies")

# Parse every result file once and bucket by question id, instead of re-reading
# and re-parsing all files for each multi-company question.
results_by_question = {}
for file in files4questions:
    q_sha1_res = json.loads(file.read_text())
    results_by_question.setdefault(q_sha1_res["question_id"], []).append(q_sha1_res)

for q_id, item in enumerate(res["answers"]):
    if not item["_is_multiple_company_names"]:
        continue

    answers06 = results_by_question.get(q_id, [])
    if not answers06:
        # An empty DataFrame has no "value" column, so the ranking below would
        # fail with a KeyError; treat "no results at all" like "no numeric result".
        lw(f"q_id={q_id}: no result files found for a multi-company question")
        item["value"] = "N/A"
        item["references"] = []
        continue

    df_answers06 = pd.DataFrame(answers06)
    # Non-numeric values (e.g. "N/A") become NaN and are excluded from the ranking.
    df_answers06["value_float"] = pd.to_numeric(df_answers06["value"], errors="coerce")
    df_answers06_filtered = df_answers06[df_answers06["value_float"].notna()].copy()

    # Ranking direction comes from the question wording. Questions mentioning
    # neither word fall back to ascending order, same as "lowest".
    q_text = item["question_text"]
    if "lowest" in q_text:
        direction, ascending = "lowest", True
    elif "highest" in q_text:
        direction, ascending = "highest", False
    else:
        direction, ascending = "other", True
    li(direction)
    df_answers06_filtered = df_answers06_filtered.sort_values("value_float", ascending=ascending)
    df_answers06 = df_answers06.sort_values("value_float", ascending=ascending)

    if len(df_answers06_filtered) == 0:
        val = "N/A"
    else:
        val = df_answers06_filtered["company_name"].iloc[0]

    item["value"] = val

    # to_dict('records') on an empty frame yields [], so no special case is needed.
    df_answers06_filtered_renamed = df_answers06_filtered[["ref_sha1_i", "ref_sha1_i_page"]].rename(
        columns={"ref_sha1_i": "pdf_sha1", "ref_sha1_i_page": "page_index"}
    )
    item["references"] = df_answers06_filtered_renamed.to_dict('records')

    li(json.dumps(item, indent=4))

    # Keep the complete ranking (including non-numeric rows) for later inspection.
    filename_csv = f"q{q_id:04d}.csv"
    folder = pathlib.Path(f"../data_temp/07_collect_result/")
    folder.mkdir(parents=True, exist_ok=True)
    full_filename_csv = folder / filename_csv

    df_answers06.to_csv(full_filename_csv, index=False)

In [None]:
# Snapshot of the full working structure, including the "_"-prefixed helper
# keys, written to a folder named after this run's timestamp.
submission_folder_before_submission = now_str
out_dir = pathlib.Path(f"../data_out/{submission_folder_before_submission}")
out_dir.mkdir(parents=True, exist_ok=True)
w_all_path = pathlib.Path(f"../data_out/{submission_folder_before_submission}/submission_w_all.json")
with w_all_path.open("w") as fh:
    json.dump(res, fh, indent=4)

In [None]:
# Strip the notebook-internal helper keys from every answer (in place, on `res`)
# and write the cleaned structure next to the full snapshot.
helper_keys = ("_company_names", "_is_multiple_company_names", "_ref_sha1", "_q_id")
for ans in res["answers"]:
    for key in helper_keys:
        # pop with a default is a no-op when the key is already gone
        ans.pop(key, None)

wo_additions_path = pathlib.Path(f"../data_out/{submission_folder_before_submission}/submission_wo_additions.json")
with wo_additions_path.open("w") as fh:
    json.dump(res, fh, indent=4)

In [None]:
# Read the cleaned submission back from disk; this parsed copy is what gets
# uploaded in the request cells.
with open(f"../data_out/{submission_folder_before_submission}/submission_wo_additions.json") as fh:
    res_json = json.load(fh)
res_json

In [None]:
# Upload the submission to the check-submission endpoint as a multipart file
# and display the decoded JSON reply.
url = "https://rag.timetoact.at/check-submission"
headers = {"accept": "application/json"}
payload = json.dumps(res_json).encode()
files = {
    # (filename, file object, content type)
    "file": ("submision.json", io.BytesIO(payload), "application/json"),
}
response = httpx.post(url, headers=headers, files=files)
response.json()

In [None]:
# Upload the submission to the submit endpoint as a multipart file and keep the
# decoded JSON reply in `submission_response` for the cells below.
url = "https://rag.timetoact.at/submit"
headers = {"accept": "application/json"}
files = {
    # (filename, file object, content type)
    "file": ("submision.json", io.BytesIO(json.dumps(res_json).encode()), "application/json")
}
response = httpx.post(url, headers=headers, files=files)
# Fail here on a non-2xx reply instead of later with a JSON decode error or a
# KeyError on the missing "response" entry. `response` stays assigned, so
# `response.text` can be inspected after a failure.
response.raise_for_status()
submission_response = response.json()
submission_response

In [None]:
# Name the final output folder after the time and submission name reported in
# the submit reply.
response_meta = submission_response["response"]
s_datetime = pd.to_datetime(response_meta["time"]).strftime("%Y%m%d_%H%M%S")
s_submission_name = response_meta["submission_name"]
submission_folder = f"{s_datetime}_{s_submission_name}"
submission_folder

In [None]:
# Archive the submitted answers (`res`, already stripped of helper keys) in the
# folder named after the submit reply.
final_dir = pathlib.Path(f"../data_out/{submission_folder}")
final_dir.mkdir(parents=True, exist_ok=True)
submission_path = pathlib.Path(f"../data_out/{submission_folder}/submission.json")
with submission_path.open("w") as fh:
    json.dump(res, fh, indent=4)

In [None]:
# Store the submit endpoint's reply alongside the archived submission.
response_path = pathlib.Path(f"../data_out/{submission_folder}/submission_response.json")
with response_path.open("w") as fh:
    json.dump(submission_response, fh, indent=4)