In [1]:
import sys
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
import warnings

# Load environment variables
_ = load_dotenv(find_dotenv())

# Append the submodules path to the local libs directory
repo_dir = Path().resolve()
sys.path.append(str(repo_dir / 'libs'))

# Ensure the symlink exists (assuming setup_symlink.py has been executed)
symlink_path = repo_dir / 'libs' / 'NLP_on_multilingual_coin_datasets'
if not symlink_path.exists():
    print(f"Error: Symlink {symlink_path} does not exist. Run setup_symlink.py first.")
    sys.exit(1)

# Import the custom modules after ensuring symlink is in place
from NLP_on_multilingual_coin_datasets.cnt.io import Database_Connection
from modules.loading_preprocessed_designs import PreprocessingConfig, LoadingPreprocessedDesigns
from modules import scripts, prompts

# Set up pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

# Suppress warnings
warnings.filterwarnings('ignore')

# Access the OpenAI API key from environment variables
api_key = os.getenv('OPENAI_API_KEY')

prep_cfg = PreprocessingConfig





Define filenames and paths

In [2]:
json_dir = prep_cfg.json_path
enhanced_json_filename = "enhanced_objects.json"
sop_json_filename = "subject_object_pairs.json"
pred_json_filename = "subject_predicate_object_triples.json"
#tmps
tmp_dir = prep_cfg.tmp_path
job_ids_file_name = Path("batch_job_ids.json")
job_ids_file_path = tmp_dir / job_ids_file_name

Define database connection parameters or set them as environment variables.

In [3]:
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
database = prep_cfg.database

dc = Database_Connection(f"mysql+mysqlconnector://{db_user}:{db_password}@{db_host}:{db_port}/{database}")

# Load or preprocess data
- Check for the preprocessed designs.csv file in the `data` directory. 
    - If it does not exist, get the data from the database and preprocess it.
    - Else load the data from the file.
- Create Dataframe

In [4]:
lpd = LoadingPreprocessedDesigns(dc, prep_cfg)
df_designs = lpd.load_designs_csv_or_process_database()

2024-09-01 17:39:30,638 - INFO - Checking if file data/source/lists/csv/annotated_designs.csv exists.
2024-09-01 17:39:30,742 - INFO - Converting annotations to list.
2024-09-01 17:39:31,493 - INFO - File exists and was loaded.


# Prep Dataframe 

- create copy
- filter the columns id, design_en and annotations
    - respresenting only the preprocessed columns

In [5]:
df_designs_0 = df_designs.copy()
df_designs = df_designs[["id", "design_en", "annotations"]]
df_designs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22333 entries, 0 to 22332
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           22333 non-null  int64 
 1   design_en    22333 non-null  object
 2   annotations  22333 non-null  object
dtypes: int64(1), object(2)
memory usage: 523.6+ KB


In [6]:
df_designs_0.head(5)

Unnamed: 0,design_en_orig,id,annotations_orig,design_en,annotations
0,"Diademed head of deified Alexander the Great with horn of Ammon, right. Border of dots.",1,"[(9, 13, 'OBJECT'), (25, 44, 'PERSON'), (50, 54, 'OBJECT')]","Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[(0, 6, OBJECT), (7, 11, OBJECT), (23, 42, PERSON), (48, 52, OBJECT)]"
1,"Altar, lighted and garlanded.",6,"[(0, 5, 'OBJECT')]","Altar, lighted and garland.","[(0, 5, OBJECT), (19, 26, OBJECT)]"
2,Prize amphora on ornamental stand; within linear square and incuse square.,8,"[(6, 13, 'OBJECT')]",Prize amphora on ornamental stand; within linear square and incuse square.,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handles containing two ears of corn and poppy.,9,"[(0, 7, 'OBJECT'), (40, 47, 'OBJECT'), (63, 67, 'PLANT'), (71, 75, 'PLANT'), (80, 85, 'PLANT')]",Amphora with ribbed surface and crooked handleholding two corn and poppy.,"[(0, 7, OBJECT), (58, 62, PLANT), (67, 72, PLANT)]"
4,"Bust of youthful Anchialos, right, wearing taenia. Border of dots.",10,"[(0, 4, 'OBJECT'), (17, 26, 'PERSON'), (43, 49, 'OBJECT')]","Bust of youthful Anchialos, right, wearing taenia. Border of dots.","[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OBJECT)]"


**create strings from annotations**

In [7]:
df_designs["list_of_strings"] = df_designs.apply(scripts.generate_list_of_strings, axis=1)
df_designs.head(5)

Unnamed: 0,id,design_en,annotations,list_of_strings
0,1,"Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[(0, 6, OBJECT), (7, 11, OBJECT), (23, 42, PERSON), (48, 52, OBJECT)]","[(Diadem, OBJECT), (head, OBJECT), (Alexander the Great, PERSON), (horn, OBJECT)]"
1,6,"Altar, lighted and garland.","[(0, 5, OBJECT), (19, 26, OBJECT)]","[(Altar, OBJECT), (garland, OBJECT)]"
2,8,Prize amphora on ornamental stand; within linear square and incuse square.,"[(6, 13, OBJECT)]","[(amphora, OBJECT)]"
3,9,Amphora with ribbed surface and crooked handleholding two corn and poppy.,"[(0, 7, OBJECT), (58, 62, PLANT), (67, 72, PLANT)]","[(Amphora, OBJECT), (corn, PLANT), (poppy, PLANT)]"
4,10,"Bust of youthful Anchialos, right, wearing taenia. Border of dots.","[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OBJECT)]","[(Bust, OBJECT), (Anchialos, PERSON), (taenia, OBJECT)]"


### Define a subset of the data to be used for the testing of the implementation.
- 22332 rows
- define top and start 
- only new ones will be processed, existing will be skipped with filtering.

In [8]:
start = 0 
stop = 3250


df_designs_source = df_designs.iloc[start:stop].copy()
df_designs_source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250 entries, 0 to 3249
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               3250 non-null   int64 
 1   design_en        3250 non-null   object
 2   annotations      3250 non-null   object
 3   list_of_strings  3250 non-null   object
dtypes: int64(1), object(3)
memory usage: 101.7+ KB


# Step 0: Check for More Possible Subjects or Objects
- **Input:** Design description and list of strings (entities).
- **Output:** Identified and verified subjects and objects categorized as PERSON, OBJECT, ANIMAL, PLANT.


**Filter source data for already computed datapoints.**

In [9]:
df_designs_filtered = scripts.filter_source_dataframe(df_designs_source, json_dir, enhanced_json_filename)
df_designs_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 3200 to 3249
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               50 non-null     int64 
 1   design_en        50 non-null     object
 2   annotations      50 non-null     object
 3   list_of_strings  50 non-null     object
dtypes: int64(1), object(3)
memory usage: 2.0+ KB


**Define client and batchsize (not the OpenAI batch) but the size of the datapoints to be processed in one prompt.**

In [10]:
client = OpenAI(api_key=api_key)
batch_size = 32
batch_start = 0 # for token calc
batch_stop = len(df_designs_filtered)//batch_size + 1 # for token calc

**Create prompts and batches for enhance objects**

**Note** The price claculation is only an example and only for the input token, based on:
- gpt4o, 2024-July:
    - 5$/Million Token * 0.5 for batch API discount 

In [11]:
prompts_enhance = prompts.enhance_objects_in_designs(df_designs_filtered, batch_size)
scripts.calculate_total_tokens_and_price(prompts_enhance, batch_start, batch_stop, batch=True)
batch_file = scripts.create_tasks_batch(prompts_enhance, client, tmp_dir, step="0")

Token count for prompt 0: 3333, Price: $0.00833
Token count for prompt 1: 2374, Price: $0.00593
Total token count: 5707
Total input price: $0.01427
Created 2 tasks
Tasks saved to data/data/results/tmp/batchinput_0.jsonl


2024-09-01 17:39:33,120 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


Batch file uploaded: FileObject(id='file-CRdxVxXey8rjadKi7Gi9leFP', bytes=27055, created_at=1725205172, filename='batchinput_0.jsonl', object='file', purpose='batch', status='processed', status_details=None)


**¡¡¡ creates the batch job and sends them to the OpenAI API !!!**
- and also saves the batch job ID to a file

In [12]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Add job ID to file
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="0")

2024-09-01 17:39:33,846 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-09-01 17:39:33,856 - INFO - Job ID batch_hw46pm6xrPlgXAEEX2OXGUYx added to the file.


In [13]:
print(batch_job)


Batch(id='batch_hw46pm6xrPlgXAEEX2OXGUYx', completion_window='24h', created_at=1725205173, endpoint='/v1/chat/completions', input_file_id='file-CRdxVxXey8rjadKi7Gi9leFP', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725291573, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


**Load the newest job ID from the file**
- For previous job IDs check the file `temp/batch_jobs_id.json` with timestamps.
- The idea behind this:
    - If the kernel is restarted, the job ID is still available and the job can be continued.

In [14]:
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="0")
except (FileNotFoundError, ValueError) as e:
    print(e)

2024-09-01 17:39:36,092 - INFO - The newest job ID for task 0 is: batch_hw46pm6xrPlgXAEEX2OXGUYx
2024-09-01 17:39:36,093 - INFO - Timestamp: 2024-09-01T17:39:33.856402+02:00


**Check the status of the job ID**

In [24]:
status_info = scripts.retrieve_batch_job_status(client, newest_job_id)

2024-09-01 17:43:51,270 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_hw46pm6xrPlgXAEEX2OXGUYx "HTTP/1.1 200 OK"
2024-09-01 17:43:51,273 - INFO - Batch Job Status: completed
2024-09-01 17:43:51,274 - INFO - Batch Job ID: batch_hw46pm6xrPlgXAEEX2OXGUYx
2024-09-01 17:43:51,274 - INFO - Input File ID: file-CRdxVxXey8rjadKi7Gi9leFP
2024-09-01 17:43:51,275 - INFO - Request Counts: Completed: 2, Failed: 0, Total: 2


**If the status is completed, load the results and proceed**

In [25]:
batch_job = client.batches.retrieve(newest_job_id)
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_enhanced = scripts.parse_and_clean_batch_responses(result)

df_enhanced_merged = df_responses_enhanced.merge(
    df_designs_filtered[['id', 'design_en', 'list_of_strings']], 
    left_on='design_id', 
    right_on='id', 
    how='left'
).drop(columns='id')

df_enhanced_merged.info()


2024-09-01 17:43:54,299 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_hw46pm6xrPlgXAEEX2OXGUYx "HTTP/1.1 200 OK"
2024-09-01 17:43:54,740 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-8yrruQ0zS79bBscZ4O9KLRsQ/content "HTTP/1.1 200 OK"
2024-09-01 17:43:54,742 - INFO - Parsed 2 JSON objects.
2024-09-01 17:43:54,746 - ERROR - JSON format error: Expecting ',' delimiter: line 5 column 1 (char 117)
2024-09-01 17:43:54,748 - INFO - JSON format error fixed.
2024-09-01 17:43:54,749 - ERROR - JSON format error: Expecting ',' delimiter: line 5 column 1 (char 108)
2024-09-01 17:43:54,749 - INFO - JSON format error fixed.
2024-09-01 17:43:54,750 - INFO - Cleaned 50 responses.
2024-09-01 17:43:54,754 - INFO - DataFrame created from cleaned responses.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   design_id            50 non-null     int64 
 1   new_list_of_strings  50 non-null     object
 2   design_en            50 non-null     object
 3   list_of_strings      50 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.7+ KB


### Step 0.1 Validate and Classify enhanced entities

**Create prompts and batches for validation of enhanced objects**

In [26]:
prompts_validate_enhanced = prompts.validate_overall_objects_in_designs(df_enhanced_merged, batch_size)
scripts.calculate_total_tokens_and_price(prompts_validate_enhanced, batch_start, batch_stop, batch=True)
batch_file = scripts.create_tasks_batch(prompts_validate_enhanced, client, tmp_dir, step="0_1")


Token count for prompt 0: 4334, Price: $0.01084
Token count for prompt 1: 3025, Price: $0.00756
Total token count: 7359
Total input price: $0.01840
Created 2 tasks
Tasks saved to data/data/results/tmp/batchinput_0_1.jsonl


2024-09-01 17:43:59,542 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


Batch file uploaded: FileObject(id='file-PJW1NygfOapocnQMH62xRZnu', bytes=33697, created_at=1725205439, filename='batchinput_0_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)


**¡¡¡ creates the batch job and sends them to the OpenAI API !!!**
- and also saves the batch job ID to a file

In [27]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Add job ID to file
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="0_1")

2024-09-01 17:44:07,621 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-09-01 17:44:07,625 - INFO - Job ID batch_2yNoOm6mp2V9zDnxCbRx0zLL added to the file.


**Load the newest job ID from the file**
- For previous job IDs check the file `temp/batch_jobs_id.json` with timestamps.
- The idea behind this:
    - If the kernel is restarted, the job ID is still available and the job can be continued.

In [28]:
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="0_1")
except (FileNotFoundError, ValueError) as e:
    print(e)

2024-09-01 17:44:09,927 - INFO - The newest job ID for task 0_1 is: batch_2yNoOm6mp2V9zDnxCbRx0zLL
2024-09-01 17:44:09,928 - INFO - Timestamp: 2024-09-01T17:44:07.624586+02:00


**Check the status of the job ID**

In [30]:
scripts.retrieve_batch_job_status(client, newest_job_id)

2024-09-01 17:52:11,973 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_2yNoOm6mp2V9zDnxCbRx0zLL "HTTP/1.1 200 OK"
2024-09-01 17:52:11,975 - INFO - Batch Job Status: completed
2024-09-01 17:52:11,976 - INFO - Batch Job ID: batch_2yNoOm6mp2V9zDnxCbRx0zLL
2024-09-01 17:52:11,977 - INFO - Input File ID: file-PJW1NygfOapocnQMH62xRZnu
2024-09-01 17:52:11,977 - INFO - Request Counts: Completed: 2, Failed: 0, Total: 2


{'status': 'completed',
 'batch_job_id': 'batch_2yNoOm6mp2V9zDnxCbRx0zLL',
 'input_file_id': 'file-PJW1NygfOapocnQMH62xRZnu',
 'completed': 2,
 'failed': 0,
 'total': 2}

**If the status is completed, load the results and proceed**

In [31]:
batch_job = client.batches.retrieve(newest_job_id)
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_val_enhanced = scripts.parse_and_clean_batch_responses(result)

df_enhanced_validated = df_responses_val_enhanced.merge(
    df_enhanced_merged, 
    on=['design_id'], 
    how='left')

df_enhanced_validated.info()

2024-09-01 17:52:14,923 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_2yNoOm6mp2V9zDnxCbRx0zLL "HTTP/1.1 200 OK"
2024-09-01 17:52:15,372 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-YdfE3X4GYQnIv8Mo0VzzbyQz/content "HTTP/1.1 200 OK"
2024-09-01 17:52:15,493 - INFO - Parsed 2 JSON objects.
2024-09-01 17:52:15,496 - INFO - Cleaned 50 responses.
2024-09-01 17:52:15,500 - INFO - DataFrame created from cleaned responses.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   design_id            50 non-null     int64 
 1   relevance            50 non-null     int64 
 2   correctness          50 non-null     int64 
 3   comment_enh          50 non-null     object
 4   new_list_of_strings  50 non-null     object
 5   design_en            50 non-null     object
 6   list_of_strings      50 non-null     object
dtypes: int64(3), object(4)
memory usage: 2.9+ KB


**Save validated enahanced objects to file**

In [32]:
columns = ['design_id', 'design_en', 'new_list_of_strings', 
           'relevance', 'correctness', 'comment_enh', 'list_of_strings']
scripts.update_json_with_merged_df(df_enhanced_validated, columns, json_dir, enhanced_json_filename)

In [33]:
# show all different values for correctness in df_enhanced_validated
df_enhanced_validated['correctness'].value_counts()


correctness
1    47
0     3
Name: count, dtype: int64

### Step 1: Identify Subject-Object Pairs
- **Input:** Design description and categorized entities.
- **Output:** List of subject-object pairs.

In [34]:
df_enhanced = pd.read_json(Path(json_dir) / enhanced_json_filename)
df_enhanced.info()
# df_enhanced['design_id'].nunique()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250 entries, 0 to 3249
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            3250 non-null   int64  
 1   design_en            3250 non-null   object 
 2   new_list_of_strings  3250 non-null   object 
 3   completeness         64 non-null     float64
 4   relevance            3250 non-null   int64  
 5   correctness          3250 non-null   int64  
 6   comment_enh          3250 non-null   object 
 7   list_of_strings      3250 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 203.2+ KB


In [35]:
df_enhanced_filtered = scripts.filter_enhanced_designs(df_enhanced, json_dir, sop_json_filename)
df_enhanced_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 64 to 3249
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            87 non-null     int64  
 1   design_en            87 non-null     object 
 2   new_list_of_strings  87 non-null     object 
 3   completeness         0 non-null      float64
 4   relevance            87 non-null     int64  
 5   correctness          87 non-null     int64  
 6   comment_enh          87 non-null     object 
 7   list_of_strings      87 non-null     object 
dtypes: float64(1), int64(3), object(4)
memory usage: 6.1+ KB


**Create prompts and batches for enhance objects**
- reduce batchsize: because!

In [36]:
batch_size = 12
batch_stop = len(df_enhanced_filtered)//batch_size + 1
prompts_sop = prompts.find_subject_object_pairs_prompts(df_enhanced_filtered, batch_size)
scripts.calculate_total_tokens_and_price(prompts_sop, batch_start, batch_stop, batch=True)
batch_file = scripts.create_tasks_batch(prompts_sop, client, tmp_dir, step="1")

Token count for prompt 0: 2016, Price: $0.00504
Token count for prompt 1: 1811, Price: $0.00453
Token count for prompt 2: 1761, Price: $0.00440
Token count for prompt 3: 2078, Price: $0.00519
Token count for prompt 4: 1929, Price: $0.00482
Token count for prompt 5: 2101, Price: $0.00525
Token count for prompt 6: 2165, Price: $0.00541
Token count for prompt 7: 1415, Price: $0.00354
Total token count: 15276
Total input price: $0.03819
Created 8 tasks
Tasks saved to data/data/results/tmp/batchinput_1.jsonl


2024-09-01 17:52:42,382 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


Batch file uploaded: FileObject(id='file-l9LM5jWnqiJLVKGpvd1J81mX', bytes=69266, created_at=1725205962, filename='batchinput_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)


**¡¡¡ creates the batch job and sends them to the OpenAI API !!!**
- and also saves the batch job ID to a file

In [37]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Add job ID to file
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="1")

2024-09-01 17:52:48,535 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-09-01 17:52:48,540 - INFO - Job ID batch_Oyo0Jr8lOUlhtOnG2zasublU added to the file.


**Load the newest job ID from the file**


In [38]:
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="1")
except (FileNotFoundError, ValueError) as e:
    print(e)

2024-09-01 17:53:22,852 - INFO - The newest job ID for task 1 is: batch_Oyo0Jr8lOUlhtOnG2zasublU
2024-09-01 17:53:22,853 - INFO - Timestamp: 2024-09-01T17:52:48.540030+02:00


In [39]:
status_info = scripts.retrieve_batch_job_status(client, newest_job_id)
status_info

2024-09-01 17:53:26,187 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_Oyo0Jr8lOUlhtOnG2zasublU "HTTP/1.1 200 OK"
2024-09-01 17:53:26,189 - INFO - Batch Job Status: in_progress
2024-09-01 17:53:26,189 - INFO - Batch Job ID: batch_Oyo0Jr8lOUlhtOnG2zasublU
2024-09-01 17:53:26,190 - INFO - Input File ID: file-l9LM5jWnqiJLVKGpvd1J81mX
2024-09-01 17:53:26,190 - INFO - Request Counts: Completed: 5, Failed: 0, Total: 8


{'status': 'in_progress',
 'batch_job_id': 'batch_Oyo0Jr8lOUlhtOnG2zasublU',
 'input_file_id': 'file-l9LM5jWnqiJLVKGpvd1J81mX',
 'completed': 5,
 'failed': 0,
 'total': 8}

In [40]:
batch_job = client.batches.retrieve(newest_job_id)
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_sop = scripts.parse_and_clean_batch_responses(result)

df_sop_merged = df_responses_sop.merge(
    df_enhanced_filtered, 
    on=['design_id'], 
    how='left')

df_sop_merged.info()

2024-09-01 17:53:29,368 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_Oyo0Jr8lOUlhtOnG2zasublU "HTTP/1.1 200 OK"
2024-09-01 17:53:29,816 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-rVSKeyoNzyNlw7U4WPGczClN/content "HTTP/1.1 200 OK"
2024-09-01 17:53:29,935 - INFO - Parsed 8 JSON objects.
2024-09-01 17:53:29,939 - INFO - Cleaned 163 responses.
2024-09-01 17:53:29,942 - INFO - DataFrame created from cleaned responses.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            163 non-null    int64  
 1   s_o_id               163 non-null    object 
 2   s                    163 non-null    object 
 3   subject_class        163 non-null    object 
 4   o                    163 non-null    object 
 5   object_class         163 non-null    object 
 6   design_en            163 non-null    object 
 7   new_list_of_strings  163 non-null    object 
 8   completeness         0 non-null      float64
 9   relevance            163 non-null    int64  
 10  correctness          163 non-null    int64  
 11  comment_enh          163 non-null    object 
 12  list_of_strings      163 non-null    object 
dtypes: float64(1), int64(3), object(9)
memory usage: 16.7+ KB


In [42]:
# save df_sop_merged to json file in tmp
df_sop_merged.to_json(Path(tmp_dir) / "sop_before_val_temp_save_20240701_13h52m.json", orient='records')


In [43]:
#load the tmp saved file
df_sop_merged = pd.read_json(Path(tmp_dir) / "sop_before_val_temp_save_20240701_13h52m.json")
df_sop_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            163 non-null    int64  
 1   s_o_id               163 non-null    object 
 2   s                    163 non-null    object 
 3   subject_class        163 non-null    object 
 4   o                    163 non-null    object 
 5   object_class         163 non-null    object 
 6   design_en            163 non-null    object 
 7   new_list_of_strings  163 non-null    object 
 8   completeness         0 non-null      float64
 9   relevance            163 non-null    int64  
 10  correctness          163 non-null    int64  
 11  comment_enh          163 non-null    object 
 12  list_of_strings      163 non-null    object 
dtypes: float64(1), int64(3), object(9)
memory usage: 16.7+ KB


### Step 1.1 Validate and Classify Object Subject

In [44]:
print(f"Batch start {batch_start}, Batch stop {batch_stop}, Batch size {batch_size}")

Batch start 0, Batch stop 8, Batch size 12


In [45]:
factor = 2
batch_size = batch_size * factor
# batch_start = 0 
# batch_stop = batch_stop // factor
# print(f"Batch start {batch_start}, Batch stop {batch_stop}, Batch size {batch_size}")

In [46]:
prompts_validate_sop = prompts.validate_subject_object_pairs(df_sop_merged, batch_size)
scripts.calculate_total_tokens_and_price(prompts_validate_sop, batch_start, batch_stop, batch=True)
batch_file = scripts.create_tasks_batch(prompts_validate_sop, client, tmp_dir, step="1_1")

Token count for prompt 0: 2499, Price: $0.00625
Token count for prompt 1: 2224, Price: $0.00556
Token count for prompt 2: 2821, Price: $0.00705
Token count for prompt 3: 2407, Price: $0.00602
Token count for prompt 4: 2731, Price: $0.00683
Token count for prompt 5: 2649, Price: $0.00662
Token count for prompt 6: 2223, Price: $0.00556
Total token count: 17554
Total input price: $0.04389
Created 7 tasks
Tasks saved to data/data/results/tmp/batchinput_1_1.jsonl


2024-09-01 17:54:17,132 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


Batch file uploaded: FileObject(id='file-6I39AOyEmhNhwj25B3pTAm3l', bytes=82241, created_at=1725206056, filename='batchinput_1_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [47]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Add job ID to file
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="1_1")

2024-09-01 17:54:20,236 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-09-01 17:54:20,241 - INFO - Job ID batch_lkWx1aUHcdJ9ApUcO3owojyB added to the file.


In [48]:
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="1_1")
except (FileNotFoundError, ValueError) as e:
    print(e)

2024-09-01 17:54:23,043 - INFO - The newest job ID for task 1_1 is: batch_lkWx1aUHcdJ9ApUcO3owojyB
2024-09-01 17:54:23,043 - INFO - Timestamp: 2024-09-01T17:54:20.240028+02:00


In [49]:
batch_job = client.batches.retrieve(newest_job_id)
batch_job


2024-09-01 17:54:27,088 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_lkWx1aUHcdJ9ApUcO3owojyB "HTTP/1.1 200 OK"


Batch(id='batch_lkWx1aUHcdJ9ApUcO3owojyB', completion_window='24h', created_at=1725206060, endpoint='/v1/chat/completions', input_file_id='file-6I39AOyEmhNhwj25B3pTAm3l', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725292460, failed_at=None, finalizing_at=None, in_progress_at=1725206061, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=7))

In [50]:
print(client.batches.list())
# list batches if status is not processed
for batch in client.batches.list():
    if batch.status == "in_progress":
        print(batch)

# client.batches.cancel(newest_job_id)

2024-09-01 17:54:37,272 - INFO - HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


SyncCursorPage[Batch](data=[Batch(id='batch_lkWx1aUHcdJ9ApUcO3owojyB', completion_window='24h', created_at=1725206060, endpoint='/v1/chat/completions', input_file_id='file-6I39AOyEmhNhwj25B3pTAm3l', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725292460, failed_at=None, finalizing_at=None, in_progress_at=1725206061, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=1, failed=0, total=7)), Batch(id='batch_Oyo0Jr8lOUlhtOnG2zasublU', completion_window='24h', created_at=1725205968, endpoint='/v1/chat/completions', input_file_id='file-l9LM5jWnqiJLVKGpvd1J81mX', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1725206008, error_file_id=None, errors=None, expired_at=None, expires_at=1725292368, failed_at=None, finalizing_at=1725206007, in_progress_at=1725205969, metadata=None, output_file_id='file-rVSKeyoNzyNlw7U4

2024-09-01 17:54:37,728 - INFO - HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch(id='batch_lkWx1aUHcdJ9ApUcO3owojyB', completion_window='24h', created_at=1725206060, endpoint='/v1/chat/completions', input_file_id='file-6I39AOyEmhNhwj25B3pTAm3l', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725292460, failed_at=None, finalizing_at=None, in_progress_at=1725206061, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=2, failed=0, total=7))


2024-09-01 17:54:38,432 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?after=batch_hykzd34wdQG7t3poHPXeqiwF "HTTP/1.1 200 OK"
2024-09-01 17:54:38,775 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?after=batch_TsAnvlVBrHwbBvypUAvuLUPW "HTTP/1.1 200 OK"
2024-09-01 17:54:38,990 - INFO - HTTP Request: GET https://api.openai.com/v1/batches?after=batch_x7Y2FTHkl3cYeXxgInNdOH5A "HTTP/1.1 200 OK"


In [53]:
scripts.retrieve_batch_job_status(client, newest_job_id)

2024-09-01 17:57:16,607 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_lkWx1aUHcdJ9ApUcO3owojyB "HTTP/1.1 200 OK"
2024-09-01 17:57:16,610 - INFO - Batch Job Status: completed
2024-09-01 17:57:16,611 - INFO - Batch Job ID: batch_lkWx1aUHcdJ9ApUcO3owojyB
2024-09-01 17:57:16,611 - INFO - Input File ID: file-6I39AOyEmhNhwj25B3pTAm3l
2024-09-01 17:57:16,612 - INFO - Request Counts: Completed: 7, Failed: 0, Total: 7


{'status': 'completed',
 'batch_job_id': 'batch_lkWx1aUHcdJ9ApUcO3owojyB',
 'input_file_id': 'file-6I39AOyEmhNhwj25B3pTAm3l',
 'completed': 7,
 'failed': 0,
 'total': 7}

In [54]:
batch_job = client.batches.retrieve(newest_job_id)
batch_job

2024-09-01 17:57:19,673 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_lkWx1aUHcdJ9ApUcO3owojyB "HTTP/1.1 200 OK"


Batch(id='batch_lkWx1aUHcdJ9ApUcO3owojyB', completion_window='24h', created_at=1725206060, endpoint='/v1/chat/completions', input_file_id='file-6I39AOyEmhNhwj25B3pTAm3l', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1725206122, error_file_id=None, errors=None, expired_at=None, expires_at=1725292460, failed_at=None, finalizing_at=1725206120, in_progress_at=1725206061, metadata=None, output_file_id='file-T5qOXIJNAyykauA1oiFMYIR9', request_counts=BatchRequestCounts(completed=7, failed=0, total=7))

In [55]:
batch_job = client.batches.retrieve(newest_job_id)
# client.files.delete(batch_job.input_file_id)

result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_val_sop = scripts.parse_and_clean_batch_responses(result)

df_sop_validated = df_responses_val_sop.merge(
    df_sop_merged, 
    on=['design_id', 's_o_id'], 
    how='left')

df_sop_validated.info()

2024-09-01 17:57:21,872 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_lkWx1aUHcdJ9ApUcO3owojyB "HTTP/1.1 200 OK"
2024-09-01 17:57:22,339 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-T5qOXIJNAyykauA1oiFMYIR9/content "HTTP/1.1 200 OK"
2024-09-01 17:57:22,461 - INFO - Parsed 7 JSON objects.
2024-09-01 17:57:22,464 - INFO - Cleaned 163 responses.
2024-09-01 17:57:22,467 - INFO - DataFrame created from cleaned responses.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            163 non-null    int64  
 1   s_o_id               163 non-null    object 
 2   validity_sop         163 non-null    int64  
 3   comment_sop          163 non-null    object 
 4   s                    163 non-null    object 
 5   subject_class        163 non-null    object 
 6   o                    163 non-null    object 
 7   object_class         163 non-null    object 
 8   design_en            163 non-null    object 
 9   new_list_of_strings  163 non-null    object 
 10  completeness         0 non-null      float64
 11  relevance            163 non-null    int64  
 12  correctness          163 non-null    int64  
 13  comment_enh          163 non-null    object 
 14  list_of_strings      163 non-null    object 
dtypes: float64(1), int64(4), object(10)
memo

In [56]:
columns = ['design_id', 's_o_id', 's', 'subject_class', 'o', 'object_class', 
           'validity_sop', 'comment_sop', 'design_en', 'new_list_of_strings', 
           'relevance', 'correctness', 'comment_enh', 'list_of_strings'
           ]
scripts.update_json_with_merged_df(df_sop_validated, columns, json_dir, sop_json_filename)

### Step 2: Combine Subject-Predicate-Object
- **Input:** Design description, subject-object pairs, and possible predicates.
- **Output:** List of subject-predicate-object triples.

In [57]:
df_sop = pd.read_json(Path(json_dir) / sop_json_filename)
df_sop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8861 entries, 0 to 8860
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            8861 non-null   int64  
 1   s_o_id               8861 non-null   object 
 2   s                    8861 non-null   object 
 3   subject_class        8861 non-null   object 
 4   o                    8861 non-null   object 
 5   object_class         8861 non-null   object 
 6   validity_sop         8861 non-null   int64  
 7   comment_sop          8861 non-null   object 
 8   design_en            8861 non-null   object 
 9   new_list_of_strings  8861 non-null   object 
 10  completeness         157 non-null    float64
 11  relevance            8861 non-null   int64  
 12  correctness          8861 non-null   int64  
 13  comment_enh          8861 non-null   object 
 14  list_of_strings      8861 non-null   object 
dtypes: float64(1), int64(4), object(10)
me

In [58]:
# count all entries in df_sop, for which validity is one value for each different value
df_sop['validity_sop'].value_counts()

validity_sop
 1    7733
 0     676
-1     452
Name: count, dtype: int64

In [59]:
df_sop_filtered = scripts.filter_sop_dataframe(df_sop, json_dir, pred_json_filename)
df_sop_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 438 entries, 10 to 8860
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            438 non-null    int64  
 1   s_o_id               438 non-null    object 
 2   s                    438 non-null    object 
 3   subject_class        438 non-null    object 
 4   o                    438 non-null    object 
 5   object_class         438 non-null    object 
 6   validity_sop         438 non-null    int64  
 7   comment_sop          438 non-null    object 
 8   design_en            438 non-null    object 
 9   new_list_of_strings  438 non-null    object 
 10  completeness         10 non-null     float64
 11  relevance            438 non-null    int64  
 12  correctness          438 non-null    int64  
 13  comment_enh          438 non-null    object 
 14  list_of_strings      438 non-null    object 
dtypes: float64(1), int64(4), object(10)
memory 

In [60]:
# filter all 'Null' values in 's' and 'o'
df_sop_filtered = df_sop_filtered[df_sop_filtered['o'] != 'NULL']
print(df_sop_filtered['validity_sop'].value_counts())
df_sop_filtered.info()  

validity_sop
 1    122
 0      9
-1      5
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, 8698 to 8860
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            136 non-null    int64  
 1   s_o_id               136 non-null    object 
 2   s                    136 non-null    object 
 3   subject_class        136 non-null    object 
 4   o                    136 non-null    object 
 5   object_class         136 non-null    object 
 6   validity_sop         136 non-null    int64  
 7   comment_sop          136 non-null    object 
 8   design_en            136 non-null    object 
 9   new_list_of_strings  136 non-null    object 
 10  completeness         0 non-null      float64
 11  relevance            136 non-null    int64  
 12  correctness          136 non-null    int64  
 13  comment_enh          136 non-null    object 
 14  list_of_strings      1

In [61]:
batch_size = 32
batch_stop = len(df_sop_filtered)//batch_size + 1
prompts_pred = prompts.find_predicates_prompts(df_sop_filtered, batch_size)
scripts.calculate_total_tokens_and_price(prompts_pred, batch_start, batch_stop, batch=True)
batch_file = scripts.create_tasks_batch(prompts_pred, client, tmp_dir, step="2")

Token count for prompt 0: 3000, Price: $0.00750
Token count for prompt 1: 2608, Price: $0.00652
Token count for prompt 2: 2875, Price: $0.00719
Token count for prompt 3: 2918, Price: $0.00730
Token count for prompt 4: 1126, Price: $0.00282
Total token count: 12527
Total input price: $0.03132
Created 5 tasks
Tasks saved to data/data/results/tmp/batchinput_2.jsonl


2024-09-01 17:57:38,943 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


Batch file uploaded: FileObject(id='file-x9Oh66XExHhIXOVfURAkLHIi', bytes=64884, created_at=1725206258, filename='batchinput_2.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [62]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Add job ID to file
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="2")

2024-09-01 17:57:42,023 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-09-01 17:57:42,029 - INFO - Job ID batch_7udL2QTDqFe1UfgD9MEegriw added to the file.


In [63]:
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="2")
except (FileNotFoundError, ValueError) as e:
    print(e)

2024-09-01 17:57:44,442 - INFO - The newest job ID for task 2 is: batch_7udL2QTDqFe1UfgD9MEegriw
2024-09-01 17:57:44,443 - INFO - Timestamp: 2024-09-01T17:57:42.029072+02:00


In [67]:
scripts.retrieve_batch_job_status(client, newest_job_id)

2024-09-01 18:01:05,307 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_7udL2QTDqFe1UfgD9MEegriw "HTTP/1.1 200 OK"
2024-09-01 18:01:05,309 - INFO - Batch Job Status: completed
2024-09-01 18:01:05,310 - INFO - Batch Job ID: batch_7udL2QTDqFe1UfgD9MEegriw
2024-09-01 18:01:05,310 - INFO - Input File ID: file-x9Oh66XExHhIXOVfURAkLHIi
2024-09-01 18:01:05,310 - INFO - Request Counts: Completed: 5, Failed: 0, Total: 5


{'status': 'completed',
 'batch_job_id': 'batch_7udL2QTDqFe1UfgD9MEegriw',
 'input_file_id': 'file-x9Oh66XExHhIXOVfURAkLHIi',
 'completed': 5,
 'failed': 0,
 'total': 5}

In [68]:
df_sop_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, 8698 to 8860
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            136 non-null    int64  
 1   s_o_id               136 non-null    object 
 2   s                    136 non-null    object 
 3   subject_class        136 non-null    object 
 4   o                    136 non-null    object 
 5   object_class         136 non-null    object 
 6   validity_sop         136 non-null    int64  
 7   comment_sop          136 non-null    object 
 8   design_en            136 non-null    object 
 9   new_list_of_strings  136 non-null    object 
 10  completeness         0 non-null      float64
 11  relevance            136 non-null    int64  
 12  correctness          136 non-null    int64  
 13  comment_enh          136 non-null    object 
 14  list_of_strings      136 non-null    object 
dtypes: float64(1), int64(4), object(10)
memor

In [69]:
batch_job = client.batches.retrieve(newest_job_id)
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_pred = scripts.parse_and_clean_batch_responses(result)

df_pred_merged = df_responses_pred.merge(
    df_sop_filtered, 
    on=['design_id', 's_o_id'], 
    how='left')

df_pred_merged.info()

2024-09-01 18:01:10,174 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_7udL2QTDqFe1UfgD9MEegriw "HTTP/1.1 200 OK"
2024-09-01 18:01:10,578 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-sWX7ln0atWztPNwBWtH3vtn1/content "HTTP/1.1 200 OK"
2024-09-01 18:01:10,703 - INFO - Parsed 5 JSON objects.
2024-09-01 18:01:10,705 - INFO - Cleaned 136 responses.
2024-09-01 18:01:10,706 - INFO - DataFrame created from cleaned responses.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            136 non-null    int64  
 1   s_o_id               136 non-null    object 
 2   predicate            136 non-null    object 
 3   s                    136 non-null    object 
 4   subject_class        136 non-null    object 
 5   o                    136 non-null    object 
 6   object_class         136 non-null    object 
 7   validity_sop         136 non-null    int64  
 8   comment_sop          136 non-null    object 
 9   design_en            136 non-null    object 
 10  new_list_of_strings  136 non-null    object 
 11  completeness         0 non-null      float64
 12  relevance            136 non-null    int64  
 13  correctness          136 non-null    int64  
 14  comment_enh          136 non-null    object 
 15  list_of_strings      136 non-null    obj

## Step 2.1: Validate and Classify Extractes Relations
- **Input:** List of subject-predicate-object triples.
- **Output:** Validated and classified relations, marked as "added predicates" or "used predicates in design".

#### Notes
- Avoid/Filter predicates which a in the text, and a valid relation, but not in the design description.
- Example 28/27
    - Antoninus Pius	wearing	Wreath

In [70]:
df_pred_merged.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            136 non-null    int64  
 1   s_o_id               136 non-null    object 
 2   predicate            136 non-null    object 
 3   s                    136 non-null    object 
 4   subject_class        136 non-null    object 
 5   o                    136 non-null    object 
 6   object_class         136 non-null    object 
 7   validity_sop         136 non-null    int64  
 8   comment_sop          136 non-null    object 
 9   design_en            136 non-null    object 
 10  new_list_of_strings  136 non-null    object 
 11  completeness         0 non-null      float64
 12  relevance            136 non-null    int64  
 13  correctness          136 non-null    int64  
 14  comment_enh          136 non-null    object 
 15  list_of_strings      136 non-null    obj

In [71]:
print(f"Batch size: {batch_size}")

Batch size: 32


In [72]:
prompts_validate_pred = prompts.validate_spo_triples(df_pred_merged, batch_size)
scripts.calculate_total_tokens_and_price(prompts_validate_pred, batch_start, batch_stop, batch=True)
batch_file = scripts.create_tasks_batch(prompts_validate_pred, client, tmp_dir, step="2_1")

Token count for prompt 0: 3265, Price: $0.00816
Token count for prompt 1: 2875, Price: $0.00719
Token count for prompt 2: 3144, Price: $0.00786
Token count for prompt 3: 3189, Price: $0.00797
Token count for prompt 4: 1275, Price: $0.00319
Total token count: 13748
Total input price: $0.03437
Created 5 tasks
Tasks saved to data/data/results/tmp/batchinput_2_1.jsonl


2024-09-01 18:01:18,714 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"


Batch file uploaded: FileObject(id='file-5RCdD2b0ir2Ium38aGRrqv9T', bytes=72483, created_at=1725206478, filename='batchinput_2_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [73]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

# Add job ID to file
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="2_1")

2024-09-01 18:01:21,889 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-09-01 18:01:21,893 - INFO - Job ID batch_cIlxJXQ7OHnIRpN7148ghBrO added to the file.


In [74]:
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="2_1")
except (FileNotFoundError, ValueError) as e:
    print(e)

2024-09-01 18:01:24,819 - INFO - The newest job ID for task 2_1 is: batch_cIlxJXQ7OHnIRpN7148ghBrO
2024-09-01 18:01:24,820 - INFO - Timestamp: 2024-09-01T18:01:21.892881+02:00


In [76]:
scripts.retrieve_batch_job_status(client, newest_job_id)

2024-09-01 18:02:32,268 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_cIlxJXQ7OHnIRpN7148ghBrO "HTTP/1.1 200 OK"
2024-09-01 18:02:32,271 - INFO - Batch Job Status: completed
2024-09-01 18:02:32,272 - INFO - Batch Job ID: batch_cIlxJXQ7OHnIRpN7148ghBrO
2024-09-01 18:02:32,272 - INFO - Input File ID: file-5RCdD2b0ir2Ium38aGRrqv9T
2024-09-01 18:02:32,273 - INFO - Request Counts: Completed: 5, Failed: 0, Total: 5


{'status': 'completed',
 'batch_job_id': 'batch_cIlxJXQ7OHnIRpN7148ghBrO',
 'input_file_id': 'file-5RCdD2b0ir2Ium38aGRrqv9T',
 'completed': 5,
 'failed': 0,
 'total': 5}

In [77]:
batch_job = client.batches.retrieve(newest_job_id)
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_val_pred = scripts.parse_and_clean_batch_responses(result)

df_pred_validated = df_responses_val_pred.merge(
    df_pred_merged, 
    on=['design_id', 's_o_id'], 
    how='left')

df_pred_validated.info()

2024-09-01 18:02:35,541 - INFO - HTTP Request: GET https://api.openai.com/v1/batches/batch_cIlxJXQ7OHnIRpN7148ghBrO "HTTP/1.1 200 OK"
2024-09-01 18:02:36,020 - INFO - HTTP Request: GET https://api.openai.com/v1/files/file-x09vmsLRDYF9YeRk79jRbWdW/content "HTTP/1.1 200 OK"
2024-09-01 18:02:36,144 - INFO - Parsed 5 JSON objects.
2024-09-01 18:02:36,149 - INFO - Cleaned 136 responses.
2024-09-01 18:02:36,155 - INFO - DataFrame created from cleaned responses.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            136 non-null    int64  
 1   s_o_id               136 non-null    object 
 2   validity_pred        136 non-null    int64  
 3   comment_pred         136 non-null    object 
 4   implicit_pred        136 non-null    object 
 5   predicate            136 non-null    object 
 6   s                    136 non-null    object 
 7   subject_class        136 non-null    object 
 8   o                    136 non-null    object 
 9   object_class         136 non-null    object 
 10  validity_sop         136 non-null    int64  
 11  comment_sop          136 non-null    object 
 12  design_en            136 non-null    object 
 13  new_list_of_strings  136 non-null    object 
 14  completeness         0 non-null      float64
 15  relevance            136 non-null    int

In [78]:
columns = ['design_id', 's_o_id', 's', 'subject_class', 'predicate', 'o', 'object_class', 
           "validity_pred", "comment_pred", "implicit_pred", 
           'validity_sop', 'comment_sop', 'design_en', 'new_list_of_strings', 
           'relevance', 'correctness', 'comment_enh', 'list_of_strings'
           ]

scripts.update_json_with_merged_df(df_pred_validated, columns, json_dir, pred_json_filename)

In [79]:
df_pred_validated['validity_pred'].value_counts()

validity_pred
 1    81
-1    45
 0    10
Name: count, dtype: int64

In [80]:
df_pred_val = df_pred_validated.copy()
df_pred_val = df_pred_val[df_pred_val['predicate'] != 'NULL']
df_pred_val

Unnamed: 0,design_id,s_o_id,validity_pred,comment_pred,implicit_pred,predicate,s,subject_class,o,object_class,validity_sop,comment_sop,design_en,new_list_of_strings,completeness,relevance,correctness,comment_enh,list_of_strings
3,8,a,1,Correct and meaningful SPO triple.,,on,amphora,OBJECT,stand,OBJECT,1,Correct and meaningful pair.,Prize amphora on ornamental stand; within linear square and incuse square.,"[[amphora, OBJECT], [stand, OBJECT]]",,1,1,The enhanced list includes the main objects of the design.,"[[amphora, OBJECT]]"
4,9,a,-1,Predicate 'holding' is not plausible for an amphora.,decorated_with,holding,Amphora,OBJECT,corn,PLANT,1,Correct and meaningful pair.,Amphora with ribbed surface and crooked handleholding two corn and poppy.,"[[Amphora, OBJECT], [corn, PLANT], [poppy, PLANT], [handle, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by including 'handle'.,"[[Amphora, OBJECT], [corn, PLANT], [poppy, PLANT]]"
5,9,b,-1,Predicate 'holding' is not plausible for an amphora.,decorated_with,holding,Amphora,OBJECT,poppy,PLANT,1,Correct and meaningful pair.,Amphora with ribbed surface and crooked handleholding two corn and poppy.,"[[Amphora, OBJECT], [corn, PLANT], [poppy, PLANT], [handle, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by including 'handle'.,"[[Amphora, OBJECT], [corn, PLANT], [poppy, PLANT]]"
7,10,a,1,Correct and meaningful SPO triple.,,wearing,Anchialos,PERSON,taenia,OBJECT,1,Correct and meaningful pair.,"Bust of youthful Anchialos, right, wearing taenia. Border of dots.","[[Anchialos, PERSON], [taenia, OBJECT], [Bust, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[Bust, OBJECT], [Anchialos, PERSON], [taenia, OBJECT]]"
9,11,a,1,Correct and meaningful SPO triple.,,under,anchor,OBJECT,crayfish,ANIMAL,1,Correct and meaningful pair.,"Inverted anchor; under left fluke, crayfish, under right fluke, ethnicon.","[[anchor, OBJECT], [crayfish, ANIMAL], [ethnicon, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[anchor, OBJECT], [crayfish, ANIMAL]]"
10,11,b,1,Correct and meaningful SPO triple.,,under,anchor,OBJECT,ethnicon,OBJECT,1,Correct and meaningful pair.,"Inverted anchor; under left fluke, crayfish, under right fluke, ethnicon.","[[anchor, OBJECT], [crayfish, ANIMAL], [ethnicon, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[anchor, OBJECT], [crayfish, ANIMAL]]"
11,12,a,1,Correct and meaningful SPO triple.,,under,anchor,OBJECT,crayfish,ANIMAL,1,Correct and meaningful pair.,"Inverted anchor; under left fluke, crayfish, under right fluke, ethnicon; all within circular incuse.","[[anchor, OBJECT], [crayfish, ANIMAL], [ethnicon, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[anchor, OBJECT], [crayfish, ANIMAL]]"
12,12,b,1,Correct and meaningful SPO triple.,,under,anchor,OBJECT,ethnicon,OBJECT,1,Correct and meaningful pair.,"Inverted anchor; under left fluke, crayfish, under right fluke, ethnicon; all within circular incuse.","[[anchor, OBJECT], [crayfish, ANIMAL], [ethnicon, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[anchor, OBJECT], [crayfish, ANIMAL]]"
13,14,a,1,Correct and meaningful SPO triple.,,under,anchor,OBJECT,crayfish,ANIMAL,1,Correct and meaningful pair.,"Inverted anchor; under right fluke, crayfish, under left fluke, ethnicon.","[[anchor, OBJECT], [crayfish, ANIMAL], [ethnicon, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[anchor, OBJECT], [crayfish, ANIMAL]]"
14,14,b,1,Correct and meaningful SPO triple.,,under,anchor,OBJECT,ethnicon,OBJECT,1,Correct and meaningful pair.,"Inverted anchor; under right fluke, crayfish, under left fluke, ethnicon.","[[anchor, OBJECT], [crayfish, ANIMAL], [ethnicon, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[anchor, OBJECT], [crayfish, ANIMAL]]"
