In [1]:
import sys
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
import warnings

# Load environment variables
_ = load_dotenv(find_dotenv())

# Append the submodules path to the local libs directory
repo_dir = Path().resolve()
sys.path.append(str(repo_dir / 'libs'))

# Ensure the symlink exists (assuming setup_symlink.py has been executed)
symlink_path = repo_dir / 'libs' / 'NLP_on_multilingual_coin_datasets'
if not symlink_path.exists():
    print(f"Error: Symlink {symlink_path} does not exist. Run setup_symlink.py first.")
    sys.exit(1)

# Import the custom modules after ensuring symlink is in place
from NLP_on_multilingual_coin_datasets.cnt.io import Database_Connection
from modules.loading_preprocessed_designs import PreprocessingConfig, LoadingPreprocessedDesigns
from modules import scripts, prompts

# Set up pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

# Suppress warnings
warnings.filterwarnings('ignore')

# Access the OpenAI API key from environment variables
api_key = os.getenv('OPENAI_API_KEY')

prep_cfg = PreprocessingConfig





Define preprocessing config varialbes else default values will be used.

Define filenames and paths

In [2]:
json_dir = prep_cfg.json_path
enhanced_json_filename = "enhanced_objects.json"
sop_json_filename = "subject_object_pairs.json"
pred_json_filename = "subject_predicate_object_triples.json"

Define database connection parameters or set them as environment variables.

In [3]:
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
database = prep_cfg.database

dc = Database_Connection(f"mysql+mysqlconnector://{db_user}:{db_password}@{db_host}:{db_port}/{database}")

# Load or preprocess data
- Check for the preprocessed designs.csv file in the `data` directory. 
    - If it does not exist, get the data from the database and preprocess it.
    - Else load the data from the file.
- Create Dataframe

In [6]:
# TODO test file creation if not exist, after adjustments 
lpd = LoadingPreprocessedDesigns(dc, prep_cfg)
df_designs = lpd.load_designs_csv_or_process_database()

2024-09-01 17:50:21,433 - INFO - Checking if file data/source/lists/csv/annotated_designs.csv exists.
2024-09-01 17:50:21,538 - INFO - Converting annotations to list.
2024-09-01 17:50:22,449 - INFO - File exists and was loaded.


# Prep Dataframe 

- create copy
- filter the columns id, design_en and annotations
    - respresenting only the preprocessed columns

In [7]:
df_designs_0 = df_designs.copy()
df_designs = df_designs[["id", "design_en", "annotations"]]
df_designs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22333 entries, 0 to 22332
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           22333 non-null  int64 
 1   design_en    22333 non-null  object
 2   annotations  22333 non-null  object
dtypes: int64(1), object(2)
memory usage: 523.6+ KB


**create strings from annotations**

In [8]:
df_designs["list_of_strings"] = df_designs.apply(scripts.generate_list_of_strings, axis=1)
df_designs.head(5)

Unnamed: 0,id,design_en,annotations,list_of_strings
0,1,"Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[(0, 6, OBJECT), (7, 11, OBJECT), (23, 42, PERSON), (48, 52, OBJECT)]","[(Diadem, OBJECT), (head, OBJECT), (Alexander the Great, PERSON), (horn, OBJECT)]"
1,6,"Altar, lighted and garland.","[(0, 5, OBJECT), (19, 26, OBJECT)]","[(Altar, OBJECT), (garland, OBJECT)]"
2,8,Prize amphora on ornamental stand; within linear square and incuse square.,"[(6, 13, OBJECT)]","[(amphora, OBJECT)]"
3,9,Amphora with ribbed surface and crooked handleholding two corn and poppy.,"[(0, 7, OBJECT), (58, 62, PLANT), (67, 72, PLANT)]","[(Amphora, OBJECT), (corn, PLANT), (poppy, PLANT)]"
4,10,"Bust of youthful Anchialos, right, wearing taenia. Border of dots.","[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OBJECT)]","[(Bust, OBJECT), (Anchialos, PERSON), (taenia, OBJECT)]"


**load RE examples from prepared JSON file**

### Define a subset of the data to be used for the testing of the implementation.
- 22332 rows

In [10]:
start = 0 
stop = 3275

df_designs_source = df_designs.iloc[start:stop].copy()
df_designs_source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3275 entries, 0 to 3274
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               3275 non-null   int64 
 1   design_en        3275 non-null   object
 2   annotations      3275 non-null   object
 3   list_of_strings  3275 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.5+ KB


# Step 0: Check for More Possible Subjects or Objects
- **Input:** Design description and list of strings (entities).
- **Output:** Identified subjects and objects categorized as PERSON, OBJECT, ANIMAL, PLANT.


In [11]:
df_designs_filtered = scripts.filter_source_dataframe(df_designs_source, json_dir, enhanced_json_filename)
df_designs_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75 entries, 3200 to 3274
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               75 non-null     int64 
 1   design_en        75 non-null     object
 2   annotations      75 non-null     object
 3   list_of_strings  75 non-null     object
dtypes: int64(1), object(3)
memory usage: 2.9+ KB


In [13]:
batch_size = 32
batch_start = 0
batch_stop = 1
client = OpenAI(api_key=api_key)

In [14]:
prompts_enhance = prompts.enhance_objects_in_designs(df_designs_filtered, batch_size)
scripts.calculate_total_tokens_and_price(
    prompts_enhance, batch_start, batch_stop, 
)

Token count for prompt 0: 3333, Price: $0.01666
Total token count: 3333
Total input price: $0.01666


(3333, 0.016665)

In [15]:
responses_list_enhanced = scripts.process_prompts(prompts_enhance, client, batch_start, batch_stop)

2024-09-01 17:51:59,054 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:52:13,796 - INFO - Token count for completion: 1331, Price: $0.01997
2024-09-01 17:52:13,798 - ERROR - JSON format error: Expecting ',' delimiter: line 5 column 1 (char 117)
2024-09-01 17:52:13,799 - INFO - JSON format error fixed.


In [16]:
df_responses_enhanced = pd.DataFrame(responses_list_enhanced, index=None)
df_responses_enhanced["design_id"] = df_responses_enhanced["design_id"].astype(int)

df_enhanced_merged = df_responses_enhanced.merge(df_designs_filtered[['id', 'design_en', 'list_of_strings']], 
                                       left_on='design_id', right_on='id', how='left'
                                       ).drop(columns='id')

df_enhanced_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   design_id            32 non-null     int64 
 1   new_list_of_strings  32 non-null     object
 2   design_en            32 non-null     object
 3   list_of_strings      32 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.1+ KB


### Step 0.1 Validate and Classify enhanced entities

In [17]:
prompts_validate_enhanced = prompts.validate_overall_objects_in_designs(df_enhanced_merged, batch_size)
scripts.calculate_total_tokens_and_price(
    prompts_validate_enhanced, batch_start, batch_stop, 
)

Token count for prompt 0: 4334, Price: $0.02167
Total token count: 4334
Total input price: $0.02167


(4334, 0.021670000000000002)

In [18]:
responses_list_validated_enh = scripts.process_prompts(prompts_validate_enhanced, client, batch_start, batch_stop)

2024-09-01 17:53:16,853 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:53:32,083 - INFO - Token count for completion: 1734, Price: $0.02601


In [19]:
df_responses_val_enhanced = pd.DataFrame(responses_list_validated_enh, index=None)
df_enhanced_validated = df_responses_val_enhanced.merge(df_enhanced_merged, on=['design_id'], how='left')

df_enhanced_validated.head(2)

Unnamed: 0,design_id,relevance,correctness,comment_enh,new_list_of_strings,design_en,list_of_strings
0,4289,1,1,The enhanced list includes all significant objects mentioned in the design description and correctly classifies them.,"[[grape, PLANT], [vine, PLANT], [branch, OBJECT]]",Bunch of grape on vine branch.,"[(grape, PLANT), (vine, PLANT), (branch, OBJECT)]"
1,4290,0,0,"The enhanced list is missing 'head', which is a significant part of the design description.","[[Dionysus, PERSON]]",Frontal head of Dionysus,"[(head, OBJECT), (Dionysus, PERSON)]"


In [20]:

columns = ['design_id', 'design_en', 'new_list_of_strings', 
           'completeness', 'relevance', 'correctness', 'comment_enh', 'list_of_strings']
scripts.update_json_with_merged_df(df_enhanced_validated, columns, json_dir, enhanced_json_filename)

### Step 1: Identify Subject-Object Pairs
- **Input:** Design description and categorized entities.
- **Output:** List of subject-object pairs.

In [21]:
df_enhanced = pd.read_json(Path(json_dir) / enhanced_json_filename)
df_enhanced.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3282 entries, 0 to 3281
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            3282 non-null   int64  
 1   design_en            3282 non-null   object 
 2   new_list_of_strings  3282 non-null   object 
 3   completeness         64 non-null     float64
 4   relevance            3282 non-null   int64  
 5   correctness          3282 non-null   int64  
 6   comment_enh          3282 non-null   object 
 7   list_of_strings      3282 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 205.2+ KB


In [22]:
# show all unique values for completeness, relevance, correctness
print(df_enhanced['completeness'].value_counts(),
df_enhanced['relevance'].value_counts(),
df_enhanced['correctness'].value_counts())

completeness
1.0    45
0.0    19
Name: count, dtype: int64 relevance
1    3249
0      33
Name: count, dtype: int64 correctness
 1    3230
 0      48
-1       4
Name: count, dtype: int64


In [23]:
df_enhanced_filtered = scripts.filter_enhanced_designs(df_enhanced, json_dir, sop_json_filename)
df_enhanced_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119 entries, 64 to 3281
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            119 non-null    int64  
 1   design_en            119 non-null    object 
 2   new_list_of_strings  119 non-null    object 
 3   completeness         0 non-null      float64
 4   relevance            119 non-null    int64  
 5   correctness          119 non-null    int64  
 6   comment_enh          119 non-null    object 
 7   list_of_strings      119 non-null    object 
dtypes: float64(1), int64(3), object(4)
memory usage: 8.4+ KB


In [24]:
print(f"batch_start: {batch_start}, batch_stop: {batch_stop}")
print(f"batch_size: {batch_size}")
batch_size = 16
batch_start = 0
batch_stop = 2

batch_start: 0, batch_stop: 1
batch_size: 32


In [25]:
#TODO where is id 67 ????
prompts_sop = prompts.find_subject_object_pairs_prompts(df_enhanced_filtered, batch_size=batch_size)
scripts.calculate_total_tokens_and_price(
    prompts_sop, batch_start, batch_stop, 
)

Token count for prompt 0: 2244, Price: $0.01122
Token count for prompt 1: 1953, Price: $0.00977
Total token count: 4197
Total input price: $0.02099


(4197, 0.020985)

In [26]:
responses_list_sop = scripts.process_prompts(prompts_sop, client, batch_start, batch_stop)


2024-09-01 17:55:43,637 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:55:55,917 - INFO - Token count for completion: 1289, Price: $0.01934
2024-09-01 17:55:56,372 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:56:03,557 - INFO - Token count for completion: 835, Price: $0.01252


In [28]:
df_responses_sop = pd.DataFrame(responses_list_sop, index=None)
df_responses_sop["design_id"] = df_responses_sop["design_id"].astype(int)
# print(df_responses_sop.info())
# print(df_designs_filtered.info())
df_sop_merged = df_responses_sop.merge(df_enhanced_filtered, on=['design_id'], how='left')
# merged_df = merged_df.drop(columns='id')
df_sop_merged.head()

Unnamed: 0,design_id,s_o_id,s,subject_class,o,object_class,design_en,new_list_of_strings,completeness,relevance,correctness,comment_enh,list_of_strings
0,1,a,Alexander the Great,PERSON,horn,OBJECT,"Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[[Alexander the Great, PERSON], [horn, OBJECT], [Diadem, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head'.,"[[Diadem, OBJECT], [head, OBJECT], [Alexander the Great, PERSON], [horn, OBJECT]]"
1,1,b,Alexander the Great,PERSON,Diadem,OBJECT,"Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[[Alexander the Great, PERSON], [horn, OBJECT], [Diadem, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head'.,"[[Diadem, OBJECT], [head, OBJECT], [Alexander the Great, PERSON], [horn, OBJECT]]"
2,6,a,,,,,"Altar, lighted and garland.","[[Altar, OBJECT], [garland, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[Altar, OBJECT], [garland, OBJECT]]"
3,8,a,amphora,OBJECT,stand,OBJECT,Prize amphora on ornamental stand; within linear square and incuse square.,"[[amphora, OBJECT], [stand, OBJECT]]",,1,1,The enhanced list includes the main objects of the design.,"[[amphora, OBJECT]]"
4,9,a,Amphora,OBJECT,corn,PLANT,Amphora with ribbed surface and crooked handleholding two corn and poppy.,"[[Amphora, OBJECT], [corn, PLANT], [poppy, PLANT], [handle, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by including 'handle'.,"[[Amphora, OBJECT], [corn, PLANT], [poppy, PLANT]]"


In [29]:
df_sop_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            40 non-null     int64  
 1   s_o_id               40 non-null     object 
 2   s                    40 non-null     object 
 3   subject_class        40 non-null     object 
 4   o                    40 non-null     object 
 5   object_class         40 non-null     object 
 6   design_en            40 non-null     object 
 7   new_list_of_strings  40 non-null     object 
 8   completeness         0 non-null      float64
 9   relevance            40 non-null     int64  
 10  correctness          40 non-null     int64  
 11  comment_enh          40 non-null     object 
 12  list_of_strings      40 non-null     object 
dtypes: float64(1), int64(3), object(9)
memory usage: 4.2+ KB


### Step 1.1 Validate and Classify Object Subject

In [30]:
batch_size = 32
length_sop_merged = len(df_sop_merged)
print(length_sop_merged)
num_batches = (length_sop_merged + batch_size - 1) // batch_size

print(num_batches)
batch_stop = num_batches

prompts_validate_sop = prompts.validate_subject_object_pairs(df_sop_merged, batch_size)
scripts.calculate_total_tokens_and_price(
    prompts_validate_sop, batch_start, batch_stop, 
)

40
2
Token count for prompt 0: 2966, Price: $0.01483
Token count for prompt 1: 1279, Price: $0.00639
Total token count: 4245
Total input price: $0.02122


(4245, 0.021224999999999997)

In [31]:
responses_list_validated_sop = scripts.process_prompts(prompts_validate_sop, client, batch_start, batch_stop)

2024-09-01 17:57:03,210 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:57:17,080 - INFO - Token count for completion: 1465, Price: $0.02197
2024-09-01 17:57:17,439 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:57:20,689 - INFO - Token count for completion: 366, Price: $0.00549


In [32]:
df_responses_val_sop = pd.DataFrame(responses_list_validated_sop, index=None)
df_sop_validated = df_responses_val_sop.merge(df_sop_merged, on=['design_id', 's_o_id'], how='left')

df_sop_validated.head(2)

Unnamed: 0,design_id,s_o_id,validity_sop,comment_sop,s,subject_class,o,object_class,design_en,new_list_of_strings,completeness,relevance,correctness,comment_enh,list_of_strings
0,1,a,1,Correct and meaningful pair.,Alexander the Great,PERSON,horn,OBJECT,"Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[[Alexander the Great, PERSON], [horn, OBJECT], [Diadem, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head'.,"[[Diadem, OBJECT], [head, OBJECT], [Alexander the Great, PERSON], [horn, OBJECT]]"
1,1,b,1,Correct and meaningful pair.,Alexander the Great,PERSON,Diadem,OBJECT,"Diadem head of deified Alexander the Great with horn of Ammon, right. Border of dots.","[[Alexander the Great, PERSON], [horn, OBJECT], [Diadem, OBJECT]]",,1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head'.,"[[Diadem, OBJECT], [head, OBJECT], [Alexander the Great, PERSON], [horn, OBJECT]]"


In [33]:
columns = ['design_id', 's_o_id', 's', 'subject_class', 'o', 'object_class', 
           'validity_sop', 'comment_sop', 'design_en', 'new_list_of_strings', 
           'completeness', 'relevance', 'correctness', 'comment_enh', 'list_of_strings'
           ]
scripts.update_json_with_merged_df(df_sop_validated, columns, json_dir, sop_json_filename)

### Step 2: Combine Subject-Predicate-Object
- **Input:** Design description, subject-object pairs, and possible predicates.
- **Output:** List of subject-predicate-object triples.

In [34]:
df_sop = pd.read_json(Path(json_dir) / sop_json_filename)
df_sop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8901 entries, 0 to 8900
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            8901 non-null   int64  
 1   s_o_id               8901 non-null   object 
 2   s                    8901 non-null   object 
 3   subject_class        8901 non-null   object 
 4   o                    8901 non-null   object 
 5   object_class         8901 non-null   object 
 6   validity_sop         8901 non-null   int64  
 7   comment_sop          8901 non-null   object 
 8   design_en            8901 non-null   object 
 9   new_list_of_strings  8901 non-null   object 
 10  completeness         157 non-null    float64
 11  relevance            8901 non-null   int64  
 12  correctness          8901 non-null   int64  
 13  comment_enh          8901 non-null   object 
 14  list_of_strings      8901 non-null   object 
dtypes: float64(1), int64(4), object(10)
me

In [35]:
# count all entries in df_sop, for which validity is one value for each different value
df_sop['validity_sop'].value_counts()

validity_sop
 1    7752
 0     677
-1     472
Name: count, dtype: int64

In [36]:
df_sop_filtered = scripts.filter_sop_dataframe(df_sop, json_dir, pred_json_filename)
df_sop_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 10 to 8900
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   design_id            478 non-null    int64  
 1   s_o_id               478 non-null    object 
 2   s                    478 non-null    object 
 3   subject_class        478 non-null    object 
 4   o                    478 non-null    object 
 5   object_class         478 non-null    object 
 6   validity_sop         478 non-null    int64  
 7   comment_sop          478 non-null    object 
 8   design_en            478 non-null    object 
 9   new_list_of_strings  478 non-null    object 
 10  completeness         10 non-null     float64
 11  relevance            478 non-null    int64  
 12  correctness          478 non-null    int64  
 13  comment_enh          478 non-null    object 
 14  list_of_strings      478 non-null    object 
dtypes: float64(1), int64(4), object(10)
memory 

In [37]:
batch_size = 32
batch_start = 0
batch_stop = 2

In [38]:
prompts_pred = prompts.find_predicates_prompts(df_sop_filtered, batch_size)
scripts.calculate_total_tokens_and_price(
    prompts_pred, batch_start, batch_stop, 
)

Token count for prompt 0: 2400, Price: $0.01200
Token count for prompt 1: 2393, Price: $0.01196
Total token count: 4793
Total input price: $0.02397


(4793, 0.023965)

In [39]:
responses_list_pred = scripts.process_prompts(prompts_pred, client, batch_start, batch_stop)

2024-09-01 17:58:28,406 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:58:37,490 - INFO - Token count for completion: 912, Price: $0.01368
2024-09-01 17:58:37,928 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 17:58:50,285 - INFO - Token count for completion: 907, Price: $0.01361


In [40]:
df_responses_pred = pd.DataFrame(responses_list_pred, index=None)
df_pred_merged = df_sop_filtered.merge(df_responses_pred, 
                                                  on=['design_id','s_o_id'], how='left'
                                                  )
df_pred_merged.head(5)



Unnamed: 0,design_id,s_o_id,s,subject_class,o,object_class,validity_sop,comment_sop,design_en,new_list_of_strings,completeness,relevance,correctness,comment_enh,list_of_strings,predicate
0,1531,a,,,,,-1,Invalid pair as both subject and object are NULL.,"Head of bearded Heracles, left. Border of dots.","[[Heracles, PERSON]]",0.0,1,1,The enhanced list is missing 'head' which is mentioned in the design description.,"[[Head, OBJECT], [Heracles, PERSON]]",
1,1532,a,,,,,-1,Invalid pair as both subject and object are NULL.,Bull standing right.,"[[Bull, ANIMAL]]",1.0,1,1,The enhanced list includes the main object of the design.,"[[Bull, ANIMAL]]",standing
2,1534,a,,,,,-1,Invalid pair as both subject and object are NULL.,Upright club. Border of dots.,"[[club, OBJECT]]",1.0,1,1,The enhanced list includes the main object of the design.,"[[club, OBJECT]]",
3,1536,a,,,,,-1,Invalid pair as both subject and object are NULL.,Bull standing left. Ground line. Border of dots.,"[[Bull, ANIMAL]]",0.0,1,1,The enhanced list is missing 'Ground line' and 'Border of dots' which are mentioned in the design description.,"[[Bull, ANIMAL]]",standing
4,1540,a,,,,,-1,Invalid pair as both subject and object are NULL.,"Draped bust of Agrippina I, left. Border of dots.","[[Agrippina I, PERSON]]",1.0,1,1,The enhanced list correctly identifies 'Agrippina I' as the main subject of the design.,"[[Draped, OBJECT], [bust, OBJECT]]",


## Step 2.1: Validate and Classify Extractes Relations
- **Input:** List of subject-predicate-object triples.
- **Output:** Validated and classified relations, marked as "added predicates" or "used predicates in design".

#### Notes
- Avoid/Filter predicates which a in the text, and a valid relation, but not in the design description.
- Example 28/27
    - Antoninus Pius	wearing	Wreath

In [41]:
prompts_validate_pred = prompts.validate_spo_triples(df_pred_merged, batch_size)
scripts.calculate_total_tokens_and_price(
    prompts_validate_pred, batch_start, batch_stop, 
)

Token count for prompt 0: 2666, Price: $0.01333
Token count for prompt 1: 2664, Price: $0.01332
Total token count: 5330
Total input price: $0.02665


(5330, 0.02665)

In [42]:
responses_list_validated_pred = scripts.process_prompts(prompts_validate_pred, client, batch_start, batch_stop)

2024-09-01 18:02:27,356 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 18:02:50,048 - INFO - Token count for completion: 1662, Price: $0.02493
2024-09-01 18:02:50,697 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 18:03:12,385 - INFO - Token count for completion: 1812, Price: $0.02718


In [43]:
df_responses_val_pred = pd.DataFrame(responses_list_validated_pred, index=None)
print(df_responses_val_pred.head(5))
df_pred_validated = df_responses_val_pred.merge(df_pred_merged, on=['design_id', 's_o_id'], how='left')

df_pred_validated.head(2)

   design_id s_o_id  validity_pred                        comment_pred implicit_pred
0       1531      a             -1      NULL is not a valid predicate.       showing
1       1532      a              1  Correct and meaningful SPO triple.          NULL
2       1534      a             -1      NULL is not a valid predicate.       showing
3       1536      a              1  Correct and meaningful SPO triple.          NULL
4       1540      a             -1      NULL is not a valid predicate.       showing


Unnamed: 0,design_id,s_o_id,validity_pred,comment_pred,implicit_pred,s,subject_class,o,object_class,validity_sop,comment_sop,design_en,new_list_of_strings,completeness,relevance,correctness,comment_enh,list_of_strings,predicate
0,1531,a,-1,NULL is not a valid predicate.,showing,,,,,-1,Invalid pair as both subject and object are NULL.,"Head of bearded Heracles, left. Border of dots.","[[Heracles, PERSON]]",0.0,1,1,The enhanced list is missing 'head' which is mentioned in the design description.,"[[Head, OBJECT], [Heracles, PERSON]]",
1,1532,a,1,Correct and meaningful SPO triple.,,,,,,-1,Invalid pair as both subject and object are NULL.,Bull standing right.,"[[Bull, ANIMAL]]",1.0,1,1,The enhanced list includes the main object of the design.,"[[Bull, ANIMAL]]",standing


In [44]:
columns = ['design_id', 's_o_id', 's', 'subject_class', 'predicate', 'o', 'object_class', 
           "validity_pred", "comment_pred", "implicit_pred", 
           'validity_sop', 'comment_sop', 'design_en', 'new_list_of_strings', 
           'completeness', 'relevance', 'correctness', 'comment_enh', 'list_of_strings'
           ]

scripts.update_json_with_merged_df(df_pred_validated, columns, json_dir, pred_json_filename)

In [46]:
df_pred_validated = pd.read_json(Path(json_dir) / pred_json_filename)
df_pred_validated.head()

Unnamed: 0,design_id,s_o_id,s,subject_class,predicate,o,object_class,validity_pred,comment_pred,implicit_pred,validity_sop,comment_sop,design_en,new_list_of_strings,relevance,correctness,comment_enh,list_of_strings,completeness
0,1526,a,Apollo,PERSON,wearing,garment,OBJECT,1,Correct and meaningful SPO triple.,,1,Correct and meaningful pair.,"Apollo standing facing, head left,wearing long garment, holding lyre with left hand on a short column to his left.","[[Apollo, PERSON], [garment, OBJECT], [lyre, OBJECT], [column, OBJECT]]",1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head' and 'hand'.,"[[Apollo, PERSON], [head, OBJECT], [garment, OBJECT], [lyre, OBJECT], [hand, OBJECT], [column, OBJECT]]",1.0
1,1526,b,Apollo,PERSON,holding,lyre,OBJECT,1,Correct and meaningful SPO triple.,,1,Correct and meaningful pair.,"Apollo standing facing, head left,wearing long garment, holding lyre with left hand on a short column to his left.","[[Apollo, PERSON], [garment, OBJECT], [lyre, OBJECT], [column, OBJECT]]",1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head' and 'hand'.,"[[Apollo, PERSON], [head, OBJECT], [garment, OBJECT], [lyre, OBJECT], [hand, OBJECT], [column, OBJECT]]",1.0
2,1526,c,Apollo,PERSON,on,column,OBJECT,-1,Predicate 'on' is not plausible in this context.,standing_on,1,Correct and meaningful pair.,"Apollo standing facing, head left,wearing long garment, holding lyre with left hand on a short column to his left.","[[Apollo, PERSON], [garment, OBJECT], [lyre, OBJECT], [column, OBJECT]]",1,1,The enhanced list includes all significant objects mentioned in the design description and has improved relevance by excluding redundant 'head' and 'hand'.,"[[Apollo, PERSON], [head, OBJECT], [garment, OBJECT], [lyre, OBJECT], [hand, OBJECT], [column, OBJECT]]",1.0
3,1527,a,club,OBJECT,over,lion skin,OBJECT,1,Correct and meaningful SPO triple.,,1,Correct and meaningful pair.,"Upright club, lion skin to left over it.","[[club, OBJECT], [lion skin, OBJECT]]",1,1,The enhanced list includes the main objects of the design.,"[[club, OBJECT], [lion skin, OBJECT]]",1.0
4,1528,a,Apollo,PERSON,wearing,chlamys,OBJECT,1,Correct and meaningful SPO triple.,,1,Correct and meaningful pair.,"Wreath bust of Apollo, left, wearing chlamys; in front, bow. Border of dots.","[[Wreath, OBJECT], [bust, OBJECT], [Apollo, PERSON], [chlamys, OBJECT], [bow, OBJECT]]",1,1,The enhanced list includes all significant objects mentioned in the design description.,"[[Wreath, OBJECT], [bust, OBJECT], [Apollo, PERSON], [chlamys, OBJECT], [bow, OBJECT]]",1.0
