In [1]:
import os
import sys
import warnings
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

# Load environment variables from the nearest .env file (if one is found).
_ = load_dotenv(find_dotenv())

# Make the local `libs` directory importable. The membership check keeps
# sys.path from growing when this cell is re-run.
repo_dir = Path().resolve()
libs_dir = str(repo_dir / 'libs')
if libs_dir not in sys.path:
    sys.path.append(libs_dir)

# Ensure the symlink exists (assuming setup_symlink.py has been executed).
symlink_path = repo_dir / 'libs' / 'NLP_on_multilingual_coin_datasets'
if not symlink_path.exists():
    # Raise a regular exception instead of calling sys.exit(1): inside a
    # notebook kernel a normal exception gives a clear traceback and stops
    # "Run All" at this cell.
    raise FileNotFoundError(
        f"Error: Symlink {symlink_path} does not exist. Run setup_symlink.py first."
    )

# Import the custom modules only now: they can be resolved only after the
# `libs` path has been added and the symlink has been verified.
from NLP_on_multilingual_coin_datasets.cnt.io import Database_Connection
from modules.loading_preprocessed_designs import PreprocessingConfig, LoadingPreprocessedDesigns
from modules import scripts, prompts

# Set up pandas display options for better readability.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

# Suppress warnings
# warnings.filterwarnings('ignore')

# Access the OpenAI API key from environment variables (None if it is not set).
api_key = os.getenv('OPENAI_API_KEY')

# Project configuration object; later cells read paths and the database name from it.
prep_cfg = PreprocessingConfig()

Define filenames and paths

In [2]:
# Directories taken from the preprocessing configuration.
json_dir = prep_cfg.json_path
tmp_dir = prep_cfg.tmp_path

# Result files, one per pipeline stage.
enhanced_json_filename, sop_json_filename, pred_json_filename = (
    "enhanced_objects.json",
    "subject_object_pairs.json",
    "subject_predicate_object_triples.json",
)

# Temporary file that records the submitted batch job IDs.
job_ids_file_name = Path("batch_job_ids.json")
job_ids_file_path = tmp_dir / job_ids_file_name

Define database connection parameters or set them as environment variables.

In [3]:
# Read the connection settings from the environment.
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
database = prep_cfg.database

# Fail early with a clear message: an unset variable would otherwise be
# interpolated into the URL as the literal text "None".
_missing_db_vars = [
    name
    for name, value in (
        ('DB_USER', db_user),
        ('DB_PASSWORD', db_password),
        ('DB_HOST', db_host),
        ('DB_PORT', db_port),
    )
    if value is None
]
if _missing_db_vars:
    raise EnvironmentError(
        f"Missing database environment variables: {', '.join(_missing_db_vars)}"
    )

# URL-escape user and password so characters such as '@', ':' or '/' in the
# credentials cannot break the connection URL.
connection_string = (
    f"mysql+mysqlconnector://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database}"
)
dc = Database_Connection(connection_string)


# Load or preprocess data
- Check for the preprocessed designs.csv file in the `data` directory. 
    - If it does not exist, get the data from the database and preprocess it.
    - Else load the data from the file.
- Create Dataframe

In [4]:
# TODO: test file creation when the CSV does not exist, after adjustments.
# NOTE(review): the recorded output of this cell ends with
# "AttributeError: 'DataFrame' object has no attribute 'swifter'" — presumably
# the preprocessing code expects the `swifter` pandas accessor to be
# registered; confirm that `swifter` is installed and imported there.
lpd = LoadingPreprocessedDesigns(dc, prep_cfg)
# Per the logged messages, this checks for the annotated designs CSV and falls
# back to loading from the database and preprocessing when it is missing.
df_designs = lpd.load_designs_csv_or_process_database()

2024-09-02 18:46:16,603 - INFO - Checking if file data/source/lists/csv/annotated_designs.csv exists.
2024-09-02 18:46:16,605 - INFO - File does not exist. Loading from database and running preprocessing.
2024-09-02 18:46:16,605 - INFO - Starting preprocessing of designs.
2024-09-02 18:46:16,864 - INFO - package: mysql.connector.plugins
2024-09-02 18:46:16,865 - INFO - plugin_name: caching_sha2_password
2024-09-02 18:46:16,865 - INFO - AUTHENTICATION_PLUGIN_CLASS: MySQLCachingSHA2PasswordAuthPlugin
2024-09-02 18:46:17,105 - INFO - Loaded PERSON entities successfully.
2024-09-02 18:46:17,176 - INFO - Loaded OBJECT entities successfully.
2024-09-02 18:46:17,244 - INFO - Loaded ANIMAL entities successfully.
2024-09-02 18:46:17,312 - INFO - Loaded PLANT entities successfully.
2024-09-02 18:46:29,499 - INFO - Adding rules from entities.
Initializing Preprocess: 100%|██████████| 849/849 [00:00<00:00, 29478.18it/s]
2024-09-02 18:46:29,557 - INFO - Completed adding rules from entities.
2024-09

AttributeError: 'DataFrame' object has no attribute 'swifter'

# Prep Dataframe 

- create copy
- filter the columns id, design_en and annotations
    - representing only the preprocessed columns

In [None]:
# Keep an untouched copy of the full frame for inspection.
df_designs_0 = df_designs.copy()

# Keep only the columns needed downstream. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df_designs = df_designs[["id", "design_en", "annotations"]].copy()
df_designs.info()

In [None]:
# Display the first five rows of the unfiltered copy.
df_designs_0.head(5)

**create strings from annotations**

In [None]:
# Build the per-row list of strings with the project helper. `.assign`
# returns a new frame instead of writing into `df_designs` in place, which
# avoids SettingWithCopyWarning when `df_designs` is a column subset of
# another frame.
df_designs = df_designs.assign(
    list_of_strings=df_designs.apply(scripts.generate_list_of_strings, axis=1)
)
df_designs.head(5)

### Define a subset of the data to be used for the testing of the implementation.
- 22332 rows
- define start and stop
- only new ones will be processed, existing will be skipped with filtering.

In [None]:
# Positional row window (start inclusive, stop exclusive) used for this run.
start, stop = 0, 3250

# Independent copy of the selected rows.
row_window = slice(start, stop)
df_designs_source = df_designs.iloc[row_window].copy()
df_designs_source.info()

# Step 0: Check for More Possible Subjects or Objects
- **Input:** Design description and list of strings (entities).
- **Output:** Identified and verified subjects and objects categorized as PERSON, OBJECT, ANIMAL, PLANT.


**Filter source data for already computed datapoints.**

In [None]:
# Filter the source rows against the enhanced-objects JSON file
# (per the notebook text: skip datapoints that were already computed).
df_designs_filtered = scripts.filter_source_dataframe(
    df_designs_source,
    json_dir,
    enhanced_json_filename,
)
df_designs_filtered.info()

**Define the client and the batch size (not the OpenAI batch, but the number of datapoints processed in one prompt).**

In [None]:
# OpenAI client used by all following API calls.
client = OpenAI(api_key=api_key)

# Number of datapoints per prompt (not the OpenAI batch).
batch_size = 32

# Batch index range used for the token calculation.
batch_start = 0
n_full_batches = len(df_designs_filtered) // batch_size
batch_stop = n_full_batches + 1

**Create prompts and batches for enhance objects**

**Note:** The price calculation is only an example and covers only the input tokens, based on:
- gpt4o, 2024-July:
    - 5$/Million Token * 0.5 for batch API discount 

In [None]:
# Step 0 prompts, built from the filtered designs.
prompts_enhance = prompts.enhance_objects_in_designs(
    df_designs_filtered,
    batch_size,
)

# Token/price estimate for the selected batch range.
scripts.calculate_total_tokens_and_price(
    prompts_enhance,
    batch_start,
    batch_stop,
    batch=True,
)

# Task file for the OpenAI batch job of step "0".
batch_file = scripts.create_tasks_batch(
    prompts_enhance,
    client,
    tmp_dir,
    step="0",
)

**¡¡¡ creates the batch job and sends them to the OpenAI API !!!**
- and also saves the batch job ID to a file

In [None]:
# Submit the task file as an OpenAI batch job.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

# Record the job ID in the job-ID file so it survives a kernel restart.
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="0")

In [None]:
# Show the batch job object returned by the create call.
print(batch_job)


**Load the newest job ID from the file**
- For previous job IDs (with timestamps), check the file `batch_job_ids.json` in the temporary directory (`tmp_dir`).
- The idea behind this:
    - If the kernel is restarted, the job ID is still available and the job can be continued.

In [None]:
# Look up the most recent job ID recorded for step "0".
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="0")
except (FileNotFoundError, ValueError) as e:
    # Show the helper's message, then re-raise: continuing would leave
    # `newest_job_id` undefined or still pointing at a job from another step.
    print(e)
    raise

**Check the status of the job ID**

In [None]:
# Query the status of the newest step-0 batch job via the project helper.
status_info = scripts.retrieve_batch_job_status(client, newest_job_id)

**If the status is completed, load the results and proceed**

In [None]:
# Fetch the current state of the batch job and make sure results exist.
batch_job = client.batches.retrieve(newest_job_id)
if batch_job.status != "completed" or batch_job.output_file_id is None:
    # Fail early with a clear message instead of passing a missing file ID
    # to the Files API.
    raise RuntimeError(
        f"Batch job {newest_job_id} has no output yet (status: {batch_job.status}). "
        "Wait until the job is completed, then re-run this cell."
    )

# Download the raw output file and parse it with the project helper.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_enhanced = scripts.parse_and_clean_batch_responses(result)

# Left join: keep every response row and attach the design text and the
# original list of strings; the duplicate key column `id` is dropped.
df_enhanced_merged = df_responses_enhanced.merge(
    df_designs_filtered[['id', 'design_en', 'list_of_strings']],
    left_on='design_id',
    right_on='id',
    how='left'
).drop(columns='id')

df_enhanced_merged.info()


### Step 0.1 Validate and Classify enhanced entities

**Create prompts and batches for validation of enhanced objects**

In [None]:
# Step 0.1 prompts, built from the merged enhancement results.
prompts_validate_enhanced = prompts.validate_overall_objects_in_designs(df_enhanced_merged, batch_size)

# Recompute the upper batch index from the frame that is actually used here;
# otherwise the estimate relies on a value computed for a different frame.
batch_stop = len(df_enhanced_merged)//batch_size + 1
scripts.calculate_total_tokens_and_price(prompts_validate_enhanced, batch_start, batch_stop, batch=True)

# Task file for the OpenAI batch job of step "0_1".
batch_file = scripts.create_tasks_batch(prompts_validate_enhanced, client, tmp_dir, step="0_1")


**¡¡¡ creates the batch job and sends them to the OpenAI API !!!**
- and also saves the batch job ID to a file

In [None]:
# Submit the task file as an OpenAI batch job.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

# Record the job ID in the job-ID file so it survives a kernel restart.
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="0_1")

**Load the newest job ID from the file**
- For previous job IDs (with timestamps), check the file `batch_job_ids.json` in the temporary directory (`tmp_dir`).
- The idea behind this:
    - If the kernel is restarted, the job ID is still available and the job can be continued.

In [None]:
# Look up the most recent job ID recorded for step "0_1".
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="0_1")
except (FileNotFoundError, ValueError) as e:
    # Show the helper's message, then re-raise: continuing would leave
    # `newest_job_id` undefined or still pointing at a job from another step.
    print(e)
    raise

**Check the status of the job ID**

In [None]:
# Query the status of the newest step-0_1 batch job; the helper's return
# value is displayed as the cell output.
scripts.retrieve_batch_job_status(client, newest_job_id)

**If the status is completed, load the results and proceed**

In [None]:
# Fetch the current state of the batch job and make sure results exist.
batch_job = client.batches.retrieve(newest_job_id)
if batch_job.status != "completed" or batch_job.output_file_id is None:
    # Fail early with a clear message instead of passing a missing file ID
    # to the Files API.
    raise RuntimeError(
        f"Batch job {newest_job_id} has no output yet (status: {batch_job.status}). "
        "Wait until the job is completed, then re-run this cell."
    )

# Download the raw output file and parse it with the project helper.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_val_enhanced = scripts.parse_and_clean_batch_responses(result)

# Left join: keep every validation row and attach the enhancement data.
df_enhanced_validated = df_responses_val_enhanced.merge(
    df_enhanced_merged,
    on=['design_id'],
    how='left')

df_enhanced_validated.info()

**Save validated enhanced objects to file**

In [None]:
# Columns written to the enhanced-objects JSON file.
columns = [
    'design_id',
    'design_en',
    'new_list_of_strings',
    'relevance',
    'correctness',
    'comment_enh',
    'list_of_strings',
]
scripts.update_json_with_merged_df(
    df_enhanced_validated,
    columns,
    json_dir,
    enhanced_json_filename,
)

In [None]:
# Count the rows for each distinct value of `correctness`.
df_enhanced_validated['correctness'].value_counts()


### Step 1: Identify Subject-Object Pairs
- **Input:** Design description and categorized entities.
- **Output:** List of subject-object pairs.

In [None]:
# Reload the validated enhanced objects from disk.
enhanced_json_path = Path(json_dir) / enhanced_json_filename
df_enhanced = pd.read_json(enhanced_json_path)
df_enhanced.info()
# df_enhanced['design_id'].nunique()


In [None]:
# Filter the enhanced designs against the subject-object-pairs JSON file
# (presumably to skip designs that were already processed — confirm in scripts).
df_enhanced_filtered = scripts.filter_enhanced_designs(
    df_enhanced,
    json_dir,
    sop_json_filename,
)
df_enhanced_filtered.info()

**Create prompts and batches for subject-object pair identification**
- Reduce the batch size for this step (TODO: document the reason).

In [None]:
# Smaller prompt batch size for the subject-object pair step.
batch_size = 12
n_full_batches = len(df_enhanced_filtered) // batch_size
batch_stop = n_full_batches + 1

# Step 1 prompts, built from the filtered enhanced designs.
prompts_sop = prompts.find_subject_object_pairs_prompts(
    df_enhanced_filtered,
    batch_size,
)

# Token/price estimate for the selected batch range.
scripts.calculate_total_tokens_and_price(
    prompts_sop,
    batch_start,
    batch_stop,
    batch=True,
)

# Task file for the OpenAI batch job of step "1".
batch_file = scripts.create_tasks_batch(
    prompts_sop,
    client,
    tmp_dir,
    step="1",
)

**¡¡¡ creates the batch job and sends them to the OpenAI API !!!**
- and also saves the batch job ID to a file

In [None]:
# Submit the task file as an OpenAI batch job.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

# Record the job ID in the job-ID file so it survives a kernel restart.
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="1")

**Load the newest job ID from the file**


In [None]:
# Look up the most recent job ID recorded for step "1".
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="1")
except (FileNotFoundError, ValueError) as e:
    # Show the helper's message, then re-raise: continuing would leave
    # `newest_job_id` undefined or still pointing at a job from another step.
    print(e)
    raise

In [None]:
# Query the status of the newest step-1 batch job and display the result.
status_info = scripts.retrieve_batch_job_status(client, newest_job_id)
status_info

In [None]:
# Fetch the current state of the batch job and make sure results exist.
batch_job = client.batches.retrieve(newest_job_id)
if batch_job.status != "completed" or batch_job.output_file_id is None:
    # Fail early with a clear message instead of passing a missing file ID
    # to the Files API.
    raise RuntimeError(
        f"Batch job {newest_job_id} has no output yet (status: {batch_job.status}). "
        "Wait until the job is completed, then re-run this cell."
    )

# Download the raw output file and parse it with the project helper.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_sop = scripts.parse_and_clean_batch_responses(result)

# Left join: keep every response row and attach the enhanced design data.
df_sop_merged = df_responses_sop.merge(
    df_enhanced_filtered,
    on=['design_id'],
    how='left')

df_sop_merged.info()

In [None]:
# Snapshot the merged subject-object pairs in the temp directory before validation.
sop_snapshot_path = Path(tmp_dir) / "sop_before_val_temp_save_20240701_13h52m.json"
df_sop_merged.to_json(sop_snapshot_path, orient='records')


In [None]:
# Restore the merged subject-object pairs from the temp snapshot.
sop_snapshot_path = Path(tmp_dir) / "sop_before_val_temp_save_20240701_13h52m.json"
df_sop_merged = pd.read_json(sop_snapshot_path)
df_sop_merged.info()

### Step 1.1 Validate and Classify Subject-Object Pairs

In [None]:
# Show the batch parameters currently held in kernel state.
print(f"Batch start {batch_start}, Batch stop {batch_stop}, Batch size {batch_size}")

In [None]:
# Enlarge the prompt batch size for the validation step.
# NOTE: this cell is not idempotent — every re-run multiplies `batch_size`
# by `factor` again.
factor = 2
batch_size *= factor
# batch_start = 0 
# batch_stop = batch_stop // factor
# print(f"Batch start {batch_start}, Batch stop {batch_stop}, Batch size {batch_size}")

In [None]:
# Step 1.1 prompts, built from the merged subject-object pairs.
prompts_validate_sop = prompts.validate_subject_object_pairs(df_sop_merged, batch_size)

# Recompute the upper batch index for this frame and the current batch size;
# the previous value was derived from a different frame and batch size.
batch_stop = len(df_sop_merged)//batch_size + 1
scripts.calculate_total_tokens_and_price(prompts_validate_sop, batch_start, batch_stop, batch=True)

# Task file for the OpenAI batch job of step "1_1".
batch_file = scripts.create_tasks_batch(prompts_validate_sop, client, tmp_dir, step="1_1")

In [None]:
# Submit the task file as an OpenAI batch job.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

# Record the job ID in the job-ID file so it survives a kernel restart.
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="1_1")

In [None]:
# Look up the most recent job ID recorded for step "1_1".
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="1_1")
except (FileNotFoundError, ValueError) as e:
    # Show the helper's message, then re-raise: continuing would leave
    # `newest_job_id` undefined or still pointing at a job from another step.
    print(e)
    raise

In [None]:
# Retrieve the batch job object for the newest job ID and display it.
batch_job = client.batches.retrieve(newest_job_id)
batch_job


In [None]:
# Print the listing object returned by the Batches API.
print(client.batches.list())

# Print every batch whose status is "in_progress".
in_progress_batches = (
    listed for listed in client.batches.list() if listed.status == "in_progress"
)
for running_batch in in_progress_batches:
    print(running_batch)

# client.batches.cancel(newest_job_id)

In [None]:
# Query the status of the newest step-1_1 batch job; the helper's return
# value is displayed as the cell output.
scripts.retrieve_batch_job_status(client, newest_job_id)

In [None]:
# Retrieve the batch job object again and display its current state.
batch_job = client.batches.retrieve(newest_job_id)
batch_job

In [None]:
# Fetch the current state of the batch job and make sure results exist.
batch_job = client.batches.retrieve(newest_job_id)
# client.files.delete(batch_job.input_file_id)
if batch_job.status != "completed" or batch_job.output_file_id is None:
    # Fail early with a clear message instead of passing a missing file ID
    # to the Files API.
    raise RuntimeError(
        f"Batch job {newest_job_id} has no output yet (status: {batch_job.status}). "
        "Wait until the job is completed, then re-run this cell."
    )

# Download the raw output file and parse it with the project helper.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_val_sop = scripts.parse_and_clean_batch_responses(result)

# Left join on design and pair ID: keep every validation row and attach the
# subject-object pair data.
df_sop_validated = df_responses_val_sop.merge(
    df_sop_merged,
    on=['design_id', 's_o_id'],
    how='left')

df_sop_validated.info()

In [None]:
# Columns written to the subject-object-pairs JSON file.
columns = [
    'design_id',
    's_o_id',
    's',
    'subject_class',
    'o',
    'object_class',
    'validity_sop',
    'comment_sop',
    'design_en',
    'new_list_of_strings',
    'relevance',
    'correctness',
    'comment_enh',
    'list_of_strings',
]
scripts.update_json_with_merged_df(
    df_sop_validated,
    columns,
    json_dir,
    sop_json_filename,
)

### Step 2: Combine Subject-Predicate-Object
- **Input:** Design description, subject-object pairs, and possible predicates.
- **Output:** List of subject-predicate-object triples.

In [None]:
# Reload the validated subject-object pairs from disk.
sop_json_path = Path(json_dir) / sop_json_filename
df_sop = pd.read_json(sop_json_path)
df_sop.info()

In [None]:
# Count the rows in df_sop for each distinct value of `validity_sop`.
df_sop['validity_sop'].value_counts()

In [None]:
# Filter the subject-object pairs against the triples JSON file
# (presumably to skip pairs that were already processed — confirm in scripts).
df_sop_filtered = scripts.filter_sop_dataframe(
    df_sop,
    json_dir,
    pred_json_filename,
)
df_sop_filtered.info()

In [None]:
# Drop rows whose object column 'o' holds the literal string 'NULL'.
# NOTE(review): only 'o' is checked here; confirm whether rows with
# 's' == 'NULL' should be dropped as well.
has_object = df_sop_filtered['o'] != 'NULL'
df_sop_filtered = df_sop_filtered[has_object]

validity_counts = df_sop_filtered['validity_sop'].value_counts()
print(validity_counts)
df_sop_filtered.info()

In [None]:
# Prompt batch size for the predicate step.
batch_size = 32
n_full_batches = len(df_sop_filtered) // batch_size
batch_stop = n_full_batches + 1

# Step 2 prompts, built from the filtered subject-object pairs.
prompts_pred = prompts.find_predicates_prompts(
    df_sop_filtered,
    batch_size,
)

# Token/price estimate for the selected batch range.
scripts.calculate_total_tokens_and_price(
    prompts_pred,
    batch_start,
    batch_stop,
    batch=True,
)

# Task file for the OpenAI batch job of step "2".
batch_file = scripts.create_tasks_batch(
    prompts_pred,
    client,
    tmp_dir,
    step="2",
)

In [None]:
# Submit the task file as an OpenAI batch job.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

# Record the job ID in the job-ID file so it survives a kernel restart.
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="2")

In [None]:
# Look up the most recent job ID recorded for step "2".
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="2")
except (FileNotFoundError, ValueError) as e:
    # Show the helper's message, then re-raise: continuing would leave
    # `newest_job_id` undefined or still pointing at a job from another step.
    print(e)
    raise

In [None]:
# Query the status of the newest step-2 batch job; the helper's return
# value is displayed as the cell output.
scripts.retrieve_batch_job_status(client, newest_job_id)

In [None]:
# Print a summary of the frame that the step-2 results are merged with.
df_sop_filtered.info()

In [None]:
# Fetch the current state of the batch job and make sure results exist.
batch_job = client.batches.retrieve(newest_job_id)
if batch_job.status != "completed" or batch_job.output_file_id is None:
    # Fail early with a clear message instead of passing a missing file ID
    # to the Files API.
    raise RuntimeError(
        f"Batch job {newest_job_id} has no output yet (status: {batch_job.status}). "
        "Wait until the job is completed, then re-run this cell."
    )

# Download the raw output file and parse it with the project helper.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_pred = scripts.parse_and_clean_batch_responses(result)

# Left join on design and pair ID: keep every response row and attach the
# subject-object pair data.
df_pred_merged = df_responses_pred.merge(
    df_sop_filtered,
    on=['design_id', 's_o_id'],
    how='left')

df_pred_merged.info()

## Step 2.1: Validate and Classify Extracted Relations
- **Input:** List of subject-predicate-object triples.
- **Output:** Validated and classified relations, marked as "added predicates" or "used predicates in design".

#### Notes
- Avoid/filter predicates that are in the text and form a valid relation, but are not in the design description.
- Example 28/27
    - Antoninus Pius	wearing	Wreath

In [None]:
# Print a summary of the merged predicate frame before validation.
df_pred_merged.info()


In [None]:
# Show the prompt batch size currently held in kernel state.
print(f"Batch size: {batch_size}")

In [None]:
# Step 2.1 prompts, built from the merged subject-predicate-object triples.
prompts_validate_pred = prompts.validate_spo_triples(df_pred_merged, batch_size)

# Recompute the upper batch index from the frame that is actually used here;
# otherwise the estimate relies on a value computed for a different frame.
batch_stop = len(df_pred_merged)//batch_size + 1
scripts.calculate_total_tokens_and_price(prompts_validate_pred, batch_start, batch_stop, batch=True)

# Task file for the OpenAI batch job of step "2_1".
batch_file = scripts.create_tasks_batch(prompts_validate_pred, client, tmp_dir, step="2_1")

In [None]:
# Submit the task file as an OpenAI batch job.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

# Record the job ID in the job-ID file so it survives a kernel restart.
scripts.add_job_to_file(job_ids_file_path, batch_job.id, step="2_1")

In [None]:
# Look up the most recent job ID recorded for step "2_1".
try:
    newest_job_id = scripts.load_newest_job_id(job_ids_file_path, step="2_1")
except (FileNotFoundError, ValueError) as e:
    # Show the helper's message, then re-raise: continuing would leave
    # `newest_job_id` undefined or still pointing at a job from another step.
    print(e)
    raise

In [None]:
# Query the status of the newest step-2_1 batch job; the helper's return
# value is displayed as the cell output.
scripts.retrieve_batch_job_status(client, newest_job_id)

In [None]:
# Fetch the current state of the batch job and make sure results exist.
batch_job = client.batches.retrieve(newest_job_id)
if batch_job.status != "completed" or batch_job.output_file_id is None:
    # Fail early with a clear message instead of passing a missing file ID
    # to the Files API.
    raise RuntimeError(
        f"Batch job {newest_job_id} has no output yet (status: {batch_job.status}). "
        "Wait until the job is completed, then re-run this cell."
    )

# Download the raw output file and parse it with the project helper.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content
df_responses_val_pred = scripts.parse_and_clean_batch_responses(result)

# Left join on design and pair ID: keep every validation row and attach the
# triple data.
df_pred_validated = df_responses_val_pred.merge(
    df_pred_merged,
    on=['design_id', 's_o_id'],
    how='left')

df_pred_validated.info()

In [None]:
# Columns written to the subject-predicate-object triples JSON file.
columns = [
    'design_id',
    's_o_id',
    's',
    'subject_class',
    'predicate',
    'o',
    'object_class',
    'validity_pred',
    'comment_pred',
    'implicit_pred',
    'validity_sop',
    'comment_sop',
    'design_en',
    'new_list_of_strings',
    'relevance',
    'correctness',
    'comment_enh',
    'list_of_strings',
]

scripts.update_json_with_merged_df(
    df_pred_validated,
    columns,
    json_dir,
    pred_json_filename,
)

In [None]:
# Count the rows for each distinct value of `validity_pred`.
df_pred_validated['validity_pred'].value_counts()

In [None]:
# Independent copy of the validated triples, keeping only rows whose
# predicate is not the literal string 'NULL'.
has_predicate = df_pred_validated['predicate'] != 'NULL'
df_pred_val = df_pred_validated.loc[has_predicate].copy()
df_pred_val