In [1]:
import os
import sys
import warnings
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

# Load environment variables from the nearest .env file (the return value is not needed)
_ = load_dotenv(find_dotenv())

# Make the submodules under ./libs importable.
# Guarded so that re-running this cell does not keep appending the same entry to sys.path.
repo_dir = Path().resolve()
libs_dir = str(repo_dir / 'libs')
if libs_dir not in sys.path:
    sys.path.append(libs_dir)

# The custom imports below need this symlink (created by setup_symlink.py).
symlink_path = repo_dir / 'libs' / 'NLP_on_multilingual_coin_datasets'
if not symlink_path.exists():
    # Raise a descriptive exception instead of calling sys.exit(): inside a notebook kernel
    # sys.exit() only surfaces as a bare SystemExit, which hides the actual problem.
    raise FileNotFoundError(
        f"Error: Symlink {symlink_path} does not exist. Run setup_symlink.py first."
    )

# Import the custom modules only after the symlink check has passed
from NLP_on_multilingual_coin_datasets.cnt.io import Database_Connection
from modules.loading_preprocessed_designs import PreprocessingConfig, LoadingPreprocessedDesigns
from modules import scripts, prompts

# Pandas display options: show every column/row and never truncate cell contents.
# NOTE(review): max_rows=None renders whole frames — always display via .head()/.info().
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

# Suppress all warnings.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings('ignore')

# OpenAI API key from the environment (None when OPENAI_API_KEY is not set)
api_key = os.getenv('OPENAI_API_KEY')

# NOTE(review): this binds the PreprocessingConfig class itself, not an instance —
# confirm that the config is meant to be used through class attributes.
prep_cfg = PreprocessingConfig



  from .autonotebook import tqdm as notebook_tqdm


Define preprocessing config variables; otherwise default values will be used.

Define filenames and paths

In [2]:
# Directory for the JSON files of this notebook, taken from the preprocessing config.
json_dir = prep_cfg.json_path

# One JSON file per stage: enhanced entities, subject-object pairs, subject-predicate-object triples.
enhanced_json_filename, sop_json_filename, pred_json_filename = (
    "enhanced_objects.json",
    "subject_object_pairs.json",
    "subject_predicate_object_triples.json",
)

Define database connection parameters or set them as environment variables.

In [3]:
# Connection settings: read from the environment, or assign the values directly here.
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
database = prep_cfg.database

# Fail fast with a clear message instead of silently building a URL that contains the text "None".
missing_db_settings = [
    setting_name
    for setting_name, setting_value in (
        ('DB_USER', db_user),
        ('DB_PASSWORD', db_password),
        ('DB_HOST', db_host),
        ('DB_PORT', db_port),
    )
    if setting_value is None
]
if missing_db_settings:
    raise EnvironmentError(
        "Missing database settings (set the environment variables or define them in this cell): "
        + ", ".join(missing_db_settings)
    )

# URL-escape user and password so characters such as '@', ':' or '/' cannot corrupt the URL.
# NOTE(review): assumes Database_Connection hands this string to SQLAlchemy, which decodes it — confirm.
connection_string = (
    f"mysql+mysqlconnector://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database}"
)
dc = Database_Connection(connection_string)

# Load or preprocess data
- Check for the preprocessed designs.csv file in the `data` directory. 
    - If it does not exist, get the data from the database and preprocess it.
    - Else load the data from the file.
- Create Dataframe

In [4]:
# Build the loader from the database connection and the preprocessing config.
lpd = LoadingPreprocessedDesigns(dc, prep_cfg)
# Load the designs; per the method name and the log output this reads the CSV if it exists
# and otherwise queries the database and runs the preprocessing.
df_designs = lpd.load_designs_csv_or_process_database()

2024-09-02 20:44:08,767 - INFO - Checking if file data/source/lists/csv/annotated_designs.csv exists.
2024-09-02 20:44:08,769 - INFO - File does not exist. Loading from database and running preprocessing.
2024-09-02 20:44:08,770 - INFO - Starting preprocessing of designs.
2024-09-02 20:44:09,046 - ERROR - Error loading entities: 'NoneType' object is not subscriptable
2024-09-02 20:44:09,047 - ERROR - Error during preprocessing: 'NoneType' object is not subscriptable


TypeError: 'NoneType' object is not subscriptable

# Prep Dataframe 

- create copy
- filter the columns id, design_en and annotations
    - representing only the preprocessed columns

In [None]:
# Keep an untouched copy of the full frame before narrowing the columns.
df_designs_0 = df_designs.copy()
# Select only the preprocessed columns. The explicit .copy() makes the result an independent
# frame, so assigning new columns to it later does not raise pandas' SettingWithCopyWarning.
df_designs = df_designs[["id", "design_en", "annotations"]].copy()
df_designs.info()

**create strings from annotations**

In [None]:
# Apply scripts.generate_list_of_strings to every row (axis=1) and store the result as a new column.
# NOTE(review): presumably this derives the entity strings from the `annotations` column — confirm in scripts.
df_designs["list_of_strings"] = df_designs.apply(scripts.generate_list_of_strings, axis=1)
# Preview the first five rows
df_designs.head(5)

**load RE examples from prepared JSON file**

### Define a subset of the data to be used for the testing of the implementation.
- 22332 rows

In [None]:
# Positional row window (end-exclusive) of df_designs that is used for this run.
start, stop = 0, 3275

# Independent copy of the window, so later changes do not touch df_designs.
df_designs_source = df_designs.iloc[start:stop].copy()
df_designs_source.info()

# Step 0: Check for More Possible Subjects or Objects
- **Input:** Design description and list of strings (entities).
- **Output:** Identified subjects and objects categorized as PERSON, OBJECT, ANIMAL, PLANT.


In [None]:
# Filter the source designs using the enhanced-objects JSON file in json_dir.
# NOTE(review): presumably this drops designs that were already processed and stored in that file — confirm in scripts.
df_designs_filtered = scripts.filter_source_dataframe(df_designs_source, json_dir, enhanced_json_filename)
df_designs_filtered.info()

In [None]:
# Batch settings for the entity-enhancement stage.
# NOTE(review): batch_start/batch_stop are presumably a range of batch indices — confirm in scripts.process_prompts.
batch_size, batch_start, batch_stop = 32, 0, 1

# OpenAI client shared by all stages of this notebook.
client = OpenAI(api_key=api_key)

In [None]:
# Build the entity-enhancement prompts, then estimate tokens and price for the selected batches.
prompts_enhance = prompts.enhance_objects_in_designs(df_designs_filtered, batch_size)
scripts.calculate_total_tokens_and_price(prompts_enhance, batch_start, batch_stop)

In [None]:
# Run the enhancement prompts through scripts.process_prompts with the OpenAI client for the configured batch range.
# NOTE(review): presumably this performs billable API calls — check the cost estimate before re-running.
responses_list_enhanced = scripts.process_prompts(prompts_enhance, client, batch_start, batch_stop)

In [None]:
# Responses as a frame; design_id is cast to int so it matches the `id` key used in the join below.
df_responses_enhanced = pd.DataFrame(responses_list_enhanced).astype({"design_id": int})

# Attach the design text and the original entity list to every response (left join keeps all responses).
df_enhanced_merged = (
    df_responses_enhanced
    .merge(
        df_designs_filtered[["id", "design_en", "list_of_strings"]],
        how="left",
        left_on="design_id",
        right_on="id",
    )
    .drop(columns="id")
)

df_enhanced_merged.info()

### Step 0.1 Validate and Classify enhanced entities

In [None]:
# Build the validation prompts for the enhanced entities, then estimate tokens and price.
prompts_validate_enhanced = prompts.validate_overall_objects_in_designs(df_enhanced_merged, batch_size)
scripts.calculate_total_tokens_and_price(prompts_validate_enhanced, batch_start, batch_stop)

In [None]:
# Run the entity-validation prompts through scripts.process_prompts for the configured batch range.
# NOTE(review): presumably this performs billable API calls — check the cost estimate before re-running.
responses_list_validated_enh = scripts.process_prompts(prompts_validate_enhanced, client, batch_start, batch_stop)

In [None]:
# Validation responses as a frame, left-joined with the enhanced designs on design_id.
df_responses_val_enhanced = pd.DataFrame(responses_list_validated_enh)
df_enhanced_validated = pd.merge(
    df_responses_val_enhanced,
    df_enhanced_merged,
    how="left",
    on=["design_id"],
)

df_enhanced_validated.head(2)

In [None]:

# Columns of the validated enhancement frame that are passed on to the JSON update.
columns = [
    "design_id",
    "design_en",
    "new_list_of_strings",
    "completeness",
    "relevance",
    "correctness",
    "comment_enh",
    "list_of_strings",
]
scripts.update_json_with_merged_df(df_enhanced_validated, columns, json_dir, enhanced_json_filename)

### Step 1: Identify Subject-Object Pairs
- **Input:** Design description and categorized entities.
- **Output:** List of subject-object pairs.

In [None]:
# Reload the stored enhancement results from the JSON file.
df_enhanced = pd.read_json(Path(json_dir, enhanced_json_filename))
df_enhanced.info()


In [None]:
# show all unique values for completeness, relevance, correctness
print(df_enhanced['completeness'].value_counts(),
df_enhanced['relevance'].value_counts(),
df_enhanced['correctness'].value_counts())

In [None]:
# Filter the enhanced designs using the subject-object-pairs JSON file in json_dir.
# NOTE(review): the exact filter criteria live in scripts.filter_enhanced_designs — confirm there.
df_enhanced_filtered = scripts.filter_enhanced_designs(df_enhanced, json_dir, sop_json_filename)
df_enhanced_filtered.info()

In [None]:
# Batch settings for the subject-object-pair stage.
batch_size = 16
batch_start = 0
batch_stop = 2

# Report the settings after assigning them, so the output shows the values the following
# cells actually use (printing first showed the previous stage's values instead).
print(f"batch_start: {batch_start}, batch_stop: {batch_stop}")
print(f"batch_size: {batch_size}")

In [None]:
# TODO: where is id 67? (open question, still unresolved)
# Build the subject-object-pair prompts, then estimate tokens and price for the selected batches.
prompts_sop = prompts.find_subject_object_pairs_prompts(df_enhanced_filtered, batch_size=batch_size)
scripts.calculate_total_tokens_and_price(prompts_sop, batch_start, batch_stop)

In [None]:
# Run the subject-object-pair prompts through scripts.process_prompts for the configured batch range.
# NOTE(review): presumably this performs billable API calls — check the cost estimate before re-running.
responses_list_sop = scripts.process_prompts(prompts_sop, client, batch_start, batch_stop)


In [None]:
# Responses as a frame; design_id is cast to int before it is used as the join key.
df_responses_sop = pd.DataFrame(responses_list_sop).astype({"design_id": int})

# Left join: every extracted pair gets the columns of its enhanced design.
df_sop_merged = pd.merge(
    df_responses_sop,
    df_enhanced_filtered,
    how="left",
    on=["design_id"],
)
df_sop_merged.head()

In [None]:
# Column overview (dtypes, non-null counts) of the merged subject-object pairs
df_sop_merged.info()

### Step 1.1 Validate and Classify Subject-Object Pairs

In [None]:
batch_size = 32
length_sop_merged = len(df_sop_merged)
print(length_sop_merged)

# Ceiling division: number of batches of `batch_size` rows needed to cover every pair.
num_batches = -(-length_sop_merged // batch_size)
print(num_batches)

# The upper batch bound for this stage is the total batch count (batch_start keeps its earlier value).
batch_stop = num_batches

# Build the pair-validation prompts, then estimate tokens and price.
prompts_validate_sop = prompts.validate_subject_object_pairs(df_sop_merged, batch_size)
scripts.calculate_total_tokens_and_price(prompts_validate_sop, batch_start, batch_stop)

In [None]:
# Run the pair-validation prompts through scripts.process_prompts for the configured batch range.
# NOTE(review): presumably this performs billable API calls — check the cost estimate before re-running.
responses_list_validated_sop = scripts.process_prompts(prompts_validate_sop, client, batch_start, batch_stop)

In [None]:
# Validation responses as a frame, left-joined with the pairs on (design_id, s_o_id).
df_responses_val_sop = pd.DataFrame(responses_list_validated_sop)
df_sop_validated = pd.merge(
    df_responses_val_sop,
    df_sop_merged,
    how="left",
    on=["design_id", "s_o_id"],
)

df_sop_validated.head(2)

In [None]:
# Columns of the validated pair frame that are passed on to the JSON update.
columns = [
    # pair identity and classification
    "design_id",
    "s_o_id",
    "s",
    "subject_class",
    "o",
    "object_class",
    # pair validation
    "validity_sop",
    "comment_sop",
    # columns carried over from the enhancement stage
    "design_en",
    "new_list_of_strings",
    "completeness",
    "relevance",
    "correctness",
    "comment_enh",
    "list_of_strings",
]
scripts.update_json_with_merged_df(df_sop_validated, columns, json_dir, sop_json_filename)

### Step 2: Combine Subject-Predicate-Object
- **Input:** Design description, subject-object pairs, and possible predicates.
- **Output:** List of subject-predicate-object triples.

In [None]:
# Reload the stored subject-object pairs from the JSON file.
df_sop = pd.read_json(Path(json_dir, sop_json_filename))
df_sop.info()

In [None]:
# Count the rows of df_sop for each distinct value of validity_sop
df_sop['validity_sop'].value_counts()

In [None]:
# Filter the subject-object pairs using the triples JSON file in json_dir.
# NOTE(review): the exact filter criteria live in scripts.filter_sop_dataframe — confirm there.
df_sop_filtered = scripts.filter_sop_dataframe(df_sop, json_dir, pred_json_filename)
df_sop_filtered.info()

In [None]:
# Batch settings for the predicate stage.
batch_size, batch_start, batch_stop = 32, 0, 2

In [None]:
# Build the predicate prompts, then estimate tokens and price for the selected batches.
prompts_pred = prompts.find_predicates_prompts(df_sop_filtered, batch_size)
scripts.calculate_total_tokens_and_price(prompts_pred, batch_start, batch_stop)

In [None]:
# Run the predicate prompts through scripts.process_prompts for the configured batch range.
# NOTE(review): presumably this performs billable API calls — check the cost estimate before re-running.
responses_list_pred = scripts.process_prompts(prompts_pred, client, batch_start, batch_stop)

In [None]:
# Predicate responses as a frame, attached to the filtered pairs.
# The left join keeps every pair, including those without a response.
df_responses_pred = pd.DataFrame(responses_list_pred)
df_pred_merged = pd.merge(
    df_sop_filtered,
    df_responses_pred,
    how="left",
    on=["design_id", "s_o_id"],
)
df_pred_merged.head()



## Step 2.1: Validate and Classify Extracted Relations
- **Input:** List of subject-predicate-object triples.
- **Output:** Validated and classified relations, marked as "added predicates" or "used predicates in design".

#### Notes
- Avoid/Filter predicates which are in the text and form a valid relation, but are not in the design description.
- Example 28/27
    - Antoninus Pius	wearing	Wreath

In [None]:
# Build the triple-validation prompts, then estimate tokens and price for the selected batches.
prompts_validate_pred = prompts.validate_spo_triples(df_pred_merged, batch_size)
scripts.calculate_total_tokens_and_price(prompts_validate_pred, batch_start, batch_stop)

In [None]:
# Run the triple-validation prompts through scripts.process_prompts for the configured batch range.
# NOTE(review): presumably this performs billable API calls — check the cost estimate before re-running.
responses_list_validated_pred = scripts.process_prompts(prompts_validate_pred, client, batch_start, batch_stop)

In [None]:
# Validation responses as a frame; print a plain-text preview of the first five rows.
df_responses_val_pred = pd.DataFrame(responses_list_validated_pred)
print(df_responses_val_pred.head())

# Left join with the triples on (design_id, s_o_id).
df_pred_validated = pd.merge(
    df_responses_val_pred,
    df_pred_merged,
    how="left",
    on=["design_id", "s_o_id"],
)

df_pred_validated.head(2)

In [None]:
# Columns of the validated triple frame that are passed on to the JSON update.
columns = [
    # triple identity and classification
    "design_id",
    "s_o_id",
    "s",
    "subject_class",
    "predicate",
    "o",
    "object_class",
    # predicate validation
    "validity_pred",
    "comment_pred",
    "implicit_pred",
    # pair validation
    "validity_sop",
    "comment_sop",
    # columns carried over from the enhancement stage
    "design_en",
    "new_list_of_strings",
    "completeness",
    "relevance",
    "correctness",
    "comment_enh",
    "list_of_strings",
]

scripts.update_json_with_merged_df(df_pred_validated, columns, json_dir, pred_json_filename)

In [None]:
# Reload the stored triples from the JSON file (rebinds df_pred_validated to the file contents).
df_pred_validated = pd.read_json(Path(json_dir, pred_json_filename))
df_pred_validated.head()