In [None]:
import pandas as pd
import json
import urllib
import os
import random
from urllib.parse import quote
from IPython.display import Image
from datetime import datetime


In [None]:
# Load every painting row and keep only the first occurrence of each 'item'
paintings = (
    pd.read_csv('data/wikidata_all_paintings.csv')
    .drop_duplicates(subset='item', keep='first')
)

# Keyword table read from data/food_related_words.csv
food_keywords = pd.read_csv('data/food_related_words.csv')

# Preview: first painting rows, followed by the keyword table
display(paintings.head(), food_keywords)

In [None]:


# --- One-Hot Encoding Based on Closest Food Terms ---
#
# For every painting, flag (0/1) each closest_food_term that is reachable from
# at least one of the comma-separated words in its 'depicts' column.

# Keyword rows without a closest_food_term cannot produce a usable column
# (they would otherwise show up as a NaN-named column), so they are ignored.
keyword_rows = food_keywords.dropna(subset=['closest_food_term'])

# word -> closest_food_term lookup. The words coming from 'depicts' are
# lowercased and stripped below, so the lookup keys must be normalised the
# same way; otherwise a keyword such as 'Apple' or ' apple' never matches.
word_to_food_term = dict(
    zip(
        keyword_rows['word'].str.lower().str.strip(),
        keyword_rows['closest_food_term'].str.lower(),
    )
)

# Every distinct food term gets its own one-hot column
unique_food_terms = keyword_rows['closest_food_term'].str.lower().unique()

# Paintings without a 'depicts' value are treated as depicting nothing
paintings['depicts'] = paintings['depicts'].fillna('')

# One entry per (painting, depicted word); the painting's index label is
# repeated for each of its words, which is what the groupby below relies on.
depicted_words = (
    paintings['depicts']
    .str.lower()
    .str.split(',')
    .explode()
    .str.strip()
)

# Translate words to food terms and keep only the words that are food keywords
matched_terms = depicted_words.map(word_to_food_term).dropna()

# One-hot encode the matched terms, make sure every known term has a column,
# then collapse back to one row per painting (1 if any word hit the term).
# Integer dummies keep the joined columns numeric instead of object/bool.
food_dummies_grouped = (
    pd.get_dummies(matched_terms, dtype=int)
    .reindex(columns=unique_food_terms, fill_value=0)
    .groupby(level=0)
    .max()
)

# Attach the flags; paintings with no matched word get NaN from the join,
# which is turned into 0 here. After this every term column exists.
paintings = paintings.join(food_dummies_grouped)
paintings[unique_food_terms] = paintings[unique_food_terms].fillna(0).astype(int)

# Display the updated DataFrame
print("Updated DataFrame A with One-Hot Encoded Closest Food Terms:")
display(paintings.head())

# If you want to save the updated DataFrame to a CSV file, uncomment the following line:
# paintings.to_csv('updated_dataframe_A_with_food_terms.csv', index=False)

In [None]:
# The one-hot food-term columns are the ones added up for each painting
columns_to_sum = unique_food_terms

# food_count = row-wise total of the food-term flags
paintings['food_count'] = paintings[columns_to_sum].sum(axis=1)

# Order by food_count (highest first), then discard rows without an image URL
paintings = (
    paintings
    .sort_values('food_count', ascending=False)
    .dropna(subset=['image_url'])
)


In [None]:
def get_image_path(url, path='img/img_512/'):
    """Return the local file path expected for the image behind ``url``.

    The file name is the last segment of the URL's path component (query
    string and fragment are ignored), percent-decoded exactly once, so
    ``%20`` becomes a space and ``%2520`` becomes ``%20``.

    Parameters
    ----------
    url : str
        URL of the image.
    path : str, optional
        Directory the images are stored in. A trailing slash is optional.

    Returns
    -------
    str
        ``path`` joined with the decoded file name.
    """
    url_path = urllib.parse.urlparse(url).path
    # Single decode only; spaces are kept as-is (no underscore substitution)
    decoded_path = urllib.parse.unquote(url_path)
    filename = os.path.basename(decoded_path)
    # os.path.join instead of '+' so a directory passed without a trailing
    # slash does not get glued onto the file name
    return os.path.join(path, filename)

# Derive the expected local file path of every painting from its image URL
paintings['image_path'] = paintings['image_url'].map(get_image_path)

In [None]:
# Rows that contain food but whose image file is not present on disk.
# The two conditions are computed separately: in a single expression `&`
# binds tighter than `>`, so `food_count > 0 & ~exists` is evaluated as
# `food_count > (0 & ~exists)` and the existence check is silently ignored.
has_food = paintings['food_count'] > 0
image_on_disk = paintings['image_path'].apply(os.path.exists).astype(bool)
missing_image_paths = paintings[has_food & ~image_on_disk]
display(missing_image_paths)


In [None]:

# Write the rows with missing images to CSV, leaving out the index column
missing_image_paths.to_csv(path_or_buf='data/missing_image_paths.csv', index=False)

In [None]:
# Identifier and image path side by side with the one-hot food-term columns
id_columns = paintings[['item', 'image_path']]
term_columns = paintings[unique_food_terms]
paintings_with_food = id_columns.join(term_columns)

# Discard images whose path ends in '.tif'
is_tif = paintings_with_food['image_path'].str.endswith('.tif')
paintings_with_food = paintings_with_food[~is_tif]

# Keep only the rows whose image file is actually present on disk
exists_on_disk = paintings_with_food['image_path'].apply(lambda image_path: os.path.exists(image_path))
paintings_with_food = paintings_with_food[exists_on_disk]

# Show the filtered table, then persist it
display(paintings_with_food)

paintings_with_food.to_csv(path_or_buf='data/paintings_with_food_nlp.csv', index=False)

In [None]:
# Two-column lookup (image path, item) written to its own CSV
image_path_columns = ['image_path', 'item']
paintings_image_path = paintings_with_food[image_path_columns]
paintings_image_path.to_csv(path_or_buf='data/paintings_image_path.csv', index=False)

In [None]:
# Keep the paintings whose food-term flags add up to more than zero
food_term_totals = paintings_with_food[unique_food_terms].sum(axis=1)
paintings_subset = paintings_with_food[food_term_totals > 0]
paintings_subset

In [None]:

# Drop the 'item' column and renumber the rows from 0 before exporting.
# (Optional down-sampling, currently disabled: .sample(1000, random_state=1))
paintings_subset = (
    paintings_subset
    .drop(columns='item')
    .reset_index(drop=True)
)
paintings_subset.to_csv(path_or_buf='data/paintings_subset.csv', index=False)

In [None]:


# Label columns are the one-hot food-term columns; create_prediction below
# reads this module-level name to decide which columns to inspect.
label_columns = unique_food_terms

def create_prediction(row, labels=None):
    """Build the prediction entry for one row of one-hot label flags.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or dict)
        Must be indexable by every label name; a value equal to 1 means the
        label is selected.
    labels : iterable of str, optional
        Label names to inspect. Defaults to the module-level
        ``label_columns``.

    Returns
    -------
    dict
        ``{"result": [...]}`` holding a single "choices" result with the
        capitalised names of the selected labels, or ``{"result": []}`` when
        no label is selected.
    """
    if labels is None:
        # Backward-compatible default: the notebook-wide label list
        labels = label_columns

    selected_labels = [label.capitalize() for label in labels if row[label] == 1]
    if not selected_labels:
        return {"result": []}

    return {
        "result": [
            {
                "from_name": "labels",
                "to_name": "image",
                "type": "choices",
                "value": {
                    "choices": selected_labels
                }
            }
        ]
    }

# Assemble one Label Studio task per painting: an image reference plus the
# prediction derived from that painting's label flags
label_studio_data = []
for _, row in paintings_with_food.iterrows():
    # Image reference = local-files prefix followed by the image path
    image_ref = '/data/local-files/?d=' + row['image_path']
    data_entry = dict(
        data=dict(image=image_ref),
        predictions=[create_prediction(row)],
    )
    label_studio_data.append(data_entry)

# Timestamp (YYYYMMDD_HHMMSS) meant to be embedded in the export file name
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# The JSON export is disabled. It is kept as real comments rather than as a
# triple-quoted string: a bare string is an expression, and as the last
# statement of the cell it would be echoed as the cell's output.
# To write the predictions to disk, uncomment:
# with open(f'label_studio_predictions_{timestamp}.json', 'w') as f:
#     json.dump(label_studio_data, f, indent=4)

In [None]:


# Draw one image path at random from the subset
candidate_paths = paintings_subset['image_path'].tolist()
random_image_path = random.choice(candidate_paths)

# Render the picked image, then print its path
display(Image(filename=random_image_path))
print(random_image_path)

In [None]:
import shutil

# Directory names; source_dir is defined here but not referenced in this cell
source_dir = 'img/img_512/'
destination_dir = 'img/duplicate_images/'

# Create the destination directory when it does not exist yet
os.makedirs(destination_dir, exist_ok=True)

# Copy the previously picked image into the destination, keeping its file name
copy_target = os.path.join(destination_dir, os.path.basename(random_image_path))
shutil.copy(random_image_path, copy_target)