# Food in Art

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import yaml
import numpy as np
from urllib.parse import urlparse, parse_qs, quote

from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler



## Functions

In [None]:
def load_config(config_path: str) -> dict:
    """
    Load configuration from a YAML file.

    Args:
        config_path: Path to YAML configuration file

    Returns:
        Dictionary containing configuration settings; an empty dictionary
        when the file contains no YAML document.
    """
    # Pin the encoding so parsing does not depend on the platform's locale default.
    with open(config_path, 'r', encoding='utf-8') as file:
        # safe_load returns None for an empty file; normalise to the documented dict.
        return yaml.safe_load(file) or {}


In [None]:

def clean_datetime(series: pd.Series, value_type) -> pd.Series:
    """
    Parse a series as datetimes, optionally reducing it to the year.

    Args:
        series: Column data to parse; unparseable entries become missing
        value_type: When equal to 'year', only the numeric year is returned

    Returns:
        The parsed datetimes, or the numeric years when value_type is 'year'
    """
    parsed = pd.to_datetime(series, errors='coerce')
    if value_type != 'year':
        return parsed

    years = parsed.dt.year
    return pd.to_numeric(years, downcast='integer', errors='coerce')

def clean_categorical(series: pd.Series, categories = None) -> pd.Series:
    """
    Convert a series to categorical dtype with optional fixed categories.

    Args:
        series: Column data to convert
        categories: Optional list of allowed categories; values outside the
            list become missing. When empty or None the categories are
            inferred from the data.

    Returns:
        A categorical Series that keeps the index of the input
    """
    if categories:
        # astype keeps the result a Series (with its index) rather than a bare
        # pd.Categorical, so both branches honour the annotated return type.
        fixed_dtype = pd.CategoricalDtype(categories=categories, ordered=False)
        return series.astype(fixed_dtype)
    return series.astype('category')


def extract_wikidata_id(series: pd.Series) -> pd.Series:
    """
    Pull the first 'Q<digits>' token out of each string in the series.

    Entries without such a token (or missing entries) come back as missing.
    """
    qid_pattern = r'(Q\d+)'
    wikidata_ids = series.str.extract(qid_pattern, expand=False)
    return wikidata_ids



In [None]:

def process_column(series: pd.Series, dtype: str, value_type: str = None, categories = None) -> pd.Series:
    """
    Clean one column as described by its configuration entry.

    Args:
        series: Column data to process
        dtype: Target data type
        value_type: Optional hint; 'wikidata_url' takes precedence over dtype,
            and the value is also forwarded to the datetime cleaner
        categories: Optional list of categories for categorical data

    Returns:
        Processed column data
    """
    # The Wikidata hint wins regardless of the configured dtype.
    if value_type == 'wikidata_url':
        processed = extract_wikidata_id(series)
    elif dtype == 'datetime64[ns]':
        processed = clean_datetime(series, value_type)
    elif dtype == 'category':
        processed = clean_categorical(series, categories)
    else:
        processed = series.astype(dtype)
    return processed


In [None]:

def load_and_process_dataset(
    source_path: str,
    columns_config: dict
) -> pd.DataFrame:
    """
    Load and process a single dataset according to configuration.

    Args:
        source_path: Path to source CSV file
        columns_config: Mapping of target column name to its settings. Each
            entry must provide 'dtype' and may provide 'original_name'
            (the column's name in the CSV), 'value_type', 'categories'
            and 'is_index'.

    Returns:
        Processed DataFrame restricted to the configured columns. Rows are
        de-duplicated on every column flagged 'is_index' (the column itself
        stays a regular column).

    Raises:
        KeyError: If a configured column is absent from the CSV after renaming.
    """
    # Load data
    raw = pd.read_csv(source_path)

    # Rename columns from their CSV names to the configured names
    column_mappings = {
        config['original_name']: col_name
        for col_name, config in columns_config.items()
        if 'original_name' in config
    }
    raw = raw.rename(columns=column_mappings)

    # Fail early with the full list of missing columns
    missing = [name for name in columns_config if name not in raw.columns]
    if missing:
        raise KeyError(f"Columns missing from {source_path}: {missing}")

    # Select configured columns; copy so the assignments below write to an
    # independent frame rather than a slice of the raw data
    df = raw[list(columns_config)].copy()

    # Process each column
    for column, config in columns_config.items():
        df[column] = process_column(
            df[column],
            config['dtype'],
            config.get('value_type'),
            config.get('categories')
        )

    # Keep only the first row for each value of an index-flagged column
    for column, config in columns_config.items():
        if config.get('is_index', False):
            df = df.drop_duplicates(subset=column, keep='first')

    return df

def load_all_datasets(config: dict) -> dict:
    """
    Build every dataset described in the configuration.

    Args:
        config: Mapping of dataset name to a dict holding its 'source' path
            and its 'columns' configuration

    Returns:
        Dictionary mapping each dataset name to its processed DataFrame
    """
    datasets = {}
    for dataset_name, dataset_config in config.items():
        source = dataset_config['source']
        columns = dataset_config['columns']
        datasets[dataset_name] = load_and_process_dataset(source, columns)
    return datasets


In [None]:
def get_512px_thumbnail(url):
    """
    Transform a Wikimedia Commons Special:FilePath URL into its 512px thumbnail version.

    Args:
        url (str): URL in format: http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg

    Returns:
        str: The 512px thumbnail URL, or None if the URL is null
        (None, an empty string, NaN or any other non-string value)

    """
    # NaN is truthy, so a plain truthiness check would let it through and
    # fail on .split(); treat every non-string as a missing URL.
    if not isinstance(url, str) or not url:
        return None

    # The filename is the last path segment of the URL
    filename = url.split('/')[-1]

    # Construct the 512px thumbnail URL
    return f"https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/{filename}&width=512"

## Loading

In [None]:
# Load the dataset definitions (sources and column settings) from the YAML file
config = load_config('config.yaml')


In [None]:
# Load and clean every dataset declared in the configuration, keyed by dataset name
all_data = load_all_datasets(config)

#### Load correspondence table

In [None]:
# Identifier table from the 'ids' dataset
correspondance_data = all_data['ids']
# Remove fully identical rows in place (this also changes the frame held in all_data)
correspondance_data.drop_duplicates(inplace=True)
correspondance_data

#### Load paintings data

In [None]:
paintings_data = all_data['paintings']
# Swap every available image URL for its 512px thumbnail; missing values pass through untouched
paintings_data['image_url'] = paintings_data['image_url'].apply(
    lambda url: url if pd.isna(url) else get_512px_thumbnail(url)
)
paintings_data

#### Load locations data

In [None]:
# Locations table from the 'locations' dataset
locations_data = all_data['locations']
locations_data

#### Load authors data

In [None]:
# Authors table from the 'authors' dataset
authors_data = all_data['authors']
authors_data

#### Load ML food data

In [None]:
food_words = all_data['food_words']
# A painting counts as a word match when its integer columns sum to more than zero
integer_columns = food_words.select_dtypes(include='int')
food_words['food_word_detected'] = integer_columns.sum(axis=1).gt(0)
food_words

In [None]:
food_found = all_data['food_found']
# Flag rows whose 'predictions' entry is longer than 3
# NOTE(review): presumably anything longer than an empty-list literal counts as a hit - confirm
prediction_lengths = food_found['predictions'].apply(len)
food_found['food_image_detected'] = prediction_lengths > 3
food_found

### Merging

In [None]:
# Start from the identifier table, keep only ids that have a painting record,
# then attach the optional author, location and food-detection tables.
merged_df = (
    correspondance_data
    .merge(paintings_data, on='painting_id', how='inner')
    .merge(authors_data, on='author_id', how='left')
    .merge(locations_data, on='location_id', how='left')
    .merge(food_words, on='painting_id', how='left')
    .merge(food_found, on='painting_id', how='left')
)
merged_df

### Cleaning

### Duplicates

In [None]:
# Keep a single row per painting: the first one encountered for each painting_id
merged_df = merged_df.drop_duplicates(subset='painting_id', keep='first')

### Combining food detections

In [None]:
# The left merges leave NaN where a painting has no row in a detection table.
# Compare against True so a missing flag explicitly counts as "not detected",
# instead of relying on how `|` treats NaN in object-dtype columns.
word_hit = merged_df['food_word_detected'].eq(True)
image_hit = merged_df['food_image_detected'].eq(True)

# 1 when either detector fired, 0 otherwise
merged_df['food_detected'] = (word_hit | image_hit).astype(int)
merged_df

### Pruning

In [None]:
# Keep only paintings that have a local image file. Copy the result so later
# column assignments write to an independent frame instead of a slice.
merged_df = merged_df[merged_df['image_path'].notna()].copy()

In [None]:
# Report how often each detection flag takes each value
for flag_column in ('food_detected', 'food_word_detected', 'food_image_detected'):
    print(merged_df[flag_column].value_counts())

### Enhancing

In [None]:

# Inspect the date columns and their summary statistics before the years are extracted
display(merged_df[['creation_date','date_of_birth']])
display(merged_df[['creation_date','date_of_birth']].describe())


from datetime import datetime

def extract_year(input_str):
    """
    Extract the leading year from a date string.

    Args:
        input_str: Date text starting with the year, e.g. '1889-06-01T00:00:00Z'.
            A leading '+' or '-' sign is accepted, e.g. '-0500-01-01T00:00:00Z'.

    Returns:
        The year as an int, or NaN when the input is not a string, is shorter
        than 4 characters, does not start with a number, or lies in the future.
    """
    if not isinstance(input_str, str) or len(input_str) < 4:
        return np.nan

    # A sign is not one of the four year digits: without widening the slice,
    # '-0500-...' would be cut to '-050' and parsed as -50.
    sign_width = 1 if input_str[0] in '+-' else 0
    try:
        year = int(input_str[:sign_width + 4])
    except ValueError:
        return np.nan

    # Years after the current one are treated as data errors
    if year > datetime.now().year:
        return np.nan
    return year

# Reduce both date columns to a plain year
for date_column in ('creation_date', 'date_of_birth'):
    merged_df[date_column] = merged_df[date_column].apply(extract_year)


# Inspect the converted columns and their summary statistics
year_columns = merged_df[['creation_date','date_of_birth']]
display(year_columns)
display(year_columns.describe())

In [None]:
# Fill missing creation year when possible
# Age of the painter at the time of painting (NaN when either year is missing)
merged_df['painter_age_at_painting'] = merged_df['creation_date'] - merged_df['date_of_birth']

# Display the updated DataFrame
display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])

# Average age over ALL paintings (one global value, not per painter), truncated to whole years
avg_painter_age = merged_df['painter_age_at_painting'].mean().astype(int)

# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which leaves
# the frame unchanged under copy-on-write.
merged_df['painter_age_at_painting'] = merged_df['painter_age_at_painting'].fillna(avg_painter_age)

# Fill missing creation_date with date_of_birth + avg_painter_age
# (stays NaN when the birth year is missing too)
merged_df['creation_date'] = merged_df['creation_date'].fillna(
    merged_df['date_of_birth'] + avg_painter_age
)

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])
merged_df

In [None]:
# Placeholder labels for missing descriptive fields. The filled column is
# assigned back because fillna(inplace=True) on a column selection is chained
# assignment, which pandas deprecates and which is a no-op under copy-on-write.
placeholder_by_column = {
    'painter': 'Unknown Artist',
    'author_country': 'Unknown Country',
    'location_country': 'Unknown Country',
    'location_name': 'Unknown Location',
}
for column_name, placeholder in placeholder_by_column.items():
    merged_df[column_name] = merged_df[column_name].fillna(placeholder)

# Restrict gender to two categories; any other value becomes missing
merged_df['author_gender'] = merged_df['author_gender'].astype('category')
merged_df['author_gender'] = merged_df['author_gender'].cat.set_categories(['male', 'female'])
# NOTE(review): missing and out-of-category genders are imputed as 'male' - confirm this is intended
merged_df['author_gender'] = merged_df['author_gender'].fillna('male')

In [None]:

# Bucket each creation year into the decade it belongs to (e.g. 1887 -> 1880)
merged_df['decade'] = merged_df['creation_date'].floordiv(10).mul(10)

# Show the new column next to its source, then list the distinct decades
display(merged_df[['painter', 'creation_date', 'decade']])
print(merged_df['decade'].unique())

In [None]:
# Report the historical 'German Reich' label as 'Germany' in both country columns
for country_column in ('location_country', 'author_country'):
    merged_df[country_column] = merged_df[country_column].replace('German Reich', 'Germany')

#### Add time period

In [None]:
def classify_period(decade):
    """
    Map a decade (or year) to the name of its art-historical period.

    Args:
        decade: Numeric decade such as 1880; may be NaN/None when the
            creation date is unknown

    Returns:
        The period label, or "Unknown Period" when the decade is missing
    """
    # NaN fails every comparison below, so without this guard undated
    # paintings would land in the final catch-all branch.
    if pd.isna(decade):
        return "Unknown Period"

    # Each branch only needs its upper bound: earlier branches have already
    # returned for anything below the lower bound.
    if decade < 1000:
        return "Antiquity"
    elif decade < 1400:
        return "Medieval"
    elif decade < 1500:
        return "Early Renaissance"
    elif decade < 1600:
        return "High Renaissance and Mannerism"
    elif decade < 1700:
        return "Baroque"
    elif decade < 1780:
        return "Rococo"
    elif decade < 1850:
        return "Neoclassicism and Romanticism"
    elif decade < 1900:
        return "Realism and Impressionism"
    elif decade < 1945:
        return "Modern Art"
    elif decade < 1970:
        return "Post-War and Abstract Expressionism"
    elif decade < 2000:
        return "Contemporary Art"
    else:
        return "Contemporary and Digital Art"



# Label every painting with the period that its decade falls into
merged_df['time_period'] = merged_df['decade'].apply(classify_period)


#### Add GDP and population

In [None]:
# Load the economic data table; it is joined to the paintings on 'decade' in the next step
eco_df = pd.read_csv('data/gdp_pop_decades.csv')
eco_df

In [None]:
# Attach the economic indicators of each painting's decade.
# Left join: keep all artwork records, even if no economic data exists.
merged_df = pd.merge(merged_df, eco_df, on='decade', how='left')

merged_df

In [None]:

# Min-max scale the 'gdppc' and 'pop' columns into two new *_normalized columns
scaler = MinMaxScaler()
economic_values = merged_df[['gdppc', 'pop']]
scaled_values = scaler.fit_transform(economic_values)
merged_df[['gdppc_normalized', 'pop_normalized']] = scaled_values

# Show raw and scaled values side by side
display(merged_df[['gdppc', 'gdppc_normalized', 'pop', 'pop_normalized']])

## FINAL DF

In [None]:
# Export for CLIP: id, image path and flag of the paintings with a food-word match
has_food_word = merged_df['food_word_detected'] == 1
clip_train = merged_df.loc[has_food_word, ['painting_id','image_path', 'food_word_detected']]
clip_train

In [None]:
# Final table: paintings that have an image URL, restricted to the exported columns
export_columns = [
    'title', 'painter', 'creation_date', 'author_gender', 'author_country',
    'location_name', 'location_country', 'time_period', 'image_path', 'image_url',
    'coordinates', 'food_detected', 'decade', 'gdppc', 'pop',
    'gdppc_normalized', 'pop_normalized',
]
paintings_with_food = merged_df.loc[merged_df['image_url'].notna(), export_columns]
paintings_with_food

## Export

In [None]:
# Write the final table to CSV without the DataFrame index
paintings_with_food.to_csv('data/paintings_with_food.csv', index=False)

### GDP analysis

In [None]:
# Per decade: number of artworks, number flagged as food, and their ratio
detections_by_decade = merged_df.groupby('decade')['food_detected']
food_by_decade = detections_by_decade.agg(artwork_count='count', food_related_sum='sum').reset_index()
food_by_decade['proportion_food_detected'] = food_by_decade['food_related_sum'].div(food_by_decade['artwork_count'])

# Attach the normalized GDP and population values of each decade
economy_by_decade = merged_df[['decade', 'gdppc_normalized', 'pop_normalized']].drop_duplicates()
food_by_decade = pd.merge(food_by_decade, economy_by_decade, on='decade', how='left')

# Keep only decades from 1250 to 2000 (both inclusive)
in_study_range = food_by_decade['decade'].between(1250, 2000)
food_by_decade = food_by_decade[in_study_range]

# Save the per-decade table, then show it
food_by_decade.to_csv('data/food_by_decade_analysis.csv', index=False)
food_by_decade

In [None]:
# Analyse the per-decade table
data = food_by_decade

# Series.corr skips missing values but pearsonr does not, so computing them
# separately can describe two different samples. Drop incomplete decades once
# and derive the coefficient, p-value and sample size from the same rows.
valid = data.dropna(subset=['proportion_food_detected', 'gdppc_normalized'])

# Pearson correlation; the result holds (coefficient, p-value)
correlation_pvalue = stats.pearsonr(valid['proportion_food_detected'], valid['gdppc_normalized'])
correlation = correlation_pvalue[0]

# Calculate summary statistics
summary_stats = {
    'Pearson Correlation': correlation,
    'P-value': correlation_pvalue[1],
    'Sample Size': len(valid),
    'Mean Food Proportion': valid['proportion_food_detected'].mean(),
    'Mean GDP per Capita': valid['gdppc_normalized'].mean(),
}

print("\nCorrelation Analysis Results:")
for key, value in summary_stats.items():
    print(f"{key}: {value:.4f}")


In [None]:
# pearsonr does not skip missing values, so keep only decades where both
# the GDP value and the food proportion are present.
df = data.dropna(subset=['gdppc_normalized', 'proportion_food_detected'])

# Calculate the Pearson correlation coefficient and p-value
x = df['gdppc_normalized']
y = df['proportion_food_detected']

corr_coeff, p_value = pearsonr(x, y)
print(f"Pearson correlation coefficient: {corr_coeff}")
print(f"P-value: {p_value}")


# Interpret the result at the 5% significance level
if p_value < 0.05:
    print("There is a statistically significant correlation between GDP per capita and the proportion of food artworks.")
else:
    print("There is no statistically significant correlation between GDP per capita and the proportion of food artworks.")