# Food in Art

In [625]:
%load_ext autoreload
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [626]:
# NOTE: the star import stays first so the explicit imports below keep winning
# any name clash with whatever `analysis` exports.
from analysis import *
import os
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL


## Loading

In [627]:
# Read the notebook settings from config.yaml. `load_config` is not defined in
# this notebook; presumably it comes from the `analysis` star import.
config = load_config('config.yaml')


In [628]:
# Load every dataset in one call; later cells index the result by name
# ('ids', 'paintings', 'locations', 'authors', 'food_words', 'food_found').
all_data = load_all_datasets(config)

  df = pd.read_csv(source_path)


#### Load correspondence table

In [629]:
# Painting -> author / location link table.
# drop_duplicates() returns a new frame, so the copy held in `all_data` is left
# untouched and re-running this cell always starts from the loaded data.
correspondance_data = all_data['ids'].drop_duplicates()
correspondance_data

Unnamed: 0,painting_id,author_id,location_id
0,Q724861,unknown,Q728116
1,Q727875,Q47551,Q51252
2,Q605863,Q310973,Q338330
3,Q607598,Q723863,Q861252
4,Q607761,Q5432,unknown
...,...,...,...
643641,Q130724770,Q1826320,unknown
643642,Q130724778,Q3768945,unknown
643643,Q130724781,Q334262,unknown
643644,Q130724839,Q5825256,Q1992004


#### Load paintings data

In [630]:
# Painting metadata with every non-missing image URL rewritten by
# get_512px_thumbnail. assign() builds a new frame instead of editing
# all_data['paintings'] in place, so re-running the cell never feeds an
# already-converted URL back into get_512px_thumbnail.
paintings_data = all_data['paintings'].assign(
    image_url=lambda df: df['image_url'].apply(
        lambda url: get_512px_thumbnail(url) if pd.notna(url) else url
    )
)
paintings_data

Unnamed: 0,painting_id,creation_date,title,image_url,time_period
0,Q724861,1612.0,Ashbourne portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown
1,Q727875,1538.0,Venus of Urbino,https://commons.wikimedia.org/w/index.php?titl...,unknown
5,Q605863,1530.0,Portrait of a gentleman in his studio,https://commons.wikimedia.org/w/index.php?titl...,unknown
6,Q607598,1445.0,Virgin of the Councillors,https://commons.wikimedia.org/w/index.php?titl...,unknown
7,Q607761,1793.0,The Death of the Picador,https://commons.wikimedia.org/w/index.php?titl...,unknown
...,...,...,...,...,...
666277,Q130724770,1851.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown
666278,Q130724778,1874.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown
666279,Q130724781,1646.0,Atlas holding up the celestial globe,https://commons.wikimedia.org/w/index.php?titl...,unknown
666280,Q130724839,1928.0,Self Portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown


#### Load locations data

In [631]:
# Location records: id, name, country and coordinates.
locations_data = all_data['locations']
locations_data

Unnamed: 0,location_id,location_name,location_country,coordinates
0,Q728116,Folger Shakespeare Library,United States of America,Point(-77.003172 38.889361)
4,Q51252,Uffizi Gallery,Italy,Point(11.255277777 43.768333333)
7,Q338330,Gallerie dell'Accademia,Italy,Point(12.328139 45.431078)
13,Q861252,Museu Nacional d'Art de Catalunya,Spain,Point(2.153305555 41.368333333)
14,Q1970945,Buffalo AKG Art Museum,United States of America,Point(-78.875618 42.93245)
...,...,...,...,...
20665,Q124695064,unknown,Germany,unknown
20666,Q5548057,"Georgian State Museum of Theatre, Music, Cinem...",Georgia,Point(44.794258 41.715799)
20667,Q3610430,Dadiani Palaces Museum,Georgia,Point(41.87416667 42.51222222)
20668,Q41185,Caen,France,Point(-0.363611111 49.181388888)


#### Load authors data

In [632]:
# Author records: id, painter name, country, year of birth and gender.
authors_data = all_data['authors']
authors_data

Unnamed: 0,author_id,painter,author_country,date_of_birth,author_gender
0,Q310973,Lorenzo Lotto,Republic of Venice,1480.0,male
1,Q723863,Lluís Dalmau,unknown,1428.0,male
2,Q5432,Francisco Goya,Spain,1746.0,male
3,Q37693,Paul Gauguin,France,1848.0,male
4,Q8459,Giorgione,Republic of Venice,1470.0,male
...,...,...,...,...,...
656,Q380708,Dirck van Baburen,Dutch Republic,1595.0,male
658,Q281998,Jean-Baptiste Debret,France,1768.0,male
659,Q3749980,Francesco Filippini,Kingdom of Italy,1853.0,male
660,Q737726,Robert Peake the elder,Kingdom of England,1551.0,male


#### Load ML food data

In [633]:
# Per-painting 0/1 flags for each food category, plus a summary flag that is
# True when at least one integer-typed category column is non-zero.
# assign() leaves all_data['food_words'] unmodified, so the cell is safe to re-run.
food_words = all_data['food_words'].assign(
    food_word_detected=lambda df: df.select_dtypes(include='int').sum(axis=1) > 0
)
food_words

Unnamed: 0,painting_id,image_path,fruit,bread,cookware,seafood,wine,meal,cheese,meat,food,beverage,dairy,vegetable,dessert,food_word_detected
0,Q27064304,img/img_512/Intérieur de cuisine - Joachim Beu...,0,0,0,1,0,0,0,1,0,0,1,1,1,True
1,Q12900365,img/img_512/The Luncheon (SM sg170).png,0,1,0,1,1,1,1,0,0,0,0,0,0,True
2,Q776175,img/img_512/Pieter Bruegel the Elder- The Harv...,0,0,0,0,0,1,0,1,1,1,0,1,0,True
3,Q72701665,img/img_512/Lille PdBA quellin fyt jesus marth...,0,0,1,1,0,0,0,1,0,0,0,1,1,True
4,Q20532659,"img/img_512/OA Hermansen, Et frokostbord, 1884...",0,0,0,0,0,1,1,0,0,1,0,0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71165,Q51247485,"img/img_512/Alex Colville - Infantry, near Nij...",0,0,0,0,0,0,0,0,0,0,0,0,0,False
71166,Q51244389,img/img_512/Ivan Žabota - dekliški portret.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,False
71167,Q51235353,img/img_512/Ivan Žabota - Marta Krásovej.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,False
71168,Q51265369,img/img_512/Ivan Žabota - ženski portret.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,False


In [634]:
# Image-model predictions per painting, plus a flag for "something was predicted".
# NOTE(review): the `> 3` threshold looks like it treats `predictions` as the
# text form of a list, where '[]' has length 2 -- confirm the column type and
# whether `> 2` or a parsed-list check was intended.
# assign() leaves all_data['food_found'] unmodified, so the cell is safe to re-run.
food_found = all_data['food_found'].assign(
    food_image_detected=lambda df: df['predictions'].apply(lambda pred: len(pred) > 3)
)
food_found

Unnamed: 0,painting_id,predictions,food_image_detected
0,Q27064304,[],False
1,Q776175,[],False
2,Q12900365,[],False
3,Q27974915,[],False
4,Q20532659,[],False
...,...,...,...
65084,Q51243217,[],False
65085,Q51247485,[],False
65086,Q51235353,[],False
65087,Q51265369,[],False


### SQL upload

In [635]:
# Pull the MySQL root password from the environment (optionally via a .env file)
load_dotenv()
password = os.getenv('PASSWORD')

# Fail here with a clear message instead of later with an opaque
# authentication error caused by a missing password.
if password is None:
    raise RuntimeError(
        "PASSWORD is not set; define it in the environment or in a .env file "
        "before connecting to MySQL"
    )

In [636]:

# Name of the MySQL database that will hold the project tables
database_name = 'art_and_food_db'

# Server-level connection (no database selected yet). The password is passed as
# a separate URL component so characters such as '@', ':' or '/' cannot corrupt
# the connection string.
engine = create_engine(
    URL.create('mysql+pymysql', username='root', password=password, host='localhost')
)

# Create the database if it doesn't exist
with engine.connect() as conn:
    conn.execute(text(f'CREATE DATABASE IF NOT EXISTS {database_name}'))

# This engine is only needed for the bootstrap statement above; release its
# pooled connections before it is replaced by the database-level engine.
engine.dispose()

In [637]:
# Connect to the newly created database. The URL is built from components so a
# password containing '@', ':' or '/' is handled correctly.
engine = create_engine(
    URL.create(
        'mysql+pymysql',
        username='root',
        password=password,
        host='localhost',
        database=database_name,
    )
)

In [638]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Date, DateTime, Float, ForeignKey, Text
#from geoalchemy2 import Geometry
#from datetime import datetime

# Registry that collects every table declared below; create_tables() emits
# the DDL for whatever is registered here.
metadata = MetaData()

# One row per painting
paintings = Table(
    'paintings',
    metadata,
    Column('painting_id', String(50), primary_key=True),
    Column('creation_date', Integer),
    Column('title', String(255)),
    Column('image_url', Text),
    Column('time_period', String(100)),
)

# One row per holding location
locations = Table(
    'locations',
    metadata,
    Column('location_id', String(50), primary_key=True),
    Column('location_name', String(255)),
    Column('location_country', String(100)),
    Column('coordinates', String(100)),
)

# One row per author
authors = Table(
    'authors',
    metadata,
    Column('author_id', String(50), primary_key=True),
    Column('painter', String(255)),
    Column('author_country', String(100)),
    Column('date_of_birth', Integer),
    Column('author_gender', String(50)),
)

# Link table painting -> author / location, declared without foreign keys
correspondence = Table(
    'correspondence',
    metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('painting_id', String(50)),
    Column('author_id', String(50)),
    Column('location_id', String(50)),
)
# Disabled variant of the link table with foreign-key constraints, kept as an
# inert string literal for reference.
""" correspondence = Table('correspondence', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('painting_id', String(50), ForeignKey('paintings.painting_id'), nullable=False),
    Column('author_id', String(50), ForeignKey('authors.author_id')),
    Column('location_id', String(50), ForeignKey('locations.location_id'))
) """

def create_tables(engine):
    """Emit CREATE TABLE statements for every table registered on ``metadata``.

    Args:
        engine: SQLAlchemy engine bound to the target database.
    """
    metadata.create_all(bind=engine)
    

def _insert_frame(connection, table, frame):
    """Bulk-insert the rows of ``frame`` that have no missing values into ``table``."""
    records = frame.dropna(how='any').to_dict(orient='records')
    if records:  # nothing to send when every row was dropped
        connection.execute(table.insert(), records)


def insert_data(engine, paintings_df, locations_df, authors_df, correspondence_df):
    """Load the four DataFrames into their tables inside a single transaction.

    Tables are filled in the order locations, authors, paintings,
    correspondence. Rows containing any missing value are skipped.

    Args:
        engine: SQLAlchemy engine bound to the target database.
        paintings_df: rows for the ``paintings`` table.
        locations_df: rows for the ``locations`` table.
        authors_df: rows for the ``authors`` table.
        correspondence_df: rows for the ``correspondence`` table.

    Raises:
        Whatever the database driver / SQLAlchemy raises on a failed insert;
        in that case nothing from this call is committed.
    """
    load_plan = (
        (locations, locations_df),
        (authors, authors_df),
        (paintings, paintings_df),
        (correspondence, correspondence_df),
    )

    # engine.begin() commits when the block succeeds, rolls back when it raises
    # and always releases the connection, replacing the manual
    # connect / commit / rollback / close sequence.
    with engine.begin() as connection:
        for table, frame in load_plan:
            _insert_frame(connection, table, frame)


In [639]:
# Create the tables declared on `metadata` in the connected database
# (SQLAlchemy's create_all skips tables that already exist by default).
create_tables(engine)


In [640]:
# Insert data
# NOTE(review): insert_data issues plain INSERTs, so re-running this cell against
# an already populated database will presumably fail on the primary-key columns
# -- confirm whether the tables should be emptied first.
insert_data(engine, paintings_data, locations_data, authors_data, correspondance_data)


### Merging

In [None]:
# Start from the link table, keep only paintings that have metadata (inner join),
# then attach author, location and both food-detection tables (left joins).
merged_df = (
    correspondance_data
    .merge(paintings_data, on='painting_id', how='inner')
    .merge(authors_data, on='author_id', how='left')
    .merge(locations_data, on='location_id', how='left')
    .merge(food_words, on='painting_id', how='left')
    .merge(food_found, on='painting_id', how='left')
)
merged_df

### Cleaning

### Duplicates

In [None]:
# Keep a single row per painting (the first one encountered); presumably the
# joins above can yield several rows for the same painting_id -- verify.
merged_df = merged_df.drop_duplicates(subset='painting_id', keep='first')

### Merging detection flags

In [None]:
# The left joins leave missing values in both flag columns for paintings that
# have no detection row. Comparing with True turns each column into a plain
# boolean Series (missing -> False), so the OR below is explicit and symmetric
# instead of depending on how pandas treats NaN in `|`.
word_hit = merged_df['food_word_detected'].eq(True)
image_hit = merged_df['food_image_detected'].eq(True)

# 1 when either detector found food, else 0
merged_df['food_detected'] = (word_hit | image_hit).astype(int)
merged_df

### Pruning

In [None]:
# Keep only paintings that have a local image path. The explicit copy makes the
# result an independent frame, so the column assignments in later cells do not
# run into pandas' SettingWithCopy behaviour.
merged_df = merged_df.loc[merged_df['image_path'].notna()].copy()

In [None]:
# Class balance of the combined flag and of each individual detector
for flag_col in ('food_detected', 'food_word_detected', 'food_image_detected'):
    print(merged_df[flag_col].value_counts())

### Enhancing

In [None]:

year_cols = ['creation_date', 'date_of_birth']

# Before: raw values and their summary statistics
display(merged_df[year_cols])
display(merged_df[year_cols].describe())

# Run both date columns through extract_year
for year_col in year_cols:
    merged_df[year_col] = merged_df[year_col].apply(extract_year)

# After: same views, for comparison
display(merged_df[year_cols])
display(merged_df[year_cols].describe())

In [None]:
# Fill missing creation year when possible
# Painter's age when each work was made (missing wherever either year is missing)
merged_df['painter_age_at_painting'] = merged_df['creation_date'] - merged_df['date_of_birth']

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])

# Global mean age over all paintings (not per painter), truncated to whole years.
# int() raises if the mean is NaN instead of silently producing a garbage value.
avg_painter_age = int(merged_df['painter_age_at_painting'].mean())

# Assign the filled Series back: `df[col].fillna(..., inplace=True)` works on a
# temporary object and has no effect under pandas Copy-on-Write.
merged_df['painter_age_at_painting'] = merged_df['painter_age_at_painting'].fillna(avg_painter_age)

# Fill missing creation_date with date_of_birth + avg_painter_age
merged_df['creation_date'] = merged_df['creation_date'].fillna(
    merged_df['date_of_birth'] + avg_painter_age
)

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])
merged_df

In [None]:
# Placeholder labels for missing text fields. Each filled Series is assigned
# back because `df[col].fillna(..., inplace=True)` acts on a temporary object
# and has no effect under pandas Copy-on-Write.
missing_labels = {
    'painter': 'Unknown Artist',
    'author_country': 'Unknown Country',
    'location_country': 'Unknown Country',
    'location_name': 'Unknown Location',
}
for label_col, label in missing_labels.items():
    merged_df[label_col] = merged_df[label_col].fillna(label)

# Restrict gender to two categories; any other value becomes missing and is
# then filled with 'male'.
# NOTE(review): this relabels every unknown/other gender as 'male', which skews
# any gender breakdown -- confirm this imputation is intended.
gender = merged_df['author_gender'].astype('category').cat.set_categories(['male', 'female'])
merged_df['author_gender'] = gender.fillna('male')

#### Add decades

In [None]:

# Floor every creation year to the first year of its decade
creation_year = merged_df['creation_date']
merged_df['decade'] = creation_year.floordiv(10).mul(10)

# Show the result next to the source column, then list the distinct decades
decade_preview = merged_df[['painter', 'creation_date', 'decade']]
display(decade_preview)
print(merged_df['decade'].unique())

In [None]:
# Fold the historical label 'German Reich' into 'Germany' in both country columns
for country_col in ('location_country', 'author_country'):
    merged_df[country_col] = merged_df[country_col].replace('German Reich', 'Germany')

#### Add time period

In [None]:

# Overwrite the loaded time_period column (shown as 'unknown' in the paintings
# preview above) with a label derived from each row's decade via classify_period.
merged_df['time_period'] = merged_df['decade'].apply(classify_period)


#### Add GDP and population

In [None]:
# Economic indicators per decade; later cells join this on 'decade' and read
# its 'gdppc' and 'pop' columns.
eco_df = pd.read_csv('data/gdp_pop_decades.csv')
eco_df

In [None]:
# Attach the per-decade economic indicators.
# Any indicator columns left over from a previous run of this cell are dropped
# first; otherwise a re-run would produce *_x / *_y suffixed duplicates and the
# later `merged_df[['gdppc', 'pop']]` lookups would fail.
# NOTE(review): assumes eco_df shares no column name other than 'decade' with
# the painting data -- confirm.
eco_value_cols = [col for col in eco_df.columns if col != 'decade']
merged_df = (
    merged_df
    .drop(columns=eco_value_cols, errors='ignore')
    .merge(eco_df, on='decade', how='left')
)

merged_df

In [None]:

scaler = MinMaxScaler()

# Rescale GDP per capita and population to [0, 1] over the rows of merged_df
raw_cols = ['gdppc', 'pop']
scaled_cols = ['gdppc_normalized', 'pop_normalized']
scaled_values = scaler.fit_transform(merged_df[raw_cols])
merged_df[scaled_cols] = scaled_values

display(merged_df[['gdppc', 'gdppc_normalized', 'pop', 'pop_normalized']])

## Final DataFrame

In [None]:
# Columns exported for downstream use, in output order
final_columns = [
    'title', 'painter', 'creation_date', 'author_gender', 'author_country',
    'location_name', 'location_country', 'time_period', 'image_path', 'image_url',
    'coordinates', 'food_detected', 'decade', 'gdppc', 'pop',
    'gdppc_normalized', 'pop_normalized',
]

# Only paintings that have an image URL
has_image_url = merged_df['image_url'].notna()
paintings_with_food = merged_df.loc[has_image_url, final_columns]
paintings_with_food

## Export

In [None]:
# Write the final table to disk without the DataFrame index column.
paintings_with_food.to_csv('data/paintings_with_food.csv', index=False)

### GDP analysis

In [None]:
# Per decade: number of artworks with a food flag, and how many are food-related
decade_groups = merged_df.groupby('decade')['food_detected']
food_by_decade = decade_groups.agg(artwork_count='count', food_related_sum='sum').reset_index()

# Share of food-related artworks in each decade
food_by_decade = food_by_decade.assign(
    proportion_food_detected=food_by_decade['food_related_sum'] / food_by_decade['artwork_count']
)

# Bring in the normalized GDP and population values, one distinct row per combination
eco_by_decade = merged_df[['decade', 'gdppc_normalized', 'pop_normalized']].drop_duplicates()
food_by_decade = food_by_decade.merge(eco_by_decade, on='decade', how='left')

# Restrict to decades 1250 through 2000 (both ends included)
food_by_decade = food_by_decade[food_by_decade['decade'].between(1250, 2000)]


In [None]:
data = food_by_decade

# Decades without GDP data carry NaN after the left join. Series.corr drops such
# pairs silently while stats.pearsonr does not, so both are computed from the
# same NaN-free sample to keep r, the p-value and the sample size consistent.
paired = data[['proportion_food_detected', 'gdppc_normalized']].dropna()

# pearsonr returns (r, p-value)
correlation_pvalue = stats.pearsonr(paired['proportion_food_detected'], paired['gdppc_normalized'])
correlation = correlation_pvalue[0]

# Summary statistics, all describing the paired sample used for the correlation
summary_stats = {
    'Pearson Correlation': correlation,
    'P-value': correlation_pvalue[1],
    'Sample Size': len(paired),
    'Mean Food Proportion': paired['proportion_food_detected'].mean(),
    'Mean GDP per Capita': paired['gdppc_normalized'].mean(),
}

print("\nCorrelation Analysis Results:")
for key, value in summary_stats.items():
    print(f"{key}: {value:.4f}")
