# Food in Art

In [23]:
%load_ext autoreload
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from analysis import *
import os
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from dotenv import load_dotenv
from sqlalchemy import create_engine, text


## Loading

In [25]:
config = load_config('config.yaml')


In [26]:
all_data = load_all_datasets(config)

  df = pd.read_csv(source_path)


#### Load correspondance table

In [27]:
correspondance_data = all_data['ids']
correspondance_data.drop_duplicates(inplace=True)
correspondance_data

Unnamed: 0,painting_id,author_id,location_id
0,Q724861,unknown,Q728116
1,Q727875,Q47551,Q51252
2,Q605863,Q310973,Q338330
3,Q607598,Q723863,Q861252
4,Q607761,Q5432,unknown
...,...,...,...
643641,Q130724770,Q1826320,unknown
643642,Q130724778,Q3768945,unknown
643643,Q130724781,Q334262,unknown
643644,Q130724839,Q5825256,Q1992004


#### Load paintings data

In [28]:
paintings_data = all_data['paintings']
paintings_data['image_url'] = paintings_data['image_url'].apply(lambda x: get_512px_thumbnail(x) if pd.notna(x) else x)
paintings_data

Unnamed: 0,painting_id,creation_date,title,image_url,time_period
0,Q724861,1612.0,Ashbourne portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown
1,Q727875,1538.0,Venus of Urbino,https://commons.wikimedia.org/w/index.php?titl...,unknown
5,Q605863,1530.0,Portrait of a gentleman in his studio,https://commons.wikimedia.org/w/index.php?titl...,unknown
6,Q607598,1445.0,Virgin of the Councillors,https://commons.wikimedia.org/w/index.php?titl...,unknown
7,Q607761,1793.0,The Death of the Picador,https://commons.wikimedia.org/w/index.php?titl...,unknown
...,...,...,...,...,...
666277,Q130724770,1851.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown
666278,Q130724778,1874.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown
666279,Q130724781,1646.0,Atlas holding up the celestial globe,https://commons.wikimedia.org/w/index.php?titl...,unknown
666280,Q130724839,1928.0,Self Portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown


#### Load locations data

In [29]:
locations_data = all_data['locations']
locations_data

Unnamed: 0,location_id,location_name,location_country,coordinates
0,Q728116,Folger Shakespeare Library,United States of America,Point(-77.003172 38.889361)
4,Q51252,Uffizi Gallery,Italy,Point(11.255277777 43.768333333)
7,Q338330,Gallerie dell'Accademia,Italy,Point(12.328139 45.431078)
13,Q861252,Museu Nacional d'Art de Catalunya,Spain,Point(2.153305555 41.368333333)
14,Q1970945,Buffalo AKG Art Museum,United States of America,Point(-78.875618 42.93245)
...,...,...,...,...
20665,Q124695064,unknown,Germany,unknown
20666,Q5548057,"Georgian State Museum of Theatre, Music, Cinem...",Georgia,Point(44.794258 41.715799)
20667,Q3610430,Dadiani Palaces Museum,Georgia,Point(41.87416667 42.51222222)
20668,Q41185,Caen,France,Point(-0.363611111 49.181388888)


#### Load authors data

In [30]:
authors_data = all_data['authors']
authors_data

Unnamed: 0,author_id,painter,author_country,date_of_birth,author_gender
0,Q310973,Lorenzo Lotto,Republic of Venice,1480.0,male
1,Q723863,Lluís Dalmau,unknown,1428.0,male
2,Q5432,Francisco Goya,Spain,1746.0,male
3,Q37693,Paul Gauguin,France,1848.0,male
4,Q8459,Giorgione,Republic of Venice,1470.0,male
...,...,...,...,...,...
656,Q380708,Dirck van Baburen,Dutch Republic,1595.0,male
658,Q281998,Jean-Baptiste Debret,France,1768.0,male
659,Q3749980,Francesco Filippini,Kingdom of Italy,1853.0,male
660,Q737726,Robert Peake the elder,Kingdom of England,1551.0,male


#### Load ML food data

In [31]:
food_words = all_data['food_words']
food_words['food_word_detected'] = food_words.select_dtypes(include='int').sum(axis=1) > 0
food_words

Unnamed: 0,painting_id,image_path,fruit,bread,cookware,seafood,wine,meal,cheese,meat,food,beverage,dairy,vegetable,dessert,food_word_detected
0,Q27064304,img/img_512/Intérieur de cuisine - Joachim Beu...,0,0,0,1,0,0,0,1,0,0,1,1,1,True
1,Q12900365,img/img_512/The Luncheon (SM sg170).png,0,1,0,1,1,1,1,0,0,0,0,0,0,True
2,Q776175,img/img_512/Pieter Bruegel the Elder- The Harv...,0,0,0,0,0,1,0,1,1,1,0,1,0,True
3,Q72701665,img/img_512/Lille PdBA quellin fyt jesus marth...,0,0,1,1,0,0,0,1,0,0,0,1,1,True
4,Q20532659,"img/img_512/OA Hermansen, Et frokostbord, 1884...",0,0,0,0,0,1,1,0,0,1,0,0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71165,Q51247485,"img/img_512/Alex Colville - Infantry, near Nij...",0,0,0,0,0,0,0,0,0,0,0,0,0,False
71166,Q51244389,img/img_512/Ivan Žabota - dekliški portret.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,False
71167,Q51235353,img/img_512/Ivan Žabota - Marta Krásovej.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,False
71168,Q51265369,img/img_512/Ivan Žabota - ženski portret.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,False


In [32]:
food_found = all_data['food_found']
food_found['food_image_detected'] = food_found['predictions'].apply(lambda x: len(x) > 3)
food_found

Unnamed: 0,painting_id,predictions,food_image_detected
0,Q27064304,[],False
1,Q776175,[],False
2,Q12900365,[],False
3,Q27974915,[],False
4,Q20532659,[],False
...,...,...,...
65084,Q51243217,[],False
65085,Q51247485,[],False
65086,Q51235353,[],False
65087,Q51265369,[],False


In [33]:
food_detected_df = food_words.merge(food_found, on='painting_id', how='outer')
food_detected_df['food_detected'] = (food_detected_df['food_word_detected'] | food_detected_df['food_image_detected']).astype(int)
food_detected_df = food_detected_df[['painting_id', 'food_detected']]

### SQL upload

In [34]:
load_dotenv()
password = os.getenv('PASSWORD')

In [35]:

# Create the Database
database_name = 'art_and_food_db'

# Set Up Database Connection
engine = create_engine(f'mysql+pymysql://root:{password}@localhost')

# Create Database if it Doesn't Exist
with engine.connect() as conn:
    conn.execute(text(f'CREATE DATABASE IF NOT EXISTS {database_name}'))

In [36]:
# Connect to the Newly Created Database
engine = create_engine(
    f'mysql+pymysql://root:{password}@localhost/{database_name}')

In [37]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Date, DateTime, Float, ForeignKey, Text, Boolean
#from geoalchemy2 import Geometry
#from datetime import datetime

# Create MetaData instance
metadata = MetaData()

# Define tables
paintings = Table('paintings', metadata,
    Column('painting_id', String(50), primary_key=True),
    Column('creation_date', Integer),
    Column('title', String(255)),
    Column('image_url', Text),
    Column('time_period', String(100))
)

locations = Table('locations', metadata,
    Column('location_id', String(50), primary_key=True),
    Column('location_name', String(255)),
    Column('location_country', String(100)),
    Column('coordinates', String(100))
)

authors = Table('authors', metadata,
    Column('author_id', String(50), primary_key=True),
    Column('painter', String(255)),
    Column('author_country', String(100)),
    Column('date_of_birth', Integer),
    Column('author_gender', String(50))
)

food_detected = Table('food_detected', metadata,
    Column('painting_id', String(50), primary_key=True),
    Column('food_detected', Boolean),
)


correspondence = Table('correspondence', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('painting_id', String(50)),
    Column('author_id', String(50)),
    Column('location_id', String(50))
)

""" food_detected = Table('food_detected', metadata,
    Column('painting_id', String(50), ForeignKey('paintings.painting_id'), primary_key=True),
    Column('food_detected', Boolean),
)

correspondence = Table('correspondence', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('painting_id', String(50), ForeignKey('paintings.painting_id'), nullable=False),
    Column('author_id', String(50), ForeignKey('authors.author_id')),
    Column('location_id', String(50), ForeignKey('locations.location_id'))
) """

def create_tables(engine):
    metadata.create_all(engine)
    

def insert_data(engine, paintings_df, locations_df, authors_df, food_detected_df, correspondence_df):
    connection = engine.connect()
    
    try:
        # Insert Locations
        #locations_df = locations_df.dropna(how='any')
        location_data = locations_df.to_dict(orient='records')
        if location_data:
            connection.execute(locations.insert(), location_data)
        
        # Insert Authors
        #authors_df = authors_df.dropna(how='any')
        author_data = authors_df.to_dict(orient='records')
        if author_data:
            connection.execute(authors.insert(), author_data)
        
        # Insert Paintings
        #paintings_df = paintings_df.dropna(how='any')
        painting_data = paintings_df.to_dict(orient='records')
        if painting_data:
            connection.execute(paintings.insert(), painting_data)
        
        #paintings_df = paintings_df.dropna(how='any')
        food_detected_data = food_detected_df.to_dict(orient='records')
        if painting_data:
            connection.execute(food_detected.insert(), food_detected_data)
            
        # Insert Correspondence
        #correspondence_df = correspondence_df.dropna(how='any')
        correspondence_data = correspondence_df.to_dict(orient='records')
        if correspondence_data:
            connection.execute(correspondence.insert(), correspondence_data)
        
        connection.commit()
    
    except Exception as e:
        connection.rollback()
        raise e
    
    finally:
        connection.close()


### Merging

In [38]:
merged_df = correspondance_data.merge(paintings_data, on='painting_id', how='inner')
merged_df = merged_df.merge(authors_data, on='author_id', how='left')
merged_df = merged_df.merge(locations_data, on='location_id', how='left')
merged_df = merged_df.merge(food_words, on='painting_id', how='left')
merged_df = merged_df.merge(food_found, on='painting_id', how='left')
merged_df

Unnamed: 0,painting_id,author_id,location_id,creation_date,title,image_url,time_period,painter,author_country,date_of_birth,...,cheese,meat,food,beverage,dairy,vegetable,dessert,food_word_detected,predictions,food_image_detected
0,Q724861,unknown,Q728116,1612.0,Ashbourne portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False
1,Q727875,Q47551,Q51252,1538.0,Venus of Urbino,https://commons.wikimedia.org/w/index.php?titl...,unknown,Titian,Republic of Venice,1490.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False
2,Q605863,Q310973,Q338330,1530.0,Portrait of a gentleman in his studio,https://commons.wikimedia.org/w/index.php?titl...,unknown,Lorenzo Lotto,Republic of Venice,1480.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False
3,Q607598,Q723863,Q861252,1445.0,Virgin of the Councillors,https://commons.wikimedia.org/w/index.php?titl...,unknown,Lluís Dalmau,unknown,1428.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False
4,Q607761,Q5432,unknown,1793.0,The Death of the Picador,https://commons.wikimedia.org/w/index.php?titl...,unknown,Francisco Goya,Spain,1746.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643641,Q130724770,Q1826320,unknown,1851.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,,,,,,,,,,
643642,Q130724778,Q3768945,unknown,1874.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,,,,,,,,,,
643643,Q130724781,Q334262,unknown,1646.0,Atlas holding up the celestial globe,https://commons.wikimedia.org/w/index.php?titl...,unknown,Guercino,Papal States,1591.0,...,,,,,,,,,,
643644,Q130724839,Q5825256,Q1992004,1928.0,Self Portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,,,,,,,,,,


### Cleaning

### Duplicates

In [39]:
merged_df = merged_df.drop_duplicates(subset='painting_id', keep='first')

### Merging

In [40]:
merged_df['food_detected'] = (merged_df['food_word_detected'] | merged_df['food_image_detected']).astype(int)
merged_df

Unnamed: 0,painting_id,author_id,location_id,creation_date,title,image_url,time_period,painter,author_country,date_of_birth,...,meat,food,beverage,dairy,vegetable,dessert,food_word_detected,predictions,food_image_detected,food_detected
0,Q724861,unknown,Q728116,1612.0,Ashbourne portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False,0
1,Q727875,Q47551,Q51252,1538.0,Venus of Urbino,https://commons.wikimedia.org/w/index.php?titl...,unknown,Titian,Republic of Venice,1490.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False,0
2,Q605863,Q310973,Q338330,1530.0,Portrait of a gentleman in his studio,https://commons.wikimedia.org/w/index.php?titl...,unknown,Lorenzo Lotto,Republic of Venice,1480.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False,0
3,Q607598,Q723863,Q861252,1445.0,Virgin of the Councillors,https://commons.wikimedia.org/w/index.php?titl...,unknown,Lluís Dalmau,unknown,1428.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False,0
4,Q607761,Q5432,unknown,1793.0,The Death of the Picador,https://commons.wikimedia.org/w/index.php?titl...,unknown,Francisco Goya,Spain,1746.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,[],False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643641,Q130724770,Q1826320,unknown,1851.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,,,,,,,,,,0
643642,Q130724778,Q3768945,unknown,1874.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,,,,,,,,,,0
643643,Q130724781,Q334262,unknown,1646.0,Atlas holding up the celestial globe,https://commons.wikimedia.org/w/index.php?titl...,unknown,Guercino,Papal States,1591.0,...,,,,,,,,,,0
643644,Q130724839,Q5825256,Q1992004,1928.0,Self Portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,,,,,,,,,,0


### Pruning

In [41]:
merged_df = merged_df[merged_df['image_path'].notna()]

In [42]:
print(merged_df['food_detected'].value_counts())
print(merged_df['food_word_detected'].value_counts())
print(merged_df['food_image_detected'].value_counts())

food_detected
0    66905
1     4265
Name: count, dtype: int64
food_word_detected
False    69763
True      1407
Name: count, dtype: int64
food_image_detected
False    61710
True      3262
Name: count, dtype: int64


### Enhancing

In [43]:
# Fill missing creation year when possible
# Calculate the age of the painter at the time of painting
merged_df['painter_age_at_painting'] = merged_df['creation_date'] - merged_df['date_of_birth']

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])

# Calculate the average painter_age_at_painting for each painter
avg_painter_age = merged_df['painter_age_at_painting'].mean().astype(int)

merged_df['painter_age_at_painting'].fillna(avg_painter_age, inplace=True)

# Fill missing creation_date with date_of_birth + avg_painter_age
merged_df['creation_date'].fillna(merged_df['date_of_birth'] + avg_painter_age, inplace=True)

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])
merged_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['painter_age_at_painting'] = merged_df['creation_date'] - merged_df['date_of_birth']


Unnamed: 0,painter,creation_date,date_of_birth,painter_age_at_painting
0,,1612.0,,
1,Titian,1538.0,1490.0,48.0
2,Lorenzo Lotto,1530.0,1480.0,50.0
3,Lluís Dalmau,1445.0,1428.0,17.0
4,Francisco Goya,1793.0,1746.0,47.0
...,...,...,...,...
641646,,1913.0,,
641925,,1900.0,,
642132,,1900.0,,
643066,,1700.0,,


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['painter_age_at_painting'].fillna(avg_painter_age, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['painter_age_at_painting'].fillna(avg_painter_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: va

Unnamed: 0,painter,creation_date,date_of_birth,painter_age_at_painting
0,,1612.0,,43.0
1,Titian,1538.0,1490.0,48.0
2,Lorenzo Lotto,1530.0,1480.0,50.0
3,Lluís Dalmau,1445.0,1428.0,17.0
4,Francisco Goya,1793.0,1746.0,47.0
...,...,...,...,...
641646,,1913.0,,43.0
641925,,1900.0,,43.0
642132,,1900.0,,43.0
643066,,1700.0,,43.0


Unnamed: 0,painting_id,author_id,location_id,creation_date,title,image_url,time_period,painter,author_country,date_of_birth,...,food,beverage,dairy,vegetable,dessert,food_word_detected,predictions,food_image_detected,food_detected,painter_age_at_painting
0,Q724861,unknown,Q728116,1612.0,Ashbourne portrait,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,0.0,0.0,False,[],False,0,43.0
1,Q727875,Q47551,Q51252,1538.0,Venus of Urbino,https://commons.wikimedia.org/w/index.php?titl...,unknown,Titian,Republic of Venice,1490.0,...,0.0,0.0,0.0,0.0,0.0,False,[],False,0,48.0
2,Q605863,Q310973,Q338330,1530.0,Portrait of a gentleman in his studio,https://commons.wikimedia.org/w/index.php?titl...,unknown,Lorenzo Lotto,Republic of Venice,1480.0,...,0.0,0.0,0.0,0.0,0.0,False,[],False,0,50.0
3,Q607598,Q723863,Q861252,1445.0,Virgin of the Councillors,https://commons.wikimedia.org/w/index.php?titl...,unknown,Lluís Dalmau,unknown,1428.0,...,0.0,0.0,0.0,0.0,0.0,False,[],False,0,17.0
4,Q607761,Q5432,unknown,1793.0,The Death of the Picador,https://commons.wikimedia.org/w/index.php?titl...,unknown,Francisco Goya,Spain,1746.0,...,0.0,0.0,0.0,0.0,0.0,False,[],False,0,47.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641646,Q130379774,Q112722072,Q163804,1913.0,"Still Life with Flowers, Onions and Stoneware Jug",https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,0.0,1.0,True,[],False,1,43.0
641925,Q130408122,Q130408058,Q3330195,1900.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,0.0,0.0,True,[],False,1,43.0
642132,Q130431179,Q130408058,Q3330195,1900.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,1.0,0.0,True,[],False,1,43.0
643066,Q129264357,Q19569575,Q965780,1700.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,unknown,,,,...,0.0,0.0,0.0,0.0,0.0,True,[],False,1,43.0


In [44]:
merged_df['painter'].fillna('Unknown Artist', inplace=True)
merged_df['author_country'].fillna('Unknown Country', inplace=True)
merged_df['location_country'].fillna('Unknown Country', inplace=True)
merged_df['location_name'].fillna('Unknown Location', inplace=True)

merged_df['author_gender'] = merged_df['author_gender'].astype('category')
merged_df['author_gender'] = merged_df['author_gender'].cat.set_categories(['male', 'female'])
merged_df['author_gender'].fillna('male', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['painter'].fillna('Unknown Artist', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['painter'].fillna('Unknown Artist', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col

#### Add decades

In [45]:

# Add a column with decades
merged_df['decade'] = (merged_df['creation_date'] // 10) * 10

# Display the updated DataFrame
display(merged_df[['painter', 'creation_date', 'decade']])
print(merged_df['decade'].unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['decade'] = (merged_df['creation_date'] // 10) * 10


Unnamed: 0,painter,creation_date,decade
0,Unknown Artist,1612.0,1610.0
1,Titian,1538.0,1530.0
2,Lorenzo Lotto,1530.0,1530.0
3,Lluís Dalmau,1445.0,1440.0
4,Francisco Goya,1793.0,1790.0
...,...,...,...
641646,Unknown Artist,1913.0,1910.0
641925,Unknown Artist,1900.0,1900.0
642132,Unknown Artist,1900.0,1900.0
643066,Unknown Artist,1700.0,1700.0


[1610. 1530. 1440. 1790. 1890. 1650.  200. 1850. 1500. 1620. 1450. 1900.
 1880. 1700. 1600. 1840. 1640. 1490. 1430. 1780. 1870. 1470. 1560. 1540.
 1820. 1800. 1660. 1630. 1510. 1710. 1550. 1460. 1920. 1770. 1590. 1120.
 1480. 2020. 1380. 1860. 1390. 1910. 1420. 1830. 1670. 1280. 1760. 1520.
 1400. 1150. 1810. 1720. 1340. 1940. 1300.  -50. 1410. 1930. 1290. 1250.
 1580.   nan 1570. 1960.  -60. 1680.  700. 1970. 1330. 1750. 1320. 1740.
 1990. 1370. 1230. 1690. 1200. 2000. 1260. 1950. 1730. 1310. 1360. 1270.
 1350. 1210. 1170. 1980. 1130.  800. 2010. -200.  960.    0.  160. 1190.
 1240. 1220. 1000.  350. 1100.  900.  950. 1160.  750.  850. -170. -300.
  220. 1180.  150.  -70. 1140. 1110. 1040. 1060. 1080. -130.]


In [46]:
merged_df['location_country'] = merged_df['location_country'].replace('German Reich', 'Germany')
merged_df['author_country'] = merged_df['author_country'].replace('German Reich', 'Germany')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['location_country'] = merged_df['location_country'].replace('German Reich', 'Germany')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['author_country'] = merged_df['author_country'].replace('German Reich', 'Germany')


#### Add time period

In [47]:

merged_df['time_period'] = merged_df['decade'].apply(classify_period)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['time_period'] = merged_df['decade'].apply(classify_period)


#### Add gdp and pop

In [48]:
eco_df = pd.read_csv('data/gdp_pop_decades.csv')
eco_df

Unnamed: 0,decade,gdppc,pop
0,0,1070.357143,4685.348837
1,730,1075.946667,4729.143601
2,1000,1069.459500,4775.957447
3,1090,1221.711000,4825.790374
4,1150,997.385800,4878.642383
...,...,...,...
78,1980,8730.836597,30123.380994
79,1990,9835.542353,35134.994373
80,2000,14235.135654,39809.634402
81,2010,18242.708915,44670.518233


In [49]:
merged_df = merged_df.merge(
    eco_df,
    on='decade',
    how='left' 
)

merged_df

Unnamed: 0,painting_id,author_id,location_id,creation_date,title,image_url,time_period,painter,author_country,date_of_birth,...,vegetable,dessert,food_word_detected,predictions,food_image_detected,food_detected,painter_age_at_painting,decade,gdppc,pop
0,Q724861,unknown,Q728116,1612.0,Ashbourne portrait,https://commons.wikimedia.org/w/index.php?titl...,Baroque,Unknown Artist,Unknown Country,,...,0.0,0.0,False,[],False,0,43.0,1610.0,1537.388656,10227.003496
1,Q727875,Q47551,Q51252,1538.0,Venus of Urbino,https://commons.wikimedia.org/w/index.php?titl...,High Renaissance and Mannerism,Titian,Republic of Venice,1490.0,...,0.0,0.0,False,[],False,0,48.0,1530.0,1633.617645,8324.964223
2,Q605863,Q310973,Q338330,1530.0,Portrait of a gentleman in his studio,https://commons.wikimedia.org/w/index.php?titl...,High Renaissance and Mannerism,Lorenzo Lotto,Republic of Venice,1480.0,...,0.0,0.0,False,[],False,0,50.0,1530.0,1633.617645,8324.964223
3,Q607598,Q723863,Q861252,1445.0,Virgin of the Councillors,https://commons.wikimedia.org/w/index.php?titl...,Early Renaissance,Lluís Dalmau,unknown,1428.0,...,0.0,0.0,False,[],False,0,17.0,1440.0,1480.513438,6702.630990
4,Q607761,Q5432,unknown,1793.0,The Death of the Picador,https://commons.wikimedia.org/w/index.php?titl...,Neoclassicism and Romanticism,Francisco Goya,Spain,1746.0,...,0.0,0.0,False,[],False,0,47.0,1790.0,1870.466476,8299.534918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71165,Q130379774,Q112722072,Q163804,1913.0,"Still Life with Flowers, Onions and Stoneware Jug",https://commons.wikimedia.org/w/index.php?titl...,Modern Art,Unknown Artist,Unknown Country,,...,0.0,1.0,True,[],False,1,43.0,1910.0,3188.170919,23782.400338
71166,Q130408122,Q130408058,Q3330195,1900.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,Modern Art,Unknown Artist,Unknown Country,,...,0.0,0.0,True,[],False,1,43.0,1900.0,3033.082168,23305.662362
71167,Q130431179,Q130408058,Q3330195,1900.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,Modern Art,Unknown Artist,Unknown Country,,...,1.0,0.0,True,[],False,1,43.0,1900.0,3033.082168,23305.662362
71168,Q129264357,Q19569575,Q965780,1700.0,unknown,https://commons.wikimedia.org/w/index.php?titl...,Rococo,Unknown Artist,Unknown Country,,...,0.0,0.0,True,[],False,1,43.0,1700.0,1871.222136,10715.437500


In [50]:

scaler = MinMaxScaler()

# Normalize 'gdppc' and 'pop' columns
merged_df[['gdppc_normalized', 'pop_normalized']] = scaler.fit_transform(merged_df[['gdppc', 'pop']])

display(merged_df[['gdppc', 'gdppc_normalized', 'pop', 'pop_normalized']])

Unnamed: 0,gdppc,gdppc_normalized,pop,pop_normalized
0,1537.388656,0.029856,10227.003496,0.140646
1,1633.617645,0.035176,8324.964223,0.097084
2,1633.617645,0.035176,8324.964223,0.097084
3,1480.513438,0.026711,6702.630990,0.059928
4,1870.466476,0.048271,8299.534918,0.096501
...,...,...,...,...
71165,3188.170919,0.121124,23782.400338,0.451102
71166,3033.082168,0.112549,23305.662362,0.440184
71167,3033.082168,0.112549,23305.662362,0.440184
71168,1871.222136,0.048312,10715.437500,0.151832


## FINAL DF

In [51]:
paintings_with_food = merged_df[merged_df['image_url'].isna() == False]
paintings_with_food = paintings_with_food[['title', 'painter', 'creation_date', 'author_gender', 'author_country', 'location_name', 'location_country', 'time_period', 'image_path', 'image_url', 'coordinates','food_detected','decade','gdppc','pop', 'gdppc_normalized', 'pop_normalized']]
paintings_with_food

Unnamed: 0,title,painter,creation_date,author_gender,author_country,location_name,location_country,time_period,image_path,image_url,coordinates,food_detected,decade,gdppc,pop,gdppc_normalized,pop_normalized
0,Ashbourne portrait,Unknown Artist,1612.0,male,Unknown Country,Folger Shakespeare Library,United States of America,Baroque,img/img_512/Ashbourne portrait ShakespeareHame...,https://commons.wikimedia.org/w/index.php?titl...,Point(-77.003172 38.889361),0,1610.0,1537.388656,10227.003496,0.029856,0.140646
1,Venus of Urbino,Titian,1538.0,male,Republic of Venice,Uffizi Gallery,Italy,High Renaissance and Mannerism,img/img_512/Tiziano's Venere di Urbino (from T...,https://commons.wikimedia.org/w/index.php?titl...,Point(11.255277777 43.768333333),0,1530.0,1633.617645,8324.964223,0.035176,0.097084
2,Portrait of a gentleman in his studio,Lorenzo Lotto,1530.0,male,Republic of Venice,Gallerie dell'Accademia,Italy,High Renaissance and Mannerism,img/img_512/Accademia - Ritratto di giovane ge...,https://commons.wikimedia.org/w/index.php?titl...,Point(12.328139 45.431078),0,1530.0,1633.617645,8324.964223,0.035176,0.097084
3,Virgin of the Councillors,Lluís Dalmau,1445.0,male,unknown,Museu Nacional d'Art de Catalunya,Spain,Early Renaissance,img/img_512/Dalmau Mare de Deu dels Consellers...,https://commons.wikimedia.org/w/index.php?titl...,Point(2.153305555 41.368333333),0,1440.0,1480.513438,6702.630990,0.026711,0.059928
4,The Death of the Picador,Francisco Goya,1793.0,male,Spain,Unknown Location,Unknown Country,Neoclassicism and Romanticism,img/img_512/La muerte del picador.jpg,https://commons.wikimedia.org/w/index.php?titl...,,0,1790.0,1870.466476,8299.534918,0.048271,0.096501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71165,"Still Life with Flowers, Onions and Stoneware Jug",Unknown Artist,1913.0,male,Unknown Country,Städel Museum,Germany,Modern Art,"img/img_512/Pauline Kowarzik, Stillleben mit b...",https://commons.wikimedia.org/w/index.php?titl...,Point(8.6737 50.1032),1,1910.0,3188.170919,23782.400338,0.121124,0.451102
71166,unknown,Unknown Artist,1900.0,male,Unknown Country,Musée des Beaux-Arts de Carcassonne,France,Modern Art,img/img_512/Joseph Maguet - Nature morte aux p...,https://commons.wikimedia.org/w/index.php?titl...,Point(2.35527 43.2125),1,1900.0,3033.082168,23305.662362,0.112549,0.440184
71167,unknown,Unknown Artist,1900.0,male,Unknown Country,Musée des Beaux-Arts de Carcassonne,France,Modern Art,img/img_512/Joseph Maguet - Nature morte.jpg,https://commons.wikimedia.org/w/index.php?titl...,Point(2.35527 43.2125),1,1900.0,3033.082168,23305.662362,0.112549,0.440184
71168,unknown,Unknown Artist,1700.0,male,Unknown Country,Musée Granet,France,Rococo,img/img_512/Musée Granet - Nature morte aux po...,https://commons.wikimedia.org/w/index.php?titl...,Point(5.4525 43.525555555),1,1700.0,1871.222136,10715.437500,0.048312,0.151832


## Export

In [52]:
#paintings_with_food.to_csv('data/paintings_with_food.csv', index=False)

### GDP analysis

In [53]:
# Group by decade and calculate the proportion of food_detected
food_by_decade = merged_df.groupby('decade')['food_detected'].agg(artwork_count='count', food_related_sum='sum').reset_index()
food_by_decade['proportion_food_detected'] = food_by_decade['food_related_sum'] / food_by_decade['artwork_count']

# Merge normalized GDP and population data
food_by_decade = food_by_decade.merge(
    merged_df[['decade', 'gdppc_normalized', 'pop_normalized']].drop_duplicates(),
    on='decade',
    how='left'
)

# Filter the DataFrame to include only records from 1250 to 2000
food_by_decade = food_by_decade[(food_by_decade['decade'] >= 1250) & (food_by_decade['decade'] <= 2000)]


In [54]:
data = food_by_decade

# Calculate Pearson correlation
correlation = data['proportion_food_detected'].corr(data['gdppc_normalized'])
correlation_pvalue = stats.pearsonr(data['proportion_food_detected'], data['gdppc_normalized'])

# Calculate summary statistics
summary_stats = {
    'Pearson Correlation': correlation,
    'P-value': correlation_pvalue[1],
    'Sample Size': len(data),
    'Mean Food Proportion': data['proportion_food_detected'].mean(),
    'Mean GDP per Capita': data['gdppc_normalized'].mean(),
}

print("\nCorrelation Analysis Results:")
for key, value in summary_stats.items():
    print(f"{key}: {value:.4f}")



Correlation Analysis Results:
Pearson Correlation: 0.2500
P-value: 0.0294
Sample Size: 76.0000
Mean Food Proportion: 0.0668
Mean GDP per Capita: 0.0740


## EDA

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Read the data
df = paintings_with_food.copy()


In [56]:

# 1. Time Period Distribution
time_period_counts = df['time_period'].value_counts()
fig1 = px.bar(time_period_counts, 
              title='Distribution of Artworks by Time Period',
              labels={'index': 'Time Period', 'value': 'Count'},
              color=time_period_counts.values,
              height=500)
fig1.update_layout(showlegend=False, xaxis_tickangle=-45)
fig1.show()


In [57]:

# 2. Gender and Food Analysis
gender_food = pd.crosstab(df['author_gender'], df['food_detected'])
fig2 = px.bar(gender_food, 
              title='Gender Distribution and Food Presence in Artworks',
              labels={'author_gender': 'Artist Gender', 'value': 'Count', 'food_detected': 'Food Detected'},
              barmode='group',
              height=400)
fig2.show()


In [58]:

# 3. Geographic Distribution
location_counts = df['location_country'].value_counts().head(10)
fig3 = px.pie(values=location_counts.values, 
              names=location_counts.index,
              title='Top 10 Countries Housing the Artworks',
              height=500)
fig3.show()


In [61]:

# 4. Timeline Analysis
df['creation_date'].fillna(0, inplace=True)  # Fill NaN values with 0 or any other placeholder
df['century'] = (df['creation_date'] // 100 + 1).astype(int)
century_counts = df['century'].value_counts().sort_index()
fig4 = px.line(x=century_counts.index, 
               y=century_counts.values,
               title='Distribution of Artworks Across Centuries',
               labels={'x': 'Century', 'y': 'Number of Artworks'},
               markers=True)
fig4.show()



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [62]:

# 5. Artist Countries
artist_countries = df['author_country'].value_counts().head(10)
fig5 = px.bar(artist_countries,
              title='Top 10 Artist Countries of Origin',
              labels={'index': 'Country', 'value': 'Count'},
              color=artist_countries.values)
fig5.update_layout(showlegend=False, xaxis_tickangle=-45)
fig5.show()


In [66]:

# 9. Food Detection Analysis by Period
food_by_period = pd.crosstab(df['time_period'], df['food_detected'])
fig9 = px.bar(food_by_period,
              title='Presence of Food in Artworks by Period',
              labels={'time_period': 'Art Period', 'value': 'Count', 'food_detected': 'Food Detected'},
              barmode='group')
fig9.update_layout(xaxis_tickangle=-45)
fig9.show()


In [68]:

# 10. Advanced Combined Analysis
fig10 = make_subplots(rows=2, cols=2,
                      subplot_titles=('Gender Distribution', 'Food Detection',
                                      'Time Period Distribution', 'Century Distribution'),
                      specs=[[{'type': 'domain'}, {'type': 'domain'}],
                             [{'type': 'xy'}, {'type': 'xy'}]])

# Gender Distribution
gender_counts = df['author_gender'].value_counts()
fig10.add_trace(go.Pie(labels=gender_counts.index, values=gender_counts.values),
                row=1, col=1)

# Food Detection
food_counts = df['food_detected'].value_counts()
fig10.add_trace(go.Pie(labels=['No Food', 'Food Present'], values=food_counts.values),
                row=1, col=2)

# Time Period
time_period_counts = df['time_period'].value_counts()
fig10.add_trace(go.Bar(x=time_period_counts.index, y=time_period_counts.values),
                row=2, col=1)

# Century Distribution
fig10.add_trace(go.Bar(x=century_counts.index, y=century_counts.values),
                row=2, col=2)

fig10.update_layout(height=800, title_text="Multi-dimensional Analysis of Artwork Dataset")
fig10.show()


In [69]:

# Print some statistical insights
print("\nStatistical Insights:")
print(f"Total number of artworks: {len(df)}")
print(f"Date range: {df['creation_date'].min()} to {df['creation_date'].max()}")
print(f"Number of unique artists: {df['painter'].nunique()}")
print(f"Number of unique museums: {df['location_name'].nunique()}")
print(f"Percentage of artworks with food: {(df['food_detected'].sum()/len(df)*100):.2f}%")


Statistical Insights:
Total number of artworks: 71170
Date range: -300.0 to 2024.0
Number of unique artists: 455
Number of unique museums: 3046
Percentage of artworks with food: 5.99%


### Food based

In [72]:

# 2. Food by Time Period (Normalized)
period_food = pd.crosstab(df['time_period'], df['food_detected'], normalize='index') * 100
fig2 = px.bar(period_food,
              title='Percentage of Artworks with Food by Period',
              labels={'time_period': 'Art Period', 'value': 'Percentage', 'food_detected': 'Food Present'},
              height=500)
fig2.update_layout(xaxis_tickangle=-45)
fig2.show()


In [73]:

# 3. Food by Artist's Country
country_food = pd.crosstab(df['author_country'], df['food_detected'])
country_food_pct = (country_food[1] / (country_food[0] + country_food[1]) * 100).sort_values(ascending=False)
fig3 = px.bar(country_food_pct.head(10),
              title='Top 10 Countries by Percentage of Food in Artworks',
              labels={'index': 'Artist Country', 'value': 'Percentage with Food'})
fig3.update_layout(xaxis_tickangle=-45)
fig3.show()


In [78]:

# 8. Geographic Distribution of Food in Art
location_food = pd.crosstab(df['location_country'], df['food_detected'])
location_food_pct = (location_food[1] / (location_food[0] + location_food[1]) * 100).sort_values(ascending=False)
fig8 = px.pie(values=location_food_pct.head(5),
              names=location_food_pct.head(5).index,
              title='Top 5 Countries Housing Artworks with Food')
fig8.show()


In [81]:

# 10. Multi-dimensional Food Analysis
fig10 = make_subplots(rows=2, cols=2,
                      subplot_titles=('Food Presence Over Time',
                                    'Food by Period',
                                    'Economic Context',
                                    'Geographic Distribution'),
                      specs=[[{'type': 'xy'}, {'type': 'xy'}],
                             [{'type': 'xy'}, {'type': 'domain'}]])

# Food Over Time
food_time = df.groupby('creation_date')['food_detected'].mean().rolling(window=50).mean()
fig10.add_trace(go.Scatter(x=food_time.index, y=food_time.values,
                          mode='lines', name='Food Presence'),
                row=1, col=1)

# Food by Period
period_counts = period_food[1].sort_values(ascending=False)
fig10.add_trace(go.Bar(x=period_counts.index, y=period_counts.values,
                       name='Period Distribution'),
                row=1, col=2)

# Economic Context
fig10.add_trace(go.Box(x=df['food_detected'].map({0: 'No Food', 1: 'Food Present'}),
                       y=df['gdppc_normalized'],
                       name='Economic Context'),
                row=2, col=1)

# Geographic Distribution
fig10.add_trace(go.Pie(labels=location_food_pct.head(5).index,
                       values=location_food_pct.head(5).values,
                       name='Geographic Distribution'),
                row=2, col=2)

fig10.update_layout(height=800, title_text="Comprehensive Analysis of Food in Artworks")
fig10.show()


In [82]:

# Print detailed statistics about food in artworks
print("\nFood in Artwork Statistics:")
print(f"Total artworks with food: {df['food_detected'].sum()}")
print(f"Percentage of artworks with food: {(df['food_detected'].sum()/len(df)*100):.2f}%")
print("\nFood presence by time period:")
print(period_food[1].sort_values(ascending=False))
print("\nFood presence by century:")
print(century_food[1].sort_values(ascending=False))


Food in Artwork Statistics:
Total artworks with food: 4265
Percentage of artworks with food: 5.99%

Food presence by time period:
time_period
Contemporary Art                       14.102564
Modern Art                             11.504043
Post-War and Abstract Expressionism     9.574468
Baroque                                 7.701838
Medieval                                6.405229
Rococo                                  5.725912
Realism and Impressionism               4.574930
Contemporary and Digital Art            4.435798
Early Renaissance                       4.225924
High Renaissance and Mannerism          4.172173
Antiquity                               3.448276
Neoclassicism and Romanticism           3.154006
Name: 1, dtype: float64

Food presence by century:
century
 9     33.333333
 21    21.176471
 13    12.418301
 20    11.506298
 17     7.701838
 12     5.263158
 18     5.037675
 14     4.914934
 15     4.225924
 16     4.172173
 19     4.098521
 1      3.662507
-2    

In [84]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Calculate absolute counts
gender_food_counts = pd.crosstab(df['author_gender'], df['food_detected'])

# Calculate percentages
gender_food_pct = pd.crosstab(df['author_gender'], df['food_detected'], normalize='index') * 100

# Add bars for absolute counts
fig.add_trace(
    go.Bar(
        name='No Food',
        x=gender_food_counts.index,
        y=gender_food_counts[0],
        offsetgroup=0,
        marker_color='lightgray'
    ),
    secondary_y=False
)

fig.add_trace(
    go.Bar(
        name='Contains Food',
        x=gender_food_counts.index,
        y=gender_food_counts[1],
        offsetgroup=0,
        marker_color='darkgreen'
    ),
    secondary_y=False
)

# Add line for percentage
fig.add_trace(
    go.Scatter(
        name='% with Food',
        x=gender_food_pct.index,
        y=gender_food_pct[1],
        mode='markers+lines+text',
        text=gender_food_pct[1].round(1).astype(str) + '%',
        textposition='top center',
        line=dict(color='red', width=2),
        marker=dict(size=10)
    ),
    secondary_y=True
)

# Update layout
fig.update_layout(
    title={
        'text': 'Food Representation in Paintings by Artist Gender',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    barmode='group',
    height=600,
    width=800,
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Set y-axes titles
fig.update_yaxes(title_text="Number of Paintings", secondary_y=False)
fig.update_yaxes(title_text="Percentage of Paintings with Food", secondary_y=True)

# Update x-axis
fig.update_xaxes(title_text="Artist Gender")

fig.show()

# Print detailed statistics
print("\nDetailed Statistics:")
print("\nAbsolute Counts:")
print(gender_food_counts)
print("\nPercentages:")
print(gender_food_pct[1].round(2).astype(str) + '%')
print("\nTotal number of paintings by gender:")
print(df['author_gender'].value_counts())


Detailed Statistics:

Absolute Counts:
food_detected      0     1
author_gender             
male           66730  4264
female           175     1

Percentages:
author_gender
male      6.01%
female    0.57%
Name: 1, dtype: object

Total number of paintings by gender:
author_gender
male      70994
female      176
Name: count, dtype: int64
