# Food in Art

In [827]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import yaml


In [828]:
#TODO: one hot encode food types
#TODO: calculate proportions

In [829]:


def load_config(config_path: str) -> dict:
    """
    Load configuration from YAML file.
    
    Args:
        config_path: Path to YAML configuration file
    
    Returns:
        Dictionary containing configuration settings
    """
    with open(config_path, 'r') as file:
        return yaml.safe_load(file)


In [830]:

def clean_datetime(series: pd.Series, value_type) -> pd.Series:
    """Convert series to datetime format."""
    series = pd.to_datetime(series, errors='coerce')
    if value_type == 'year':
        return pd.to_numeric(series.dt.year, downcast='integer', errors='coerce')
    return series

def clean_categorical(series: pd.Series, categories = None) -> pd.Series:
    """Convert series to categorical format with optional categories."""
    if categories:
        return pd.Categorical(series, categories=categories, ordered=False )
    return series.astype('category')


def extract_wikidata_id(series: pd.Series) -> pd.Series:
    """Extract the Wikidata ID from a series of URLs."""
    return series.str.extract(r'(Q\d+)', expand=False)



In [831]:

def process_column(series: pd.Series, dtype: str, value_type: str = None, categories = None) -> pd.Series:
    """
    Process a single column according to its configuration.
    
    Args:
        series: Column data to process
        dtype: Target data type
        categories: Optional list of categories for categorical data
    
    Returns:
        Processed column data
    """
    if value_type == 'wikidata_url':
        return extract_wikidata_id(series)
    
    if dtype == 'datetime64[ns]':
        return clean_datetime(series, value_type)
    elif dtype == 'category':
        return clean_categorical(series, categories)
    else:
        return series.astype(dtype)


In [832]:

def load_and_process_dataset(
    source_path: str,
    columns_config: dict
) -> pd.DataFrame:
    """
    Load and process a single dataset according to configuration.
    
    Args:
        source_path: Path to source CSV file
        columns_config: Configuration for columns
        dataset_name: Name of the dataset for specific processing
    
    Returns:
        Processed DataFrame
    """
    # Load data
    df = pd.read_csv(source_path)
    
    # Rename columns
    column_mappings = {
        config['original_name']: col_name
        for col_name, config in columns_config.items()
        if 'original_name' in config
    }
    df = df.rename(columns=column_mappings)
    
    # Select configured columns
    df = df[list(columns_config.keys())]
    
    # Process each column
    for column, config in columns_config.items():
        df[column] = process_column(
            df[column],
            config['dtype'],
            config.get('value_type'),
            config.get('categories')
        )
    
    # Set index if specified
    for column, config in columns_config.items():
        if config.get('is_index', False):
            df = df.drop_duplicates(subset=column, keep='first')
            #df = df.set_index(column)
    
    return df

def load_all_datasets(config: dict) -> dict:
    """
    Load and process all datasets defined in configuration.
    
    Args:
        config: Configuration dict
    
    Returns:
        Dictionary of processed DataFrames
    """
    
    return {
        dataset_name: load_and_process_dataset(
            dataset_config['source'],
            dataset_config['columns']
        )
        for dataset_name, dataset_config in config.items()
    }


In [833]:
# Load configuration from YAML file
config = load_config('config.yaml')


### Loading

In [834]:
all_data = load_all_datasets(config)

#### Load correspondance table

In [835]:
correspondance_data = all_data['ids']
correspondance_data.drop_duplicates(inplace=True)
correspondance_data

Unnamed: 0,painting_id,author_id,location_id
0,Q724861,,Q728116
1,Q727875,Q47551,Q51252
2,Q467376,Q41264,Q682827
3,Q469722,,Q156722
4,Q470541,Q159606,Q195436
...,...,...,...
455436,Q130739400,Q209050,Q314082
455437,Q130739506,Q970511,Q337555
455438,Q130739574,Q33477,Q124695064
455439,Q130741175,Q3069236,Q2338135


#### Load paintings data

In [836]:
paintings_data = all_data['paintings']
paintings_data

Unnamed: 0,painting_id,title,creation_date,filename,food_words,contains_food
0,Q607761,The Death of the Picador,1793.0,La_muerte_del_picador.jpg,[],False
1,Q609572,Manaò tupapaú,1892.0,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...,[],False
2,Q607598,Virgin of the Councillors,,Dalmau_Mare_de_Deu_dels_Consellers.jpg,[],False
3,Q734082,Regatta at Sainte-Adresse,1867.0,"Claude_Monet,_1867,_Regatta_at_Sainte-Adresse,...",[],False
4,Q472037,By the Seashore,1883.0,Pierre-Auguste_Renoir_-_Femme_assise_au_bord_d...,[],False
...,...,...,...,...,...,...
121662,Q98966261,Musical Entertainment,1899.0,Jakob_Emanuel_Gaisser_-_Musical_Entertainment.jpg,[],False
121663,Q98977855,"Césarine de Houdetot, Baronne de Barante, read...",1818.0,Portrait_of_Cesarine_de_Houdetot_by_Louise_Bou...,[],False
121664,Q99025930,The Broken Jug,1847.0,The_Broken_Jug_by_Jenny_Berger-Desoras.jpg,[],False
121665,Q98970362,Dr Philippe Pinel (1745-1826) and his family,1807.0,Philippe_Pinel_and_his_family_by_Julie_Foresti...,[],False


#### Load locations data

In [837]:
locations_data = all_data['locations']
locations_data

Unnamed: 0,location_id,location_name,location_country,coordinates
0,Q728116,Folger Shakespeare Library,United States of America,Point(-77.003172 38.889361)
1,Q51252,Uffizi Gallery,Italy,Point(11.255277777 43.768333333)
2,Q682827,The Frick Collection,United States of America,Point(-73.967222222 40.771111111)
3,Q156722,Altes Museum,Germany,Point(13.398888888 52.519444444)
4,Q195436,Tate Britain,United Kingdom,Point(-0.127789 51.491062)
...,...,...,...,...
455393,Q21072596,Rocca Meli Lupi,Italy,Point(10.122271 44.927151)
455394,Q3622570,National Gallery for Foreign Art,Bulgaria,Point(23.334444 42.696111)
455405,Q156752,Sumy,Ukraine,Point(34.799166666 50.906666666)
455406,Q12158693,"Local Museum, Sumy",Ukraine,Point(34.79851 50.90648)


#### Load authors data

In [838]:
authors_data = all_data['authors']
authors_data

Unnamed: 0,author_id,painter,author_country,date_of_birth,gender
0,Q5432,Francisco Goya,Spain,1746.0,male
1,Q37693,Paul Gauguin,France,1848.0,male
2,Q723863,Lluís Dalmau,,,male
3,Q296,Claude Monet,France,1840.0,male
4,Q39931,Pierre-Auguste Renoir,France,1841.0,male
...,...,...,...,...,...
28050,Q7361580,Roman Artymowski,Poland,1919.0,male
28051,Q12296556,Todor Panayotov,,1927.0,male
28052,Q14657728,Amarnath Sehgal,Dominion of India,1922.0,male
28053,Q19595721,Manu Parekh,Dominion of India,1939.0,male


### Merging

In [839]:
merged_df = correspondance_data.merge(paintings_data, on='painting_id', how='inner')
merged_df = merged_df.merge(authors_data, on='author_id', how='left')
merged_df = merged_df.merge(locations_data, on='location_id', how='left')
merged_df

Unnamed: 0,painting_id,author_id,location_id,title,creation_date,filename,food_words,contains_food,painter,author_country,date_of_birth,gender,location_name,location_country,coordinates
0,Q727875,Q47551,Q51252,Venus of Urbino,,Tiziano's_Venere_di_Urbino_(from_The_History_B...,['dining'],True,Titian,Republic of Venice,,male,Uffizi Gallery,Italy,Point(11.255277777 43.768333333)
1,Q467376,Q41264,Q682827,Officer and Laughing Girl,,Johannes_Vermeer_-_De_Soldaat_en_het_Lachende_...,['wine'],True,Johannes Vermeer,Dutch Republic,,male,The Frick Collection,United States of America,Point(-73.967222222 40.771111111)
2,Q470541,Q159606,Q195436,The Vale of Rest,1858.0,Millais_-_Das_Tal_der_Stille.jpg,[],False,John Everett Millais,United Kingdom of Great Britain and Ireland,1829.0,male,Tate Britain,United Kingdom,Point(-0.127789 51.491062)
3,Q877191,Q8459,Q95569,The Three Philosophers,,Giorgione_-_Three_Philosophers_-_Google_Art_Pr...,[],False,Giorgione,Republic of Venice,,male,Kunsthistorisches Museum,Austria,Point(16.3616 48.203880555)
4,Q878981,Q154338,Q154568,The Mocking of Christ,,Mathis_Gothart_Grünewald_062.jpg,[],False,Matthias Grünewald,Germany,,male,Alte Pinakothek,Germany,Point(11.569985 48.148291)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59275,Q100347957,Q2095810,Q213322,"Buddhist stupa at Sarnath, Benaras",1864.0,"William_Simpson_-_Buddhist_stupa_at_Sarnath,_B...",[],False,William Simpson,United Kingdom of Great Britain and Ireland,1823.0,male,Victoria and Albert Museum,United Kingdom,Point(-0.17162 51.49685)
59276,Q100354671,Q983663,Q768717,Jens Johan Hjort,1870.0,Knud_Bergslien_-_Portrett_av_Jens_Johan_Hjort_...,[],False,Knud Bergslien,Norway,1827.0,male,private collection,,
59277,Q100311223,Q114853863,Q1356138,Ram enthroned,1840.0,"Baijnath._Rama_Enthroned._Deogarh,_Mewar,_1840...",[],False,Baijnath,Deogarh,1800.0,male,National Museum,India,Point(77.219262 28.611811)
59278,Q100310216,Q116616297,Q1356138,Awadh Sultan Asaf-ud-daulah in a garden,1775.0,"Shuja_ud_Daula,_Nawab_of_Awadh,_Receiving_Two_...",[],False,Nidha Mal,Oudh State,,male,National Museum,India,Point(77.219262 28.611811)


In [840]:
import numpy as np



# Calculate the age of the painter at the time of painting
merged_df['painter_age_at_painting'] = merged_df['creation_date'] - merged_df['date_of_birth']

# Display the updated DataFrame
display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])

# Calculate the average painter_age_at_painting for each painter
avg_painter_age = merged_df['painter_age_at_painting'].mean().astype(int)

merged_df['painter_age_at_painting'].fillna(avg_painter_age, inplace=True)
merged_df.dropna(subset=['creation_date', 'date_of_birth'], how='all', inplace=True)

merged_df['creation_date'].fillna(merged_df['date_of_birth'] + merged_df['painter_age_at_painting'], inplace=True)

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])


Unnamed: 0,painter,creation_date,date_of_birth,painter_age_at_painting
0,Titian,,,
1,Johannes Vermeer,,,
2,John Everett Millais,1858.0,1829.0,29.0
3,Giorgione,,,
4,Matthias Grünewald,,,
...,...,...,...,...
59275,William Simpson,1864.0,1823.0,41.0
59276,Knud Bergslien,1870.0,1827.0,43.0
59277,Baijnath,1840.0,1800.0,40.0
59278,Nidha Mal,1775.0,,



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Unnamed: 0,painter,creation_date,date_of_birth,painter_age_at_painting
2,John Everett Millais,1858.0,1829.0,29.0
8,Paul Gauguin,1892.0,1848.0,44.0
10,Édouard Manet,1862.0,1832.0,30.0
11,Salvador Dalí,1924.0,1904.0,20.0
12,Théodore Géricault,1819.0,1791.0,28.0
...,...,...,...,...
59275,William Simpson,1864.0,1823.0,41.0
59276,Knud Bergslien,1870.0,1827.0,43.0
59277,Baijnath,1840.0,1800.0,40.0
59278,Nidha Mal,1775.0,,41.0


In [841]:
merged_df = merged_df.dropna()


In [842]:
df = merged_df[merged_df['contains_food']]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051 entries, 12 to 59246
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   painting_id              1051 non-null   object  
 1   author_id                1051 non-null   object  
 2   location_id              1051 non-null   object  
 3   title                    1051 non-null   object  
 4   creation_date            1051 non-null   float64 
 5   filename                 1051 non-null   object  
 6   food_words               1051 non-null   object  
 7   contains_food            1051 non-null   bool    
 8   painter                  1051 non-null   object  
 9   author_country           1051 non-null   object  
 10  date_of_birth            1051 non-null   float64 
 11  gender                   1051 non-null   category
 12  location_name            1051 non-null   object  
 13  location_country         1051 non-null   category
 14  coordinates

In [843]:
merged_df

Unnamed: 0,painting_id,author_id,location_id,title,creation_date,filename,food_words,contains_food,painter,author_country,date_of_birth,gender,location_name,location_country,coordinates,painter_age_at_painting
2,Q470541,Q159606,Q195436,The Vale of Rest,1858.0,Millais_-_Das_Tal_der_Stille.jpg,[],False,John Everett Millais,United Kingdom of Great Britain and Ireland,1829.0,male,Tate Britain,United Kingdom,Point(-0.127789 51.491062),29.0
8,Q609572,Q37693,Q1970945,Manaò tupapaú,1892.0,Paul_Gauguin_-_Manaò_tupapaú_(Spirit_of_the_De...,[],False,Paul Gauguin,France,1848.0,male,Buffalo AKG Art Museum,United States of America,Point(-78.875618 42.93245),44.0
10,Q210277,Q40599,Q180788,Music in the Tuileries,1862.0,MANET_-_Música_en_las_Tullerías_(National_Gall...,[],False,Édouard Manet,France,1832.0,male,National Gallery,United Kingdom,Point(-0.128333333 51.508888888),30.0
11,Q211293,Q5577,Q460889,Portrait of Luis Buñuel,1924.0,Madrid_(49191190833).jpg,[],False,Salvador Dalí,Spain,1904.0,male,Museo Nacional Centro de Arte Reina Sofía,Spain,Point(-3.693996097 40.408571429),20.0
12,Q212616,Q184212,Q4032385,The Raft of the Medusa,1819.0,JEAN_LOUIS_THÉODORE_GÉRICAULT_-_La_Balsa_de_la...,['wine'],True,Théodore Géricault,France,1791.0,male,Room 700,France,Point(2.33494 48.8605),28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59273,Q100333064,Q2095810,Q213322,View of the Kaiser Bagh in Lucknow,1864.0,William_Simpson_-_View_of_the_Kaiser_Bagh_in_L...,[],False,William Simpson,United Kingdom of Great Britain and Ireland,1823.0,male,Victoria and Albert Museum,United Kingdom,Point(-0.17162 51.49685),41.0
59274,Q100334168,Q351746,Q1592523,Boat in the Albufera of Valencia,1895.0,Barca-en-Albufera-Sorolla.jpg,[],False,Joaquín Sorolla,Spain,1863.0,male,Sorolla Museum,Spain,Point(-3.692539 40.435404),32.0
59275,Q100347957,Q2095810,Q213322,"Buddhist stupa at Sarnath, Benaras",1864.0,"William_Simpson_-_Buddhist_stupa_at_Sarnath,_B...",[],False,William Simpson,United Kingdom of Great Britain and Ireland,1823.0,male,Victoria and Albert Museum,United Kingdom,Point(-0.17162 51.49685),41.0
59277,Q100311223,Q114853863,Q1356138,Ram enthroned,1840.0,"Baijnath._Rama_Enthroned._Deogarh,_Mewar,_1840...",[],False,Baijnath,Deogarh,1800.0,male,National Museum,India,Point(77.219262 28.611811),40.0


### Visualisations

In [844]:
# Count paintings per author country
paintings_per_country = df['author_country'].value_counts().reset_index()
paintings_per_country.columns = ['Author Country', 'Number of Paintings']

# Create bar chart
fig1 = px.bar(paintings_per_country, 
              x='Author Country', 
              y='Number of Paintings',
              title='Number of Paintings per Author Country',
              labels={'Author Country': 'Country', 'Number of Paintings': 'Paintings'})
fig1.show()

In [845]:
# Count authors by gender
gender_counts = df['gender'].value_counts().reset_index()
gender_counts.columns = ['Gender', 'Count']

# Create pie chart
fig2 = px.pie(gender_counts, 
             names='Gender', 
             values='Count',
             title='Gender Distribution of Authors',
             hole=0.3)
fig2.show()

In [846]:
# Drop rows with missing values in 'title' column
df = df.dropna(subset=['title', 'creation_date'])

# Sample 100 random rows from the DataFrame
df_subset = df.sample(n=100, random_state=42)  # Use `n` to specify the number of rows you want in the subset

# Alternatively, if you want a percentage-based sample (e.g., 10% of the data)
df_subset = df.sample(frac=0.1, random_state=42)  # `frac` is the fraction of rows to sample

# Now, you can create the plot with this subset
fig3 = px.scatter(df_subset, 
                  x='creation_date', 
                  y='title',
                  #title='Timeline of Paintings Creation Dates (Random Subset)',
                  #labels={'creation_date': 'Creation Date', 'title': 'Painting Title'},
                  #hover_data=['author_name']
                  )
fig3.update_yaxes(autorange="reversed")  # Optional: To display earliest at top
fig3.show()

In [847]:
# Extract latitude and longitude from coordinates
df[['Longitude', 'Latitude']] = df['coordinates'].str.extract(r'Point\(([-\d.]+) ([-\d.]+)\)', expand=True).astype(float)

# Create map
fig4 = px.scatter_geo(df,
                      lat='Latitude',
                      lon='Longitude',
                      hover_name='location_name',
                      hover_data={'Latitude': False, 'Longitude': False},
                      title='Geographical Distribution of Painting Locations',
                      projection='natural earth')
fig4.show()

In [848]:
df

Unnamed: 0,painting_id,author_id,location_id,title,creation_date,filename,food_words,contains_food,painter,author_country,date_of_birth,gender,location_name,location_country,coordinates,painter_age_at_painting,Longitude,Latitude
12,Q212616,Q184212,Q4032385,The Raft of the Medusa,1819.0,JEAN_LOUIS_THÉODORE_GÉRICAULT_-_La_Balsa_de_la...,['wine'],True,Théodore Géricault,France,1791.0,male,Room 700,France,Point(2.33494 48.8605),28.0,2.334940,48.860500
60,Q354396,Q34661,Q40,Portrait of Adele Bloch-Bauer I,1907.0,"Gustav_Klimt,_1907,_Adele_Bloch-Bauer_I,_Neue_...",['sugar'],True,Gustav Klimt,Cisleithania,1862.0,male,Austria,Austria,Point(14.0 48.0),45.0,14.000000,48.000000
80,Q428985,Q310715,Q23402,Un coin de table,1872.0,Henri_Fantin-Latour_-_By_the_Table_-_Google_Ar...,"['fruit', 'coffee', 'wine']",True,Henri Fantin-Latour,France,1836.0,male,Musée d'Orsay,France,Point(2.326527 48.859972),36.0,2.326527,48.859972
114,Q636001,Q326434,Q23402,The Romans of the Decadence,1847.0,Thomas_Couture_-_Romans_during_the_Decadence_-...,"['apple', 'banquet', 'grape']",True,Thomas Couture,France,1815.0,male,Musée d'Orsay,France,Point(2.326527 48.859972),32.0,2.326527,48.859972
122,Q644887,Q5432,Q160112,Judith and Holofernes,1819.0,Judith_y_Holofernes_(Goya).jpg,['serving'],True,Francisco Goya,Spain,1746.0,male,Museo del Prado,Spain,Point(-3.692222222 40.413888888),73.0,-3.692222,40.413889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59049,Q98113753,Q211763,Q403080,The Lady of Shalott,1868.0,"The_Lady_of_Shalott_by_William_Holman_Hunt,_c....",['apple'],True,William Holman Hunt,United Kingdom of Great Britain and Ireland,1827.0,male,Wadsworth Atheneum Museum of Art,United States of America,Point(-72.673086 41.763711),41.0,-72.673086,41.763711
59160,Q99936270,Q326167,Q3330848,The Tea Cup,1918.0,"Alfred_Roll,_La_Tasse_de_thé.jpg","['tea', 'breakfast']",True,Alfred Philippe Roll,France,1846.0,male,Municipal museum (Saverne),France,Point(7.363757 48.741999),72.0,7.363757,48.741999
59192,Q97730676,Q37693,Q845468,Still Life with a Sketch after Delacroix,1887.0,"Nature_morte_de_P._Gauguin_(MAMC,_Strasbourg)_...",['fruit'],True,Paul Gauguin,France,1848.0,male,Strasbourg Museum of Modern and Contemporary Art,France,Point(7.736111111 48.579444444),39.0,7.736111,48.579444
59236,Q99732949,Q351746,Q5043601,Selling Melons,1890.0,Joaquín_Sorolla_y_Bastida_-_Vendiendo_melones.jpg,['melon'],True,Joaquín Sorolla,Spain,1863.0,male,Carmen Thyssen Museum,Spain,Point(-4.422778 36.721389),27.0,-4.422778,36.721389


In [849]:


# Count paintings per author
paintings_per_author = df['painter'].value_counts().head(10).reset_index()
paintings_per_author.columns = ['Painter', 'Number of Paintings']

# Create horizontal bar chart
fig5 = px.bar(paintings_per_author.sort_values('Number of Paintings'),
             x='Number of Paintings',
             y='Painter',
             orientation='h',
             #title='Top 10 Authors by Number of Paintings',
             #labels={'Number of Paintings': 'Paintings', 'Painter': 'Author'}
             )
fig5.show()

In [850]:
# Count paintings with and without food
food_counts = merged_df['contains_food'].value_counts().reset_index()
food_counts.columns = ['Contains Food', 'Count']
food_counts['Contains Food'] = food_counts['Contains Food'].map({True: 'Contains Food', False: 'No Food'})

# Create donut chart
fig6 = px.pie(food_counts, 
             names='Contains Food', 
             values='Count',
             title='Paintings Containing Food vs. Not Containing Food',
             hole=0.4)
fig6.show()

In [851]:
from collections import Counter
import itertools

# Split the strings into individual words
all_food_words = list(itertools.chain.from_iterable(df['food_words'].dropna().apply(lambda x: x.strip("[]").replace("'", "").split(", "))))
food_counter = Counter(all_food_words).most_common(10)

# Create dataframe for plotting
food_df = pd.DataFrame(food_counter, columns=['Food Item', 'Frequency'])

# Create bar chart
fig7 = px.bar(food_df, 
             x='Food Item', 
             y='Frequency',
             title='Frequency of Different Food Items in Paintings',
             labels={'Food Item': 'Food Item', 'Frequency': 'Count'})
fig7.show()

In [852]:
# Drop rows with missing data
scatter_df = df.dropna(subset=['date_of_birth', 'creation_date'])

# Create scatter plot
fig8 = px.scatter(scatter_df, 
                  x='date_of_birth', 
                  y='creation_date',
                  trendline='ols',
                  title="Authors' Birth Years vs. Painting Creation Years",
                  labels={'date_of_birth': 'Author Birth Year', 'creation_date': 'Creation Year'},
                  hover_data=['painter', 'title'])
fig8.show()

In [853]:
# Select a very small subset for testing
# Drop rows with missing data
scatter_df = df.dropna(subset=['date_of_birth', 'creation_date'])
df_subset = scatter_df[['creation_date', 'title']].head(100)
fig3 = px.scatter(df_subset, 
                  x='creation_date', 
                  y='title',
                  title='Test Plot of Paintings Creation Dates')
fig3.show()

In [854]:
# Count paintings per location country
paintings_per_location_country = df['location_country'].value_counts().reset_index()
paintings_per_location_country.columns = ['Location Country', 'Number of Paintings']

# Create treemap
fig9 = px.treemap(paintings_per_location_country, 
                 path=['Location Country'], 
                 values='Number of Paintings',
                 title='Distribution of Paintings by Location Country')
fig9.show()





In [855]:
import hashlib

def get_wikimedia_thumbnail(filename, width=200):
    # Calculate the MD5 hash of the filename
    md5_hash = hashlib.md5(filename.encode('utf-8')).hexdigest()
    
    # Construct the thumbnail URL
    url = f"https://upload.wikimedia.org/wikipedia/commons/thumb/{md5_hash[0]}/{md5_hash[:2]}/{filename}/{width}px-{filename}"
    
    return url



# Ensure there are no missing values in hover_data columns
gallery = df[['creation_date', 'title', 'painter', 'location_name', 'filename']].dropna().head(10)
gallery['url'] = gallery['filename'].apply(get_wikimedia_thumbnail)


fig10 = px.scatter(gallery, 
                   x='creation_date', 
                   y='title',
                   hover_data=['painter', 'location_name', 'url'],
                   title='Interactive Gallery of Paintings',
                   labels={'creation_date': 'Creation Date', 'title': 'Painting Title'},
                   template='plotly_white')

# Add images as hover
fig10.update_traces(marker=dict(size=12,
                                 color='LightSkyBlue'),
                   selector=dict(mode='markers'))

fig10.update_layout(
    hovermode='closest',
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    )
)

# Update hovertemplate to include image
fig10.update_traces(
    hovertemplate="<b>%{y}</b><br><br>" +
                  "Author: %{customdata[0]}<br>" +
                  "Location: %{customdata[1]}<br>" +
                  "<br><img src='%{customdata[2]}' width='150' height='150'><br>" +
                  "<extra></extra>"
)

fig10.show()

In [856]:


# Group by year and calculate proportions
food_over_time = merged_df.groupby('creation_date')['contains_food'].agg(['count', 'sum']).reset_index()
food_over_time['Proportion with Food'] = food_over_time['sum'] / food_over_time['count']

# Create line chart
fig2 = px.line(food_over_time, 
               x='creation_date', 
               y='Proportion with Food',
               title='Proportion of Paintings Containing Food Over Time',
               labels={'creation_date': 'Year', 'Proportion with Food': 'Proportion'},
               markers=True)

fig2.show()

In [857]:


# Expand food_words into separate rows
df_food = df.explode('food_words').dropna(subset=['food_words'])

# Count food items per author country
food_country = df_food.groupby(['author_country', 'food_words']).size().reset_index(name='Count')

# Create stacked bar chart
fig3 = px.bar(food_country, 
              x='author_country', 
              y='Count', 
              color='food_words',
              title='Distribution of Food Items by Author Country',
              labels={'author_country': 'Author Country', 'food_words': 'Food Item', 'Count': 'Count'},
              barmode='stack')

fig3.show()

In [858]:
# Filter paintings that contain food
df_food = df[df['contains_food'] == True]

# Create heatmap
fig4 = px.density_mapbox(df_food, 
                         lat='Latitude', 
                         lon='Longitude',
                         radius=10,
                         center=dict(lat=20, lon=0),
                         zoom=1,
                         mapbox_style='stamen-terrain',
                         title='Geographical Heatmap of Food-Containing Paintings')

fig4.show()


In [859]:
# Calculate number of food items per painting
merged_df['num_food_items'] = merged_df['food_words'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Group by author and calculate average
avg_food_per_author = merged_df.groupby('painter')['num_food_items'].mean().reset_index()
avg_food_per_author = avg_food_per_author.sort_values('num_food_items', ascending=False).head(10)

# Create bar chart
fig6 = px.bar(avg_food_per_author, 
              x='painter', 
              y='num_food_items',
              title='Average Number of Food Items per Painting by Author',
              labels={'painter': 'Author', 'num_food_items': 'Average Number of Food Items'},
              color='num_food_items',
              color_continuous_scale='Oranges')

fig6.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [860]:
# Prepare data
df_food = df.explode('food_words').dropna(subset=['food_words'])
food_location = df_food.groupby(['food_words', 'location_country']).size().reset_index(name='Count')

# Select top 10 food items
top_food = food_location['food_words'].value_counts().head(10).index
food_location_top = food_location[food_location['food_words'].isin(top_food)]

# Create bubble chart
fig7 = px.scatter(food_location_top, 
                  x='location_country', 
                  y='food_words',
                  size='Count',
                  color='food_words',
                  title='Correlation Between Specific Food Items and Painting Locations',
                  labels={'location_country': 'Location Country', 'food_words': 'Food Item', 'Count': 'Number of Paintings'},
                  hover_name='food_words',
                  size_max=40)

fig7.show()





In [861]:
# Prepare pivot table
food_country_pivot = df_food.groupby(['author_country', 'food_words']).size().reset_index(name='Count')
food_country_pivot = food_country_pivot.pivot(index='food_words', columns='author_country', values='Count').fillna(0)

# Create heatmap
fig8 = px.imshow(food_country_pivot,
                labels=dict(x="Author Country", y="Food Item", color="Count"),
                title="Heatmap of Food Item Frequencies Across Author Countries",
                aspect="auto",
                color_continuous_scale='Viridis')

fig8.show()

In [862]:
# Prepare data
df['num_food_items'] = df['food_words'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Create box plot
fig9 = px.box(df, 
             x='gender', 
             y='num_food_items',
             title='Number of Food Items in Paintings by Author Gender',
             labels={'gender': 'Author Gender', 'num_food_items': 'Number of Food Items'},
             color='gender')

fig9.show()

In [863]:
# Prepare data
df_food = df.explode('food_words').dropna(subset=['food_words']).head(30)

# Define unique labels
authors = df_food['painter'].unique().tolist()
foods = df_food['food_words'].unique().tolist()
locations = df_food['location_name'].unique().tolist()

labels = authors + foods + locations

# Create source and target indices
df_food['source'] = df_food['painter'].apply(lambda x: labels.index(x))
df_food['target_food'] = df_food['food_words'].apply(lambda x: labels.index(x) + len(authors))
df_food['target_location'] = df_food['location_name'].apply(lambda x: labels.index(x) + len(authors) + len(foods))

# Create links for authors to food
links_auth_food = df_food.groupby(['source', 'target_food']).size().reset_index(name='value')

# Create links for food to locations
links_food_loc = df_food.groupby(['target_food', 'target_location']).size().reset_index(name='value')

# Combine links
links = pd.concat([
    links_auth_food.rename(columns={'target_food': 'target'}),
    links_food_loc.rename(columns={'target_location': 'target'})
], ignore_index=True)

# Create Sankey diagram
fig10 = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="blue"
    ),
    link=dict(
        source=links['source'],
        target=links['target'],
        value=links['value']
    ))])

fig10.update_layout(title_text="Sankey Diagram: Authors → Food Items → Locations", font_size=10)
fig10.show()