# Food in Art

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import yaml
import numpy as np
from urllib.parse import urlparse, parse_qs, quote



- what is food
- 

In [None]:
# Time-consuming tasks:
# - get time periods
# - deal with missing data
# - download more images
# - retrain CLIP

In [None]:
# TODO: get time periods + decades
# TODO: calculate proportions
# TODO: retrain CLIP to recognise the new data

## Functions

In [None]:


def load_config(config_path: str) -> dict:
    """
    Load configuration from a YAML file.

    Args:
        config_path: Path to YAML configuration file

    Returns:
        The object parsed by ``yaml.safe_load``; the loading helpers in this
        notebook expect a dictionary of dataset settings.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # default, which differs between operating systems.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


In [None]:

def clean_datetime(series: pd.Series, value_type) -> pd.Series:
    """
    Parse a column as datetimes, optionally reducing it to the calendar year.

    Args:
        series: Raw column values
        value_type: When equal to 'year', only the year component is returned

    Returns:
        Datetime series (unparseable entries are coerced to NaT), or a
        numeric series of years when value_type is 'year'
    """
    parsed = pd.to_datetime(series, errors='coerce')
    if value_type != 'year':
        return parsed
    years = parsed.dt.year
    return pd.to_numeric(years, errors='coerce', downcast='integer')

def clean_categorical(series: pd.Series, categories = None) -> pd.Series:
    """
    Convert a column to an unordered categorical Series.

    Args:
        series: Column data to convert
        categories: Optional list of allowed categories; values outside the
            list become missing. When omitted or empty, the categories are
            inferred from the data.

    Returns:
        Categorical Series with the same index as the input
    """
    if categories is not None and len(categories) > 0:
        # astype (instead of building a bare pd.Categorical) keeps the Series
        # type and its index, so both branches return what the annotation says.
        return series.astype(pd.CategoricalDtype(categories=categories, ordered=False))
    return series.astype('category')


def extract_wikidata_id(series: pd.Series) -> pd.Series:
    """Return the first ``Q<digits>`` token found in each string (NaN when none is present)."""
    qid_pattern = r'(Q\d+)'
    string_methods = series.str
    return string_methods.extract(qid_pattern, expand=False)



In [None]:

def process_column(series: pd.Series, dtype: str, value_type: str = None, categories = None) -> pd.Series:
    """
    Convert one column according to its configuration entry.

    Args:
        series: Column data to process
        dtype: Target data type
        value_type: Optional refinement; 'wikidata_url' extracts the Wikidata
            ID, and the value is also forwarded to the datetime cleaner
        categories: Optional list of categories for categorical data

    Returns:
        Processed column data
    """
    # Wikidata URLs are reduced to their ID regardless of the configured dtype
    if value_type == 'wikidata_url':
        return extract_wikidata_id(series)

    if dtype == 'datetime64[ns]':
        return clean_datetime(series, value_type)

    if dtype == 'category':
        return clean_categorical(series, categories)

    # Anything else is a plain pandas cast
    return series.astype(dtype)


In [None]:

def load_and_process_dataset(
    source_path: str,
    columns_config: dict
) -> pd.DataFrame:
    """
    Load and process a single dataset according to configuration.

    Args:
        source_path: Path to source CSV file
        columns_config: Mapping of output column name to its settings. The
            keys read here are 'dtype' (required) and the optional
            'original_name', 'value_type', 'categories' and 'is_index'.

    Returns:
        Processed DataFrame containing only the configured columns

    Raises:
        KeyError: If a configured column is absent after renaming
    """
    # Load data
    df = pd.read_csv(source_path)

    # Rename source columns to their configured names
    column_mappings = {
        config['original_name']: col_name
        for col_name, config in columns_config.items()
        if 'original_name' in config
    }
    df = df.rename(columns=column_mappings)

    # Fail with a message that names the file and the columns at fault
    missing = [name for name in columns_config if name not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {source_path}: {missing}")

    # Select configured columns; .copy() detaches the selection so the
    # per-column assignments below do not write into a slice of the raw frame.
    df = df[list(columns_config)].copy()

    # Process each column
    for column, config in columns_config.items():
        df[column] = process_column(
            df[column],
            config['dtype'],
            config.get('value_type'),
            config.get('categories')
        )

    # Columns flagged 'is_index' are only de-duplicated (first occurrence
    # kept); they stay regular columns and are not set as the index.
    for column, config in columns_config.items():
        if config.get('is_index', False):
            df = df.drop_duplicates(subset=column, keep='first')

    return df

def load_all_datasets(config: dict) -> dict:
    """
    Build one processed DataFrame per dataset entry in the configuration.

    Args:
        config: Mapping of dataset name to a dict holding its 'source' path
            and its 'columns' settings

    Returns:
        Dictionary of processed DataFrames keyed by dataset name
    """
    datasets = {}
    for dataset_name, dataset_config in config.items():
        datasets[dataset_name] = load_and_process_dataset(
            dataset_config['source'],
            dataset_config['columns'],
        )
    return datasets


In [None]:
def get_512px_thumbnail(url):
    """
    Transform a Wikimedia Commons Special:FilePath URL into its 512px thumbnail version.

    Args:
        url (str): URL in format: http://commons.wikimedia.org/wiki/Special:FilePath/Filename.jpg

    Returns:
        str: The 512px thumbnail URL, or None if the URL is null
        (None, NaN, any other non-string value, or an empty string)

    """
    # NaN is truthy, so a plain `not url` check would let it through and
    # crash on .split(); reject everything that is not a non-empty string.
    if not isinstance(url, str) or not url:
        return None

    # Extract the filename from the URL (everything after the last slash)
    filename = url.split('/')[-1]

    # Construct the 512px thumbnail URL
    thumbnail_url = f"https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/{filename}&width=512"

    return thumbnail_url

## Loading

In [None]:
# Load the dataset definitions from the YAML configuration file
config = load_config('config.yaml')


In [None]:
# Load and process every dataset declared in the configuration, keyed by dataset name
all_data = load_all_datasets(config)

#### Load correspondence table

In [None]:
# 'ids' dataset: the table the merge below starts from
correspondance_data = all_data['ids']
# Drop fully identical rows in place (the same object is still referenced by all_data)
correspondance_data.drop_duplicates(inplace=True)
correspondance_data

#### Load paintings data

In [None]:
paintings_data = all_data['paintings']
# Rewrite every non-null image URL to its 512px thumbnail form; null entries are left as they are
paintings_data['image_url'] = paintings_data['image_url'].apply(lambda x: get_512px_thumbnail(x) if pd.notna(x) else x)
paintings_data

#### Load locations data

In [None]:
# Locations dataset; joined to the paintings on 'location_id' in the merging cell
locations_data = all_data['locations']
locations_data

#### Load authors data

In [None]:
# Authors dataset; joined to the paintings on 'author_id' in the merging cell
authors_data = all_data['authors']
authors_data

#### Load ML food data

In [None]:
food_words = all_data['food_words']
# Flag a row when the sum of its integer-typed columns is positive
# NOTE(review): presumably every integer column is a per-word count — confirm against the config
food_words['food_word_detected'] = food_words.select_dtypes(include='int').sum(axis=1) > 0
food_words

In [None]:
food_found = all_data['food_found']
# Flag a row when len() of its 'predictions' value exceeds 3
# NOTE(review): presumably 'predictions' is a stringified list, so this means "longer than '[]'" — confirm;
# a missing (NaN) value here would raise TypeError because floats have no len()
food_found['food_image_detected'] = food_found['predictions'].apply(lambda x: len(x) > 3)
food_found

### Merging

In [None]:
# Inner join: keep only ids that have a matching paintings row
merged_df = correspondance_data.merge(paintings_data, on='painting_id', how='inner')
# Left joins: paintings without a matching author / location / detection row are kept, with NaN in the added columns
merged_df = merged_df.merge(authors_data, on='author_id', how='left')
merged_df = merged_df.merge(locations_data, on='location_id', how='left')
merged_df = merged_df.merge(food_words, on='painting_id', how='left')
merged_df = merged_df.merge(food_found, on='painting_id', how='left')
merged_df

### Cleaning

### Duplicates

In [None]:
# Keep a single row per painting; the first occurrence wins
merged_df = merged_df.drop_duplicates(subset='painting_id', keep='first')

### Merging

In [None]:
# Combine both detectors into a single 0/1 flag.
# The left merges leave NaN where a painting is absent from one detector's
# table; .eq(True) normalises each flag to a strict boolean first, so a
# missing value counts as "not detected" and cannot mask a hit from the other
# detector.
merged_df['food_detected'] = (
    merged_df['food_word_detected'].eq(True)
    | merged_df['food_image_detected'].eq(True)
).astype(int)
merged_df

### Pruning

In [None]:
# Keep only paintings that have an image path. .copy() detaches the result so
# the column assignments in the following cells act on an independent frame
# instead of a slice of the unfiltered one.
merged_df = merged_df[merged_df['image_path'].notna()].copy()

In [None]:
# Value distribution of the combined flag and of each individual detector
print(merged_df['food_detected'].value_counts())
print(merged_df['food_word_detected'].value_counts())
print(merged_df['food_image_detected'].value_counts())

### Enhancing

In [None]:

# Snapshot of the two date columns before they are reduced to years
display(merged_df[['creation_date','date_of_birth']])
display(merged_df[['creation_date','date_of_birth']].describe())


import re
from datetime import datetime

def extract_year(input_str):
    """
    Read the leading four-digit year from a date string.

    An optional sign is honoured, so an ISO-style value such as
    '-0500-01-01T00:00:00Z' yields -500 rather than a truncated number.

    Args:
        input_str: Date text starting with an optionally signed 4-digit year

    Returns:
        The year as an int, or NaN when the input is not a string, does not
        start with a 4-digit year, or lies after the current year
    """
    if not isinstance(input_str, str):
        return np.nan

    # Parse sign and digits separately: slicing the first four characters
    # would cut a signed year short ('-0500' -> '-050').
    match = re.match(r'([+-]?)(\d{4})', input_str)
    if match is None:
        return np.nan

    year = int(match.group(2))
    if match.group(1) == '-':
        year = -year

    # Years in the future are treated as invalid data
    if year > datetime.now().year:
        return np.nan
    return year

# Reduce both date columns to a numeric year (NaN where extract_year cannot read one)
merged_df['creation_date'] = merged_df['creation_date'].apply(extract_year)
merged_df['date_of_birth'] = merged_df['date_of_birth'].apply(extract_year)


# Same snapshot after the conversion, for comparison with the one above
display(merged_df[['creation_date','date_of_birth']])
display(merged_df[['creation_date','date_of_birth']].describe())

In [None]:
# Fill missing creation year when possible
# Calculate the age of the painter at the time of painting (NaN when either year is missing)
merged_df['painter_age_at_painting'] = merged_df['creation_date'] - merged_df['date_of_birth']

# Display the updated DataFrame
display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])

# Average age over ALL paintings: one global value, not one per painter
avg_painter_age = merged_df['painter_age_at_painting'].mean().astype(int)

# Assign the filled column back. Calling fillna(inplace=True) on a column
# selection is chained assignment: pandas deprecates it, and it does not
# update the frame once Copy-on-Write is enabled.
merged_df['painter_age_at_painting'] = merged_df['painter_age_at_painting'].fillna(avg_painter_age)

# Fill missing creation_date with date_of_birth + avg_painter_age
# (rows missing both years are kept; their creation_date stays NaN)
merged_df['creation_date'] = merged_df['creation_date'].fillna(merged_df['date_of_birth'] + avg_painter_age)

display(merged_df[['painter', 'creation_date', 'date_of_birth', 'painter_age_at_painting']])
merged_df

In [None]:
# Replace missing descriptive fields with explicit placeholders.
# Each result is assigned back instead of using fillna(inplace=True) on a
# column selection, which is deprecated chained assignment in pandas.
merged_df['painter'] = merged_df['painter'].fillna('Unknown Artist')
merged_df['author_country'] = merged_df['author_country'].fillna('Unknown Country')
merged_df['location_country'] = merged_df['location_country'].fillna('Unknown Country')
merged_df['location_name'] = merged_df['location_name'].fillna('Unknown Location')

# Restrict gender to two categories; any other value becomes missing here
merged_df['author_gender'] = merged_df['author_gender'].astype('category')
merged_df['author_gender'] = merged_df['author_gender'].cat.set_categories(['male', 'female'])
# NOTE(review): unknown and non-listed genders are all imputed as 'male' — confirm this is intended
merged_df['author_gender'] = merged_df['author_gender'].fillna('male')

In [None]:

# Add a column with decades: floor the creation year to a multiple of 10 (e.g. 1889 -> 1880)
merged_df['decade'] = (merged_df['creation_date'] // 10) * 10

# Display the updated DataFrame and the distinct decades present
display(merged_df[['painter', 'creation_date', 'decade']])
print(merged_df['decade'].unique())

In [None]:
# Fold the label 'German Reich' into 'Germany' in both country columns
merged_df['location_country'] = merged_df['location_country'].replace('German Reich', 'Germany')
merged_df['author_country'] = merged_df['author_country'].replace('German Reich', 'Germany')

#### Add time period

In [None]:
def classify_period(decade):
    """
    Map a decade (e.g. 1880) to the name of its art-historical period.

    Args:
        decade: Numeric decade, or a missing value

    Returns:
        The period label, or NaN when the decade is missing
    """
    # NaN fails every comparison below, so without this guard a missing
    # decade would fall through to the final label.
    if pd.isna(decade):
        return np.nan

    # (exclusive upper bound, label), in ascending order
    period_upper_bounds = (
        (1000, "Antiquity"),
        (1400, "Medieval"),
        (1500, "Early Renaissance"),
        (1600, "High Renaissance and Mannerism"),
        (1700, "Baroque"),
        (1780, "Rococo"),
        (1850, "Neoclassicism and Romanticism"),
        (1900, "Realism and Impressionism"),
        (1945, "Modern Art"),
        (1970, "Post-War and Abstract Expressionism"),
        (2000, "Contemporary Art"),
    )
    for upper_bound, label in period_upper_bounds:
        if decade < upper_bound:
            return label
    return "Contemporary and Digital Art"



# Label every painting with the period its decade falls in
merged_df['time_period'] = merged_df['decade'].apply(classify_period)


#### Add gdp and pop

In [None]:
# Economic table; the next cells join it on 'decade' and then read 'gdppc' and 'pop' from the merged frame
eco_df = pd.read_csv('data/gdp_pop_decades.csv')
eco_df

In [None]:
# NOTE(review): assumes eco_df holds at most one row per decade; otherwise this join duplicates paintings — confirm
merged_df = merged_df.merge(
    eco_df,
    on='decade',
    how='left'  # Keep all artwork records, even if no economic data exists
)

merged_df

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler with its default settings
scaler = MinMaxScaler()

# Normalize 'gdppc' and 'pop' columns; the scaler is fitted on the painting-level rows of merged_df
merged_df[['gdppc_normalized', 'pop_normalized']] = scaler.fit_transform(merged_df[['gdppc', 'pop']])

# Display the updated DataFrame (raw values next to their normalized counterparts)
display(merged_df[['gdppc', 'gdppc_normalized', 'pop', 'pop_normalized']])

## FINAL DF

In [None]:
# Subset intended for CLIP: ids and image paths of paintings flagged by the word detector
# (this cell only builds and displays the frame; nothing is written to disk here)
clip_train = merged_df[['painting_id','image_path', 'food_word_detected']]
clip_train = clip_train[clip_train['food_word_detected'] == 1]
clip_train

In [None]:
# Keep paintings that have an image URL, then narrow to the export columns.
# Despite its name, this frame is not filtered on food_detected; the flag is
# carried along as a column.
paintings_with_food = merged_df[merged_df['image_url'].notna()]
paintings_with_food = paintings_with_food[['title', 'painter', 'creation_date', 'author_gender', 'author_country', 'location_name', 'location_country', 'time_period', 'image_path', 'image_url', 'coordinates','food_detected','decade','gdppc','pop', 'gdppc_normalized', 'pop_normalized']]
paintings_with_food

## Export

In [None]:
# Write the export frame to CSV without the row index
paintings_with_food.to_csv('data/paintings_with_food.csv', index=False)

### GDP analysis

In [None]:
# Group by decade: number of non-null flags ('count') and number of flagged artworks ('sum')
food_by_decade = merged_df.groupby('decade')['food_detected'].agg(artwork_count='count', food_related_sum='sum').reset_index()
food_by_decade['proportion_food_detected'] = food_by_decade['food_related_sum'] / food_by_decade['artwork_count']
# Merge normalized GDP and population data (distinct decade/value combinations taken from merged_df)
food_by_decade = food_by_decade.merge(
    merged_df[['decade', 'gdppc_normalized', 'pop_normalized']].drop_duplicates(),
    on='decade',
    how='left'
)

# Filter the DataFrame to include only decades from 1250 to 2000 (both bounds inclusive)
food_by_decade = food_by_decade[(food_by_decade['decade'] >= 1250) & (food_by_decade['decade'] <= 2000)]
# Save the per-decade table, then display it
food_by_decade.to_csv('data/food_by_decade_analysis.csv', index=False)
food_by_decade

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Analyse the per-decade table built in the previous cell
data = food_by_decade

# Decades without economic data carry NaN after the left merge. Series.corr
# skips such pairs while pearsonr receives the raw values, so every statistic
# below is computed on the same NaN-free rows to keep them consistent.
valid = data.dropna(subset=['proportion_food_detected', 'gdppc_normalized'])

# Calculate Pearson correlation and its p-value on the same sample
correlation = valid['proportion_food_detected'].corr(valid['gdppc_normalized'])
correlation_pvalue = stats.pearsonr(valid['proportion_food_detected'], valid['gdppc_normalized'])

# Create a scatter plot with regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=valid,
            x='proportion_food_detected',
            y='gdppc_normalized',
            scatter_kws={'alpha':0.5},
            line_kws={'color': 'red'})

plt.title('Correlation between Food-Related Artwork Proportion and GDP per Capita')
plt.xlabel('Proportion of Food-Related Artwork')
plt.ylabel('Normalized GDP per Capita')

# Calculate summary statistics (all over the rows actually used above)
summary_stats = {
    'Pearson Correlation': correlation,
    'P-value': correlation_pvalue[1],
    'Sample Size': len(valid),
    'Mean Food Proportion': valid['proportion_food_detected'].mean(),
    'Mean GDP per Capita': valid['gdppc_normalized'].mean(),
}

print("\nCorrelation Analysis Results:")
for key, value in summary_stats.items():
    print(f"{key}: {value:.4f}")

# Additional analysis: Rolling correlation
# (only consumed by the disabled plot kept in the string literal below)
window_size = 10
rolling_corr = data.rolling(window=window_size).corr()[['proportion_food_detected', 'gdppc_normalized']]['gdppc_normalized'].dropna()

""" plt.figure(figsize=(10, 6))
plt.plot(data['decade'][window_size-1:len(rolling_corr)+window_size-1], rolling_corr, label='Rolling Correlation')
plt.axhline(y=0, color='r', linestyle='--', alpha=0.3)
plt.title(f'Rolling Correlation (Window Size: {window_size} decades)')
plt.xlabel('Decade')
plt.ylabel('Correlation Coefficient')
plt.legend() """

# Show plots
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr


# Step 1: Work on a copy restricted to decades that have both values.
# NaN pairs (decades without economic data) would otherwise be passed
# straight to pearsonr and np.polyfit.
df = data.dropna(subset=['gdppc_normalized', 'proportion_food_detected']).copy()

# Step 2: Calculate the Pearson correlation coefficient and p-value
x = df['gdppc_normalized']
y = df['proportion_food_detected']

corr_coeff, p_value = pearsonr(x, y)
print(f"Pearson correlation coefficient: {corr_coeff}")
print(f"P-value: {p_value}")

# Step 3: Visualize the data with a scatter plot and a regression line
plt.figure(figsize=(10, 6))
plt.scatter(x, y, color='blue', label='Data points')

# Fit a regression line (degree-1 least-squares polynomial: slope m, intercept b)
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, color='red', label='Regression line')

plt.xlabel('GDP per Capita (Normalized)')
plt.ylabel('Proportion of Food Artworks')
plt.title('Correlation between GDP per Capita and Proportion of Food Artworks')
plt.legend()
plt.show()

# Step 4: Interpret the results at the 5% significance level
if p_value < 0.05:
    print("There is a statistically significant correlation between GDP per capita and the proportion of food artworks.")
else:
    print("There is no statistically significant correlation between GDP per capita and the proportion of food artworks.")

## Exploration

In [None]:
# Frequency of each distinct full row in the export frame
print(paintings_with_food.value_counts())
# Number of non-null image paths in the merged frame
print(merged_df['image_path'].count())

### Viz

#### Countries

In [None]:
# Display the export frame for a visual check
paintings_with_food

In [None]:
# Drop every row that has a missing value in ANY column
# NOTE(review): much stricter than the targeted fills earlier in the notebook — confirm it is intended
merged_df = merged_df.dropna()


In [None]:
# Paintings flagged as containing food (food_detected is stored as 0/1).
# .copy() because a later cell adds a column to df; without it that
# assignment would target a slice of merged_df.
df = merged_df[merged_df['food_detected'] == 1].copy()
df.info()

## gdp analysis

### Visualisations

In [None]:
# Count paintings per author country (value_counts orders the result by descending count)
paintings_per_country = paintings_with_food['author_country'].value_counts().reset_index()
paintings_per_country.columns = ['Author Country', 'Number of Paintings']

# Create bar chart: one bar per country; `labels` only changes the axis titles
fig1 = px.bar(paintings_per_country, 
              x='Author Country', 
              y='Number of Paintings',
              title='Number of Paintings per Author Country',
              labels={'Author Country': 'Country', 'Number of Paintings': 'Paintings'})
fig1.show()

In [None]:
# Count rows by author gender (one row per painting, so this counts paintings rather than distinct authors)
gender_counts = paintings_with_food['author_gender'].value_counts().reset_index()
gender_counts.columns = ['Gender', 'Count']

# Create pie chart; hole=0.3 renders it as a donut
fig2 = px.pie(gender_counts, 
             names='Gender', 
             values='Count',
             title='Gender Distribution of Authors',
             hole=0.3)
fig2.show()

In [None]:
# Drop rows with missing values in the 'title' or 'creation_date' columns
paintings_with_food = paintings_with_food.dropna(subset=['title', 'creation_date'])

# Plot a reproducible 10% random sample of the paintings.
# (The former fixed-size sample(n=100) was overwritten immediately and raised
# ValueError whenever fewer than 100 rows were available.)
df_subset = paintings_with_food.sample(frac=0.1, random_state=42)

# Timeline scatter: creation year on x, one category per title on y
fig3 = px.scatter(df_subset,
                  x='creation_date',
                  y='title',
                  #title='Timeline of Paintings Creation Dates (Random Subset)',
                  #labels={'creation_date': 'Creation Date', 'title': 'Painting Title'},
                  )
fig3.update_yaxes(autorange="reversed")  # Flip the direction of the y axis
fig3.show()

In [None]:
# Extract latitude and longitude from coordinates
# The pattern expects 'Point(<first> <second>)'; the first number is stored as Longitude, the second as Latitude
paintings_with_food[['Longitude', 'Latitude']] = paintings_with_food['coordinates'].str.extract(r'Point\(([-\d.]+) ([-\d.]+)\)', expand=True).astype(float)

# Create map: one marker per painting row; the raw coordinates are hidden from the hover box
fig4 = px.scatter_geo(paintings_with_food,
                      lat='Latitude',
                      lon='Longitude',
                      hover_name='location_name',
                      hover_data={'Latitude': False, 'Longitude': False},
                      title='Geographical Distribution of Painting Locations',
                      projection='natural earth')
fig4.show()

In [None]:
# Display the export frame again, now including the Longitude/Latitude columns added above
paintings_with_food

In [None]:


# Count paintings per author and keep the ten most frequent
# (the 'Unknown Artist' placeholder is counted like any other name)
paintings_per_author = paintings_with_food['painter'].value_counts().head(10).reset_index()
paintings_per_author.columns = ['Painter', 'Number of Paintings']

# Create horizontal bar chart; rows are sorted ascending by count before plotting
fig5 = px.bar(paintings_per_author.sort_values('Number of Paintings'),
             x='Number of Paintings',
             y='Painter',
             orientation='h',
             #title='Top 10 Authors by Number of Paintings',
             #labels={'Number of Paintings': 'Paintings', 'Painter': 'Author'}
             )
fig5.show()

In [None]:
# Count paintings with and without food
food_counts = merged_df['food_detected'].value_counts().reset_index()
food_counts.columns = ['food_detected', 'Count']
# Translate the flag into readable slice labels (the keys also match the 0/1 integers)
food_counts['food_detected'] = food_counts['food_detected'].map({True: 'Contains Food', False: 'No Food'})

# Create donut chart
# `names` has to be a column of food_counts; the slice labels live in
# 'food_detected' ('Contains Food' is a label value, not a column).
fig6 = px.pie(food_counts,
             names='food_detected',
             values='Count',
             title='Paintings Containing Food vs. Not Containing Food',
             hole=0.4)
fig6.show()

In [None]:
from collections import Counter
import itertools

# Split the strings into individual words
# Each 'food_words' value is handled as text: surrounding brackets and single quotes are removed,
# then the remainder is split on ', ' and all pieces are chained into one flat list
all_food_words = list(itertools.chain.from_iterable(df['food_words'].dropna().apply(lambda x: x.strip("[]").replace("'", "").split(", "))))
# Keep the ten most frequent words as (word, count) pairs
food_counter = Counter(all_food_words).most_common(10)

# Create dataframe for plotting
food_df = pd.DataFrame(food_counter, columns=['Food Item', 'Frequency'])

# Create bar chart
fig7 = px.bar(food_df, 
             x='Food Item', 
             y='Frequency',
             title='Frequency of Different Food Items in Paintings',
             labels={'Food Item': 'Food Item', 'Frequency': 'Count'})
fig7.show()

In [None]:
# Drop rows with missing birth or creation year
scatter_df = df.dropna(subset=['date_of_birth', 'creation_date'])

# Create scatter plot with a fitted trend line
# NOTE: Plotly's 'ols' trendline relies on the statsmodels package being installed
fig8 = px.scatter(scatter_df, 
                  x='date_of_birth', 
                  y='creation_date',
                  trendline='ols',
                  title="Authors' Birth Years vs. Painting Creation Years",
                  labels={'date_of_birth': 'Author Birth Year', 'creation_date': 'Creation Year'},
                  hover_data=['painter', 'title'])
fig8.show()

In [None]:
# Select a very small subset for testing
# Drop rows with missing data
scatter_df = df.dropna(subset=['date_of_birth', 'creation_date'])
# First 100 rows only; this cell rebinds df_subset and fig3 from the earlier timeline cell
df_subset = scatter_df[['creation_date', 'title']].head(100)
fig3 = px.scatter(df_subset, 
                  x='creation_date', 
                  y='title',
                  title='Test Plot of Paintings Creation Dates')
fig3.show()

In [None]:
# Count food-flagged paintings (rows of df) per location country
paintings_per_location_country = df['location_country'].value_counts().reset_index()
paintings_per_location_country.columns = ['Location Country', 'Number of Paintings']

# Create treemap: a single hierarchy level, tile size given by the painting count
fig9 = px.treemap(paintings_per_location_country, 
                 path=['Location Country'], 
                 values='Number of Paintings',
                 title='Distribution of Paintings by Location Country')
fig9.show()

In [None]:
import hashlib

def get_wikimedia_thumbnail(filename, width=200):
    """
    Build the upload.wikimedia.org thumbnail URL for a Commons file name.

    Args:
        filename: File name as plain text or percent-encoded, with spaces or
            underscores (e.g. 'Mona Lisa.jpg', 'Mona%20Lisa.jpg')
        width: Thumbnail width in pixels

    Returns:
        The thumbnail URL as a string
    """
    from urllib.parse import quote, unquote

    # Commons stores files under the underscore form of the name, and the
    # hashed directory is derived from that form. Hashing a percent-encoded
    # or space-separated variant would point at the wrong directory.
    canonical = unquote(filename).replace(' ', '_')

    # Calculate the MD5 hash of the canonical filename
    md5_hash = hashlib.md5(canonical.encode('utf-8')).hexdigest()

    # Percent-encode the name so the result is a valid URL
    encoded = quote(canonical)

    # Construct the thumbnail URL
    url = f"https://upload.wikimedia.org/wikipedia/commons/thumb/{md5_hash[0]}/{md5_hash[:2]}/{encoded}/{width}px-{encoded}"

    return url



# Ensure there are no missing values in hover_data columns
# Take the first 10 rows that have no missing value in any of the columns used below
# NOTE(review): no visible cell creates a 'filename' column — confirm it comes from the loaded data
gallery = df[['creation_date', 'title', 'painter', 'location_name', 'filename']].dropna().head(10)
gallery['url'] = gallery['filename'].apply(get_wikimedia_thumbnail)


# hover_data order defines customdata: [0]=painter, [1]=location_name, [2]=url
fig10 = px.scatter(gallery, 
                   x='creation_date', 
                   y='title',
                   hover_data=['painter', 'location_name', 'url'],
                   title='Interactive Gallery of Paintings',
                   labels={'creation_date': 'Creation Date', 'title': 'Painting Title'},
                   template='plotly_white')

# Style the markers
fig10.update_traces(marker=dict(size=12,
                                 color='LightSkyBlue'),
                   selector=dict(mode='markers'))

fig10.update_layout(
    hovermode='closest',
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    )
)

# Update hovertemplate to include image
# NOTE(review): Plotly hover labels appear to support only a small set of HTML tags; the <img> tag may not render — confirm
fig10.update_traces(
    hovertemplate="<b>%{y}</b><br><br>" +
                  "Author: %{customdata[0]}<br>" +
                  "Location: %{customdata[1]}<br>" +
                  "<br><img src='%{customdata[2]}' width='150' height='150'><br>" +
                  "<extra></extra>"
)

fig10.show()

In [None]:


# Group by year and calculate proportions ('count' = rows per year, 'sum' = flagged rows per year)
# NOTE(review): no visible cell creates 'contains_food' (the flag built earlier is 'food_detected') — confirm the column exists
food_over_time = merged_df.groupby('creation_date')['contains_food'].agg(['count', 'sum']).reset_index()
food_over_time['Proportion with Food'] = food_over_time['sum'] / food_over_time['count']

# Create line chart (this rebinds fig2, previously the gender pie chart)
fig2 = px.line(food_over_time, 
               x='creation_date', 
               y='Proportion with Food',
               title='Proportion of Paintings Containing Food Over Time',
               labels={'creation_date': 'Year', 'Proportion with Food': 'Proportion'},
               markers=True)

fig2.show()

In [None]:


# Expand food_words into separate rows
# NOTE(review): explode only splits list-like values; an earlier cell parses 'food_words' as text,
# in which case each row keeps its whole string here — confirm the column type
df_food = df.explode('food_words').dropna(subset=['food_words'])

# Count food items per author country
food_country = df_food.groupby(['author_country', 'food_words']).size().reset_index(name='Count')

# Create stacked bar chart (this rebinds fig3)
fig3 = px.bar(food_country, 
              x='author_country', 
              y='Count', 
              color='food_words',
              title='Distribution of Food Items by Author Country',
              labels={'author_country': 'Author Country', 'food_words': 'Food Item', 'Count': 'Count'},
              barmode='stack')

fig3.show()

In [None]:
# Filter paintings that contain food
# NOTE(review): no visible cell creates 'contains_food', and 'Latitude'/'Longitude' were added to
# paintings_with_food rather than to df — confirm these columns exist on df
df_food = df[df['contains_food'] == True]

# Create heatmap (this rebinds fig4, previously the scatter_geo map)
# NOTE(review): verify that the 'stamen-terrain' map style is still available in the installed Plotly version
fig4 = px.density_mapbox(df_food, 
                         lat='Latitude', 
                         lon='Longitude',
                         radius=10,
                         center=dict(lat=20, lon=0),
                         zoom=1,
                         mapbox_style='stamen-terrain',
                         title='Geographical Heatmap of Food-Containing Paintings')

fig4.show()


In [None]:
# Calculate number of food items per painting
# Only list values are counted; anything else (including strings) contributes 0
# NOTE(review): an earlier cell parses 'food_words' as text, which would make every count 0 — confirm the column type
merged_df['num_food_items'] = merged_df['food_words'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Group by author and calculate average, keeping the ten highest
avg_food_per_author = merged_df.groupby('painter')['num_food_items'].mean().reset_index()
avg_food_per_author = avg_food_per_author.sort_values('num_food_items', ascending=False).head(10)

# Create bar chart (this rebinds fig6); bar colour encodes the same average as bar height
fig6 = px.bar(avg_food_per_author, 
              x='painter', 
              y='num_food_items',
              title='Average Number of Food Items per Painting by Author',
              labels={'painter': 'Author', 'num_food_items': 'Average Number of Food Items'},
              color='num_food_items',
              color_continuous_scale='Oranges')

fig6.show()

In [None]:
# Prepare data: one row per (painting, food item), then count per food item and location country
# NOTE(review): explode only splits list-like 'food_words' values — confirm the column type
df_food = df.explode('food_words').dropna(subset=['food_words'])
food_location = df_food.groupby(['food_words', 'location_country']).size().reset_index(name='Count')

# Select top 10 food items
# value_counts on the grouped table ranks items by the number of distinct countries they occur in
top_food = food_location['food_words'].value_counts().head(10).index
food_location_top = food_location[food_location['food_words'].isin(top_food)]

# Create bubble chart (this rebinds fig7); bubble size is the painting count
fig7 = px.scatter(food_location_top, 
                  x='location_country', 
                  y='food_words',
                  size='Count',
                  color='food_words',
                  title='Correlation Between Specific Food Items and Painting Locations',
                  labels={'location_country': 'Location Country', 'food_words': 'Food Item', 'Count': 'Number of Paintings'},
                  hover_name='food_words',
                  size_max=40)

fig7.show()

In [None]:
# Prepare pivot table from df_food as left by the previous cell:
# rows = food items, columns = author countries, cells = counts (0 where a pair never occurs)
food_country_pivot = df_food.groupby(['author_country', 'food_words']).size().reset_index(name='Count')
food_country_pivot = food_country_pivot.pivot(index='food_words', columns='author_country', values='Count').fillna(0)

# Create heatmap (this rebinds fig8)
fig8 = px.imshow(food_country_pivot,
                labels=dict(x="Author Country", y="Food Item", color="Count"),
                title="Heatmap of Food Item Frequencies Across Author Countries",
                aspect="auto",
                color_continuous_scale='Viridis')

fig8.show()

In [None]:
# Prepare data: number of food items per painting (only list values are counted, everything else is 0)
df['num_food_items'] = df['food_words'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Create box plot (this rebinds fig9)
# NOTE(review): the gender column used elsewhere in this notebook is 'author_gender' — confirm that a 'gender' column exists
fig9 = px.box(df, 
             x='gender', 
             y='num_food_items',
             title='Number of Food Items in Paintings by Author Gender',
             labels={'gender': 'Author Gender', 'num_food_items': 'Number of Food Items'},
             color='gender')

fig9.show()

In [None]:
# Prepare data: first 30 (painting, food item) rows; .copy() because columns are added below
df_food = df.explode('food_words').dropna(subset=['food_words']).head(30).copy()

# Define unique labels; node order is authors, then foods, then locations
authors = df_food['painter'].unique().tolist()
foods = df_food['food_words'].unique().tolist()
locations = df_food['location_name'].unique().tolist()

labels = authors + foods + locations

# Node index of each label inside `labels`. Every group gets its own lookup
# with the group's offset applied exactly once: searching the combined list
# and adding the offset again pointed past the intended node, and it also
# mixed groups up when the same text appeared in two of them.
author_index = {name: position for position, name in enumerate(authors)}
food_index = {name: len(authors) + position for position, name in enumerate(foods)}
location_index = {
    name: len(authors) + len(foods) + position
    for position, name in enumerate(locations)
}

# Create source and target indices
df_food['source'] = df_food['painter'].map(author_index)
df_food['target_food'] = df_food['food_words'].map(food_index)
df_food['target_location'] = df_food['location_name'].map(location_index)

# Create links for authors to food
links_auth_food = df_food.groupby(['source', 'target_food']).size().reset_index(name='value')

# Create links for food to locations
links_food_loc = df_food.groupby(['target_food', 'target_location']).size().reset_index(name='value')

# Combine links; for the second leg the food node is the SOURCE of the link,
# so it is renamed accordingly (otherwise those rows have no source at all).
links = pd.concat([
    links_auth_food.rename(columns={'target_food': 'target'}),
    links_food_loc.rename(columns={'target_food': 'source', 'target_location': 'target'})
], ignore_index=True)

# Create Sankey diagram
fig10 = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="blue"
    ),
    link=dict(
        source=links['source'],
        target=links['target'],
        value=links['value']
    ))])

fig10.update_layout(title_text="Sankey Diagram: Authors → Food Items → Locations", font_size=10)
fig10.show()