# Food in Art

In [None]:
from typing import Dict, List, Optional

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import yaml


In [None]:


def load_config(config_path: str) -> dict:
    """
    Load configuration from a YAML file.

    Args:
        config_path: Path to YAML configuration file

    Returns:
        Dictionary containing configuration settings. An empty dictionary
        is returned when the file contains no YAML document.
    """
    # Pin the encoding: the platform default is not UTF-8 everywhere, and a
    # config with non-ASCII column names would otherwise be misread.
    with open(config_path, 'r', encoding='utf-8') as file:
        settings = yaml.safe_load(file)
    # safe_load yields None for an empty document; keep the declared dict
    # return type so callers can iterate the result unconditionally.
    return {} if settings is None else settings


In [None]:

def clean_datetime(series: pd.Series, value_type) -> pd.Series:
    """
    Convert a series to datetime, or to calendar years.

    Args:
        series: Raw column values (typically date strings)
        value_type: When equal to 'year', return the numeric year instead
            of full timestamps

    Returns:
        A datetime series (unparseable entries become NaT) or, for
        value_type == 'year', a numeric series of years (unparseable
        entries become NaN).
    """
    parsed = pd.to_datetime(series, errors='coerce')
    if value_type != 'year':
        return parsed

    years = parsed.dt.year
    # Nanosecond timestamps cannot represent dates before ~1677, so such
    # values are coerced to NaT above even though their year is readable.
    # Recover the year from ISO-style strings (YYYY-MM-DD...) in that case.
    unparsed = (parsed.isna() & series.notna()).to_numpy()
    if unparsed.any():
        iso_year = series[unparsed].astype(str).str.extract(
            r'^\s*([+-]?\d{4})-\d{2}-\d{2}', expand=False
        )
        years = years.astype('float64')
        # Positional assignment so a non-unique index cannot break alignment.
        years.loc[unparsed] = pd.to_numeric(iso_year, errors='coerce').to_numpy()
    return pd.to_numeric(years, downcast='integer', errors='coerce')

def clean_categorical(series: pd.Series, categories: Optional[List[str]] = None) -> pd.Series:
    """
    Convert a series to categorical dtype.

    Args:
        series: Column data to convert
        categories: Optional explicit list of allowed categories. Values
            not in the list become missing. When omitted (or empty), the
            categories are inferred from the data.

    Returns:
        A Series with categorical dtype that keeps the index of ``series``.
    """
    if categories:
        # astype keeps the result a Series (with its index), matching the
        # declared return type, instead of a bare pd.Categorical array.
        fixed_dtype = pd.CategoricalDtype(categories=categories, ordered=False)
        return series.astype(fixed_dtype)
    return series.astype('category')


def extract_wikidata_id(series: pd.Series) -> pd.Series:
    """Return the first ``Q<digits>`` token found in each string of the series.

    Entries without such a token yield a missing value.
    """
    string_accessor = series.str
    wikidata_ids = string_accessor.extract(r'(Q\d+)', expand=False)
    return wikidata_ids



In [None]:

def process_column(series: pd.Series, dtype: str, value_type: Optional[str] = None, categories: Optional[List[str]] = None) -> pd.Series:
    """
    Process a single column according to its configuration.

    Args:
        series: Column data to process
        dtype: Target data type
        value_type: Optional semantic hint. 'wikidata_url' extracts the
            Wikidata ID and ignores ``dtype``; for datetime columns the
            value is forwarded to ``clean_datetime`` (e.g. 'year').
        categories: Optional list of categories for categorical data

    Returns:
        Processed column data
    """
    # The Wikidata rule is checked first: it overrides whatever dtype is set.
    if value_type == 'wikidata_url':
        return extract_wikidata_id(series)

    if dtype == 'datetime64[ns]':
        return clean_datetime(series, value_type)
    if dtype == 'category':
        return clean_categorical(series, categories)
    # Any other dtype string is handed to pandas as-is.
    return series.astype(dtype)


In [None]:

def load_and_process_dataset(
    source_path: str,
    columns_config: dict
) -> pd.DataFrame:
    """
    Load and process a single dataset according to configuration.

    Args:
        source_path: Path to source CSV file
        columns_config: Mapping of target column name to its settings.
            Recognised keys per column: 'dtype' (required),
            'original_name', 'value_type', 'categories', 'is_index'.

    Returns:
        Processed DataFrame containing only the configured columns, with
        duplicate rows removed for every column flagged 'is_index'.

    Raises:
        KeyError: If a configured column is absent from the CSV after
            renaming, or a column's settings lack 'dtype'.
    """
    raw = pd.read_csv(source_path)

    # Map CSV header names onto the configured column names.
    column_mappings = {
        settings['original_name']: col_name
        for col_name, settings in columns_config.items()
        if 'original_name' in settings
    }

    # Keep only configured columns. The explicit copy makes the frame
    # independent, so the per-column assignments below do not trigger
    # pandas' SettingWithCopyWarning.
    df = raw.rename(columns=column_mappings)[list(columns_config)].copy()

    # Convert each column to its configured type.
    for column, settings in columns_config.items():
        df[column] = process_column(
            df[column],
            settings['dtype'],
            settings.get('value_type'),
            settings.get('categories')
        )

    # Columns flagged as index act as unique keys: keep the first row per
    # key value. The column is not moved into the DataFrame index, so it
    # stays available as a merge key. This runs after all conversions so
    # that inferred categories see the full data.
    for column, settings in columns_config.items():
        if settings.get('is_index', False):
            df = df.drop_duplicates(subset=column, keep='first')

    return df

def load_all_datasets(config: dict) -> Dict[str, pd.DataFrame]:
    """
    Build every dataset described in the configuration.

    Args:
        config: Mapping of dataset name to a settings dict providing the
            'source' path and the 'columns' configuration

    Returns:
        Mapping of dataset name to its processed DataFrame
    """
    datasets = {}
    for name, settings in config.items():
        source_path = settings['source']
        columns = settings['columns']
        datasets[name] = load_and_process_dataset(source_path, columns)
    return datasets


In [None]:
# Load the dataset configuration from YAML. The path is relative, so it is
# resolved against the working directory the notebook runs in.
config = load_config('config.yaml')


### Loading

In [None]:
# Read and clean every dataset listed in the configuration; the result maps
# each dataset name to its processed DataFrame.
all_data = load_all_datasets(config)

#### Load correspondence table

In [None]:
# Take the ID correspondence table and drop fully identical rows. A new frame
# is built instead of using inplace=True, so the DataFrame stored in
# all_data['ids'] is left untouched and the cell can be re-run safely.
correspondance_data = all_data['ids'].drop_duplicates()
correspondance_data

#### Load paintings data

In [None]:
# Processed table registered under the 'paintings' key of the configuration.
paintings_data = all_data['paintings']
# Bare name as last statement: shows the frame as the cell output.
paintings_data

#### Load locations data

In [None]:
# Processed table registered under the 'locations' key of the configuration.
locations_data = all_data['locations']
# Bare name as last statement: shows the frame as the cell output.
locations_data

#### Load authors data

In [None]:
# Processed table registered under the 'authors' key of the configuration.
authors_data = all_data['authors']
# Bare name as last statement: shows the frame as the cell output.
authors_data

### Merging

In [None]:
# Build one analysis table from the four sources.
# - paintings: inner join, so only IDs with a matching painting row survive
# - authors / locations: left joins, so paintings without a match are kept
df = (
    correspondance_data
    .merge(paintings_data, on='painting_id', how='inner')
    .merge(authors_data, on='author_id', how='left')
    .merge(locations_data, on='location_id', how='left')
)
df

In [None]:
# Keep only the paintings flagged as containing food. The explicit copy makes
# the filtered frame independent of the merged one, so columns added to it in
# later cells do not raise pandas' SettingWithCopyWarning.
df = df[df['contains_food']].copy()
df.info()

In [None]:
# Summary statistics of the creation_date column for the filtered paintings.
df['creation_date'].describe()


### Visualisations

In [None]:
# Tally how many paintings each author country accounts for, naming the two
# resulting columns directly while turning the counts into a table.
paintings_per_country = (
    df['author_country']
    .value_counts()
    .rename_axis('Author Country')
    .reset_index(name='Number of Paintings')
)

# Bar chart: one bar per country, in descending order of count.
fig1 = px.bar(
    data_frame=paintings_per_country,
    x='Author Country',
    y='Number of Paintings',
    title='Number of Paintings per Author Country',
    labels={'Author Country': 'Country', 'Number of Paintings': 'Paintings'},
)
fig1.show()

In [None]:
# Count authors by gender. df holds one row per painting, so each author is
# reduced to a single row first; otherwise authors with many paintings would
# be counted once per painting and skew the distribution.
gender_counts = (
    df.drop_duplicates(subset='author_id')['gender']
    .value_counts()
    .reset_index()
)
gender_counts.columns = ['Gender', 'Count']

# Create pie chart
fig2 = px.pie(gender_counts,
             names='Gender',
             values='Count',
             title='Gender Distribution of Authors',
             hole=0.3)
fig2.show()

In [None]:
# Disabled cell: the code below is wrapped in a string literal so that it does
# not execute. As the only expression in the cell, the string itself is what
# the notebook would show as output.
""" # Drop rows with missing values in 'title' column
df = df.dropna(subset=['title', 'creation_date'])

# Sample 100 random rows from the DataFrame
df_subset = df.sample(n=100, random_state=42)  # Use `n` to specify the number of rows you want in the subset

# Alternatively, if you want a percentage-based sample (e.g., 10% of the data)
df_subset = df.sample(frac=0.1, random_state=42)  # `frac` is the fraction of rows to sample

# Now, you can create the plot with this subset
fig3 = px.scatter(df_subset, 
                  x='creation_date', 
                  y='title',
                  title='Timeline of Paintings Creation Dates (Random Subset)',
                  labels={'creation_date': 'Creation Date', 'title': 'Painting Title'},
                  hover_data=['author_name'])
fig3.update_yaxes(autorange="reversed")  # Optional: To display earliest at top
fig3.show() """

In [None]:
# Extract longitude and latitude from 'Point(<lon> <lat>)' coordinate strings.
# Rows whose coordinates are missing or do not match end up as NaN.
_point_parts = df['coordinates'].str.extract(r'Point\(([-\d.]+) ([-\d.]+)\)', expand=True).astype(float)

# assign() returns a new DataFrame rather than writing columns into df in
# place; df is a filtered subset, where in-place column assignment triggers
# pandas' SettingWithCopyWarning.
df = df.assign(Longitude=_point_parts[0], Latitude=_point_parts[1])

# Create map
fig4 = px.scatter_geo(df,
                      lat='Latitude',
                      lon='Longitude',
                      hover_name='location_name',
                      hover_data={'Latitude': False, 'Longitude': False},
                      title='Geographical Distribution of Painting Locations',
                      projection='natural earth')
fig4.show()

In [None]:
# Disabled cell: the code below is wrapped in a string literal so that it does
# not execute. As the only expression in the cell, the string itself is what
# the notebook would show as output.
""" # Ensure 'author_name' column contains only scalar values
df['author_name'] = df['author_name'].apply(lambda x: x if isinstance(x, str) else str(x))

# Count paintings per author
paintings_per_author = df['author_name'].value_counts().head(10).reset_index()
paintings_per_author.columns = ['Author Name', 'Number of Paintings']

# Create horizontal bar chart
fig5 = px.bar(paintings_per_author.sort_values('Number of Paintings'),
             x='Number of Paintings',
             y='Author Name',
             orientation='h',
             title='Top 10 Authors by Number of Paintings',
             labels={'Number of Paintings': 'Paintings', 'Author Name': 'Author'})
fig5.show() """

In [None]:
# Tally paintings by their contains_food flag and give each flag a readable
# label for the chart legend.
# NOTE(review): df was already narrowed to rows where contains_food is truthy
# in an earlier cell, so this chart presumably shows a single slice -- confirm
# whether the unfiltered table was meant to be used here.
_flag_labels = {True: 'Contains Food', False: 'No Food'}
food_counts = (
    df['contains_food']
    .value_counts()
    .rename_axis('Contains Food')
    .reset_index(name='Count')
)
food_counts['Contains Food'] = food_counts['Contains Food'].map(_flag_labels)

# Donut chart (a pie with a hole) of the two groups.
fig6 = px.pie(
    data_frame=food_counts,
    names='Contains Food',
    values='Count',
    title='Paintings Containing Food vs. Not Containing Food',
    hole=0.4,
)
fig6.show()

In [None]:
import ast
from collections import Counter
import itertools


def _parse_food_words(raw):
    """Return the list of food words encoded in one ``food_words`` cell.

    The cell is expected to hold the text form of a Python list, e.g.
    "['apple', 'bread']". It is parsed as a literal so that words containing
    apostrophes or commas survive intact. Text that is not a valid list
    literal falls back to plain splitting on ", ". Blank entries (such as
    the one an empty list "[]" would otherwise produce) are dropped.
    """
    text = str(raw)
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if not isinstance(parsed, (list, tuple, set)):
        parsed = text.strip("[]").replace("'", "").split(", ")
    cleaned = (str(word).strip() for word in parsed)
    return [word for word in cleaned if word]


# Flatten the per-painting word lists into one list and count occurrences,
# most frequent first.
all_food_words = list(itertools.chain.from_iterable(df['food_words'].dropna().apply(_parse_food_words)))
food_counter = Counter(all_food_words).most_common()

# Create dataframe for plotting
food_df = pd.DataFrame(food_counter, columns=['Food Item', 'Frequency'])

# Create bar chart
fig7 = px.bar(food_df,
             x='Food Item',
             y='Frequency',
             title='Frequency of Different Food Items in Paintings',
             labels={'Food Item': 'Food Item', 'Frequency': 'Count'})
fig7.show()

In [None]:
# Disabled cell: the code below is wrapped in a string literal so that it does
# not execute. As the only expression in the cell, the string itself is what
# the notebook would show as output.
""" 

# Drop rows with missing data
scatter_df = df.dropna(subset=['date_of_birth', 'creation_year'])

# Create scatter plot
fig8 = px.scatter(scatter_df, 
                  x='date_of_birth', 
                  y='creation_year',
                  trendline='ols',
                  title="Authors' Birth Years vs. Painting Creation Years",
                  labels={'date_of_birth': 'Author Birth Year', 'creation_year': 'Creation Year'},
                  hover_data=['author_name', 'title'])
fig8.show() """

In [None]:
# Quick smoke-test plot on a small sample.
# Rows lacking either the author's birth date or the creation date are
# discarded, then the first 100 remaining rows are plotted.
scatter_df = df.dropna(subset=['date_of_birth', 'creation_date'])
df_subset = scatter_df.loc[:, ['creation_date', 'title']].iloc[:100]
fig3 = px.scatter(
    data_frame=df_subset,
    x='creation_date',
    y='title',
    title='Test Plot of Paintings Creation Dates',
)
fig3.show()

In [None]:
# Tally paintings by the country of the location that holds them, naming the
# two resulting columns directly while turning the counts into a table.
paintings_per_location_country = (
    df['location_country']
    .value_counts()
    .rename_axis('Location Country')
    .reset_index(name='Number of Paintings')
)

# Treemap: one tile per country, sized by its number of paintings.
fig9 = px.treemap(
    data_frame=paintings_per_location_country,
    path=['Location Country'],
    values='Number of Paintings',
    title='Distribution of Paintings by Location Country',
)
fig9.show()

In [None]:
# Take the first ten paintings that have every field shown in the plot or
# its hover box.
_gallery_columns = ['creation_date', 'title', 'author_name', 'location_name', 'food_words']
gallery = df[_gallery_columns].dropna().head(10)

fig10 = px.scatter(
    data_frame=gallery,
    x='creation_date',
    y='title',
    hover_data=['author_name', 'location_name', 'food_words'],
    title='Interactive Gallery of Paintings',
    labels={'creation_date': 'Creation Date', 'title': 'Painting Title'},
    template='plotly_white',
)

# Enlarge and recolour the markers. No images are attached here; the hover
# box only carries the text columns listed in hover_data.
fig10.update_traces(
    marker={'size': 12, 'color': 'LightSkyBlue'},
    selector={'mode': 'markers'},
)

# Hover shows the single nearest point.
fig10.update_layout(hovermode='closest')

fig10.show()