This workbook looks at the materials and technique columns in the BM and V&A combined dataset. The main idea was to see what sculpture materials and techniques were the most common in the selected museums, and how they may have changed over the years.

In [None]:
# load packages
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the combined collections CSV into a single DataFrame.
# The path is relative to the directory the notebook is run from.
dataset_path = '../data/combined_collections_dataset.csv'
df = pd.read_csv(dataset_path)

Familiarising myself with the two columns I am looking at. A visual inspection showed that some information was missing, and I noticed that some rows contained multiple materials and techniques (using Nicky’s audit code as a reference – thanks Nicky). It also looks like some rows contain Chinese characters.

In [None]:
# Quick look at the two columns of interest.
# A fixed random_state returns the same rows on every run, so anything
# noticed in this preview can be found again after a kernel restart.
# The sample size is capped at the number of rows so a small dataset
# does not raise a ValueError.
preview_columns = ["ItemMaterial", "ItemTechnique"]
df[preview_columns].sample(n=min(20, len(df)), random_state=42)

In [None]:
# Flag rows whose material / technique cell holds several values.
# A semicolon is taken as the separator; missing cells are treated as
# empty strings so they are never flagged.
materials_text = df["ItemMaterial"].fillna("")
techniques_text = df["ItemTechnique"].fillna("")
multi_materials = materials_text.str.contains(";")
multi_techniques = techniques_text.str.contains(";")

# Summary counts (True sums as 1)
print(f"Total rows: {len(df)}")
print(f"Rows with multiple materials: {multi_materials.sum()}")
print(f"Rows with multiple techniques: {multi_techniques.sum()}")

# First 20 flagged rows of each column, shown as one-column tables
material_examples = df.loc[multi_materials, ["ItemMaterial"]]
technique_examples = df.loc[multi_techniques, ["ItemTechnique"]]

print("\nExample multi-material rows:")
display(material_examples.head(20))

print("\nExample multi-technique rows:")
display(technique_examples.head(20))

In [None]:
# How many material cells are missing (NaN) versus present-but-empty ('')?
material_column = df['ItemMaterial']

nan_count = material_column.isna().sum()
print("NaN rows:", nan_count)

empty_count = material_column.eq('').sum()
print("Empty string rows:", empty_count)

Some adjustments to make looking at the data easier. I replaced missing values with unknown and considered splitting rows that contain multiple techniques or materials (`DataFrame.explode`?) but decided against it because that would duplicate all the other information in those rows. So I regrouped into new categories. Or I could have removed those rows entirely. Note that the cell below takes the simpler route: it drops rows where the material is missing or contains multiple values.

In [None]:
# Keep only rows with exactly one recorded material:
#   - the material must not be missing, and
#   - it must not contain a ';' (the multi-value separator).
# Only ItemMaterial is checked here; ItemTechnique is left untouched and
# may still hold missing or multi-value entries.
has_material = df['ItemMaterial'].notna()
has_several_materials = df['ItemMaterial'].str.contains(';', na=False)
df = df[has_material & ~has_several_materials]

In [None]:
# One-column tables listing every distinct material / technique value.
# Series.unique() keeps NaN as a value, so a missing technique counts
# as one entry in the totals printed below.
distinct_materials = df['ItemMaterial'].unique()
distinct_techniques = df['ItemTechnique'].unique()

unique_materials_df = pd.DataFrame({'ItemMaterial': distinct_materials})
unique_techniques_df = pd.DataFrame({'ItemTechnique': distinct_techniques})

print(len(unique_materials_df))
print(len(unique_techniques_df))

There were more categories than I expected. I looked into whether they could be meaningfully combined into broader groups, but I wasn’t able to find enough reliable information to do this so I chose to focus on the top 8 materials/techniques. 

In [None]:
# Frequency of every material and technique, most common first
# (value_counts sorts in descending order and skips missing values).
counts_materials = df['ItemMaterial'].value_counts()
counts_technique = df['ItemTechnique'].value_counts()

# Show the eight most frequent entries of each
print(counts_materials.iloc[:8])

print(counts_technique.iloc[:8])

Next, I tried making some charts to explore the data a bit more. I started by looking at the most common materials used in sculptures (Fig 1). Then I wanted to see whether certain materials were more common in the BM or the V&A (Fig 2). I realised that the BM has a lot more entries than the V&A, so comparing total counts was not very useful. I attempted to convert the values into percentages so the two museums could be compared more easily (Fig 3). For the final chart (Fig 4), I wanted to explore how the use of different materials changed over time, so I attempted a plot of the timeline (inspired by Jo-an’s code — thanks Jo-an).

In [None]:
# Fig 1: bar chart of the eight most frequent materials.
# The variable name states the number of entries actually selected (8).
top8_material_counts = df['ItemMaterial'].value_counts().nlargest(8)

# Explicit figure/axes so the labels are attached to this chart only.
fig, ax = plt.subplots(figsize=(10, 5))
top8_material_counts.plot(kind='bar', ax=ax)
ax.set_title("Top 8 Most Frequent Item Materials")
ax.set_xlabel("Item Material")
ax.set_ylabel("Count")
fig.tight_layout()
# plt.show() renders the chart and keeps the cell from echoing a Text repr.
plt.show()

In [None]:
# Fig 2: raw counts of the eight most common materials, split by museum.
top8_materials = df['ItemMaterial'].value_counts().nlargest(8).index

# Restrict the data to rows whose material is one of those eight.
is_top8 = df['ItemMaterial'].isin(top8_materials)
df_top8 = df[is_top8]

# Bars run from the most to the least frequent material in the subset.
material_order = df_top8['ItemMaterial'].value_counts().index

# Shared styling for the two axis labels
label_style = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(12, 6))
sns.countplot(
    data=df_top8,
    x='ItemMaterial',
    hue='Museum',
    order=material_order,
    palette='Set1')

plt.xticks(rotation=90)
plt.xlabel('Item Material', **label_style)
plt.ylabel('Count', **label_style)
plt.title('Top 8 Materials by Museum', fontsize=14, fontweight='bold')
plt.legend(title='Museum')
plt.tight_layout()
plt.show()

In [None]:
# Number of objects for every (museum, material) pair in the top-8 subset.
pair_sizes = df_top8.groupby(['Museum', 'ItemMaterial']).size()
df_counts = pair_sizes.reset_index(name='Count')

# Express each count as a share of its museum's total so both museums are
# on the same scale. The total is summed over the rows of df_counts, i.e.
# over the top-8 subset only, not over the museum's whole collection.
museum_totals = df_counts.groupby('Museum')['Count'].transform('sum')
df_counts['Percent'] = df_counts['Count'] / museum_totals * 100
# Alternative normalisation: group by 'ItemMaterial' instead of 'Museum'
# to get each museum's share of a given material.

print(df_counts)

In [None]:
# Fig 3: the same comparison as percentages instead of raw counts.
# Materials are ordered by their combined percentage across museums,
# largest first.
combined_share = df_counts.groupby('ItemMaterial')['Percent'].sum()
material_order = combined_share.sort_values(ascending=False).index

# Shared styling for the two axis labels
label_style = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(12, 6))
sns.barplot(
    data=df_counts,
    x='ItemMaterial',
    y='Percent',
    hue='Museum',
    order=material_order,
    palette='Set2')

plt.xticks(rotation=90)
plt.xlabel('Item Material', **label_style)
plt.ylabel('Percent (%)', **label_style)
plt.title('Top 8 Materials by Museum', fontsize=14, fontweight='bold')
plt.legend(title='Museum')
plt.tight_layout()
plt.show()

In [None]:
# Eight hex colours used by the timeline charts, one per category.
palette = [
    "#003f5c", "#2f4b7c",
    "#665191", "#a05195",
    "#d45087", "#f95d6a",
    "#ff7c43", "#ffa600",
]

In [None]:
# Fig 4: timeline of the eight most frequent materials.
# StartDate / EndDate are used as numeric years as-is (the filter admits
# negative values); no datetime conversion is applied.
df_filtered = df[(df['StartDate'] >= -6000) & (df['EndDate'] <= 2025)].copy()

# Top 8 materials within the date-filtered rows, and the rows that use them.
top8_materials = df_filtered['ItemMaterial'].value_counts().nlargest(8).index
df_top8 = df_filtered[df_filtered['ItemMaterial'].isin(top8_materials)].copy()

# Length of each object's date range; this becomes the bar width.
df_top8['duration'] = df_top8['EndDate'] - df_top8['StartDate']

# One colour per material, assigned in order of frequency.
material_order = df_top8['ItemMaterial'].value_counts().index
material_colors = {mat: palette[i % len(palette)] for i, mat in enumerate(material_order)}

fig, ax = plt.subplots(figsize=(20, 10))

# A single vectorised barh call draws every object's date span at once.
# Calling ax.barh once per row inside an iterrows() loop draws the same
# bars but is far slower on a large collection.
if not df_top8.empty:
    ax.barh(
        y=df_top8['ItemMaterial'].to_numpy(),
        width=df_top8['duration'].to_numpy(),
        left=df_top8['StartDate'].to_numpy(),
        color=df_top8['ItemMaterial'].map(material_colors).tolist(),
        alpha=0.8)

ax.set_title("Timeline of Materials", fontsize=20)
ax.set_xlabel("Year", fontsize=18)
ax.set_ylabel("Material", fontsize=18)
fig.savefig('../visualisations/combined_materials_timeline.png')

And a similar visualisation for the techniques column (Fig 5).

In [None]:
# Fig 5: timeline of the eight most frequent techniques.
# StartDate / EndDate are used as numeric years as-is (the filter admits
# negative values).
df_filtered = df[(df['StartDate'] >= -6000) & (df['EndDate'] <= 2025)].copy()

# Technique-specific names keep this cell from overwriting the variables
# that hold the material results.
top8_techniques = df_filtered['ItemTechnique'].value_counts().nlargest(8).index
df_top8_techniques = df_filtered[df_filtered['ItemTechnique'].isin(top8_techniques)].copy()

# Length of each object's date range; this becomes the bar width.
df_top8_techniques['duration'] = df_top8_techniques['EndDate'] - df_top8_techniques['StartDate']

# One colour per technique, assigned in order of frequency.
technique_order = df_top8_techniques['ItemTechnique'].value_counts().index
technique_colors = {tech: palette[i % len(palette)] for i, tech in enumerate(technique_order)}

fig, ax = plt.subplots(figsize=(20, 10))

# A single vectorised barh call draws every object's date span at once.
# Calling ax.barh once per row inside an iterrows() loop draws the same
# bars but is far slower on a large collection.
if not df_top8_techniques.empty:
    ax.barh(
        y=df_top8_techniques['ItemTechnique'].to_numpy(),
        width=df_top8_techniques['duration'].to_numpy(),
        left=df_top8_techniques['StartDate'].to_numpy(),
        color=df_top8_techniques['ItemTechnique'].map(technique_colors).tolist(),
        alpha=0.8)

ax.set_title("Timeline of Techniques", fontsize=20)
ax.set_xlabel("Year", fontsize=18)
ax.set_ylabel("Techniques", fontsize=18)
fig.savefig('../visualisations/combined_techniques_timeline.png')