Preprocessing

In [37]:
import pandas as pd
import re

In [150]:
# Create an index on id
def create_index(reg_no):
    """Build an id string from a registration number.

    The value is converted with ``str`` and every run of non-word characters
    (anything that is not a letter, digit or underscore) is removed, so
    ``"1887,0717.144"`` becomes ``"18870717144"``.

    A missing value (float NaN) is stringified first and therefore comes back
    as the literal string ``"nan"``; later cells in this notebook test for
    that sentinel, so it is deliberately preserved.

    Parameters
    ----------
    reg_no : any
        Registration number as read from the CSV (string, number or NaN).

    Returns
    -------
    str
        ``str(reg_no)`` with all non-word characters stripped.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\W+', '', str(reg_no))

def clean_metadata(name):
    """Load a collection export and apply the shared clean-up steps.

    Steps, in order:
      1. read the CSV with ``pd.read_csv``;
      2. add an ``id`` column by applying ``create_index`` to 'Reg number';
      3. drop rows that are exact duplicates across all columns;
      4. drop rows that have no value in 'Image'.

    The row count is printed before and after cleaning.

    Parameters
    ----------
    name : str
        CSV location, passed straight to ``pd.read_csv``. The file must
        contain 'Reg number' and 'Image' columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, including the new ``id`` column.
    """
    raw = pd.read_csv(name)
    print("Original length: " + str(len(raw)))

    # Map over the single column instead of a row-wise DataFrame.apply:
    # it is the idiomatic form and also works when the frame has no rows.
    # NaN values are still passed to create_index (and become 'nan').
    with_id = raw.assign(id=raw["Reg number"].map(create_index))

    # Non-inplace calls: each step yields a new frame rather than mutating.
    cleaned = with_id.drop_duplicates().dropna(subset=["Image"])

    print("Final length: " + str(len(cleaned)))
    return cleaned

In [42]:
# Clean the raw Gandhara collection export with clean_metadata and save the
# cleaned frame as gandhara.csv.
gandhara = "collections-22-02-20-12_52_28-gandhara.csv"
gandhara_df = clean_metadata(gandhara)
gandhara_df.to_csv("gandhara.csv")

Original length: 196
Final length: 196


In [53]:
# Collect the Gandharan ids into a set so that the Buddha and Greek frames can
# later drop any object that already appears in the Gandhara dataset.
gandhara_ids = set(gandhara_df['id'])

In [86]:
# Load the unfiltered Buddha export and apply the shared clean-up steps.
buddha = "buddha_unfiltered.csv"
buddha_df = clean_metadata(buddha)

Original length: 479
Final length: 479


In [110]:
def excludes_words(string, word_list):
    """Return True when none of the excluded terms occurs in a ';'-separated field.

    ``string`` is converted with ``str``, all spaces are removed, and the
    result is split on ';' into entries. An excluded term only counts as
    present when it equals a whole entry (so "copper" does not match the
    entry "copper alloy"). Matching is case-sensitive.

    A missing value (NaN) is stringified to the single entry "nan", so such
    rows are kept unless "nan" itself is listed as an excluded term.

    Parameters
    ----------
    string : any
        Field value such as "votive panel; figure".
    word_list : iterable of str, or str
        Terms to exclude. A single string is treated as one term.

    Returns
    -------
    bool
        True if no excluded term matches an entry, False otherwise.
    """
    # A bare string would otherwise be exploded into single characters by the
    # set construction below.
    if isinstance(word_list, str):
        word_list = [word_list]

    # Entries lose their spaces, so the terms must lose theirs too; without
    # this a term such as "votive panel" could never match anything.
    excluded = {str(word).replace(" ", "") for word in word_list}
    entries = set(str(string).replace(" ", "").split(";"))
    return excluded.isdisjoint(entries)

In [111]:
# Build the Buddha dataset by removing, in order:
#   1. objects whose id already appears in the Gandhara dataset,
#   2. objects whose "Materials" field lists an excluded material,
#   3. objects whose "Object type" field lists an excluded type.
# Changing the word list passed to excludes_words changes which objects are
# removed; a term must equal a whole ';'-separated entry to match, and
# everything that does not match is left intact.

buddha_filtered = buddha_df[~buddha_df.id.isin(gandhara_ids)]
buddha_filtered = buddha_filtered[buddha_filtered["Materials"].apply(lambda t: excludes_words(t, ["copper", "paper"]))]
buddha_filtered = buddha_filtered[buddha_filtered["Object type"].apply(lambda t: excludes_words(t, ["coin", "wall-painting", "amulet-box"]))]
buddha_filtered

Unnamed: 0,Image,Object type,Museum number,Title,Denomination,Escapement,Description,Producer name,School/style,State,...,Acq notes (acq),Acq notes (exc),Dept,BM/Big number,Reg number,Add ids,Cat no,Banknote serial number,Joined objects,id
1,https://media.britishmuseum.org/media/Reposito...,artefact,"No: 1887,0717.144",,,,Conical spirals representing the curled hair o...,,,,...,,,Asia,,18870717.144,,,,,18870717144
2,https://media.britishmuseum.org/media/Reposito...,base,"No: 1892,0801.11",,,,"Corner of a carved base with part of a foot, p...",,,,...,,,Asia,,18920801.11,,,,,1892080111
3,https://media.britishmuseum.org/media/Reposito...,base,"No: 1890,1116.1",The Hashtnagar Pedestal (Object),,,"Carved image base, inscribed with a date in th...",,,,...,Originally received in the Department of Coins...,,Asia,,18901116.1,,,,,189011161
4,https://media.britishmuseum.org/media/Reposito...,base,"No: 1902,1002.17",,,,"Carved base with an indistinct foot, probably ...",,,,...,,,Asia,,19021002.17,,,,,1902100217
5,https://media.britishmuseum.org/media/Reposito...,block,"No: 1880,0709.40",,,,A rectangular slab carved in limestone ('Palna...,,,,...,Acquired as the result of the abolition in 187...,,Asia,,18800709.40,Miscellaneous number: Elliot no 98,,,,1880070940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,https://media.britishmuseum.org/media/Reposito...,votive panel; 木板畫,No: MAS.459,,,,"Rectangular votive panel with a pointed top, s...",,,,...,The 1917-11-28 group (with MAS numbering) refe...,,Asia,,MAS.459,Miscellaneous number: F.II.iii.2 (Stein no.),,,,MAS459
463,https://media.britishmuseum.org/media/Reposito...,votive panel; 木板畫,"No: 1907,1111.68",,,,Rectangular votive panel painted on both sides...,,,,...,The 1907-11-11 group refers to objects from St...,,Asia,,19071111.68,Miscellaneous number: D.IV.5 (Stein no.),,,,1907111168
464,https://media.britishmuseum.org/media/Reposito...,votive panel; 木板畫,"No: 1907,1111.72",,,,Rectangular votive panel painted on both sides...,,,,...,The 1907-11-11 group refers to objects from St...,,Asia,,19071111.72,Miscellaneous number: D.X.3 (Stein no.),,,,1907111172
465,https://media.britishmuseum.org/media/Reposito...,votive panel; 木板畫,"No: 1907,1111.73",,,,Rectangular votive panel painted on one side. ...,,,,...,The 1907-11-11 group refers to objects from St...,,Asia,,19071111.73,Miscellaneous number: D.X.4 (Stein no.),,,,1907111173


In [117]:
# Sanity check: True when no id occurs more than once in the filtered frame.
buddha_filtered["id"].is_unique

True

In [104]:
# Save the filtered Buddha dataset as buddha.csv.
buddha_filtered.to_csv("buddha.csv")

In [151]:
# Load the unfiltered Greek export and apply the shared clean-up steps.
greek = "greek_unfiltered.csv"
greek_df = clean_metadata(greek)

Original length: 651
Final length: 651


In [170]:
# Build the Greek dataset by removing, in order:
#   1. objects whose id already appears in the Gandhara dataset,
#   2. objects whose "Materials" field lists an excluded material,
#   3. objects whose "Object type" field lists an excluded type.
# Changing the word list passed to excludes_words changes which objects are
# removed; a term must equal a whole ';'-separated entry to match, and
# everything that does not match is left intact.

greek_filtered = greek_df[~greek_df.id.isin(gandhara_ids)]
greek_filtered = greek_filtered[greek_filtered["Materials"].apply(lambda t: excludes_words(t, ["copper", "paper", "pottery"]))]
greek_filtered = greek_filtered[greek_filtered["Object type"].apply(lambda t: excludes_words(t, ["coin", "finger-ring", "flask", "goblet"]))]
greek_filtered

Unnamed: 0,Image,Object type,Museum number,Title,Denomination,Escapement,Description,Producer name,School/style,State,...,Acq notes (acq),Acq notes (exc),Dept,BM/Big number,Reg number,Add ids,Cat no,Banknote serial number,Joined objects,id
0,https://media.britishmuseum.org/media/Reposito...,altar; base,"No: 1816,0610.330",,,,Fragment from angle of Parian marble small alt...,,,,...,,,Greek and Roman,,18160610.330,,,,,18160610330
1,https://media.britishmuseum.org/media/Reposito...,amphora,"No: 1895,1020.2",,,,Miniature terracotta flask.\r\nAn amphoriskos ...,,,,...,,,Greek and Roman,,18951020.2,,,,,189510202
5,https://media.britishmuseum.org/media/Reposito...,amulet; figure,No: null,,,,Mould-made amuletic figure in glazed compositi...,,,,...,,Excavated: 1884-1885. Donated by Egypt Explora...,External,,,"Miscellaneous number: 9,9,86,76 (Accession Num...",,,,
6,https://media.britishmuseum.org/media/Reposito...,amulet; figure,No: null,,,,Mould-made amuletic figure in glazed compositi...,,,,...,,,External,,,Miscellaneous number: H1051.2 (Accession Number),,,,
7,https://media.britishmuseum.org/media/Reposito...,amulet; figure,No: null,,,,Solid cast amuletic figure of human-shaped god...,,,,...,,Excavated in 1884. Donated by Egypt Exploratio...,External,,,Miscellaneous number: H1034.1 (Accession Number),,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,No: null,,,,"Egyptian terracotta circular stamp, sometimes ...",,,,...,,Excavated: 1884-1885. EEF 1886,External,,,Miscellaneous number: AN1886.517 (Accession Nu...,,,,
647,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,No: null,,,,"Egyptian terracotta circular stamp, sometimes ...",,,,...,,Excavated: 1884-1885. EEF 1886,External,,,Miscellaneous number: AN1886.513 (Accession Nu...,,,,
648,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,No: null,,,,"Egyptian terracotta circular stamp, sometimes ...",,,,...,,Excavated 1884-1885. 1885: excavated by Willia...,External,,,Miscellaneous number: 86.666 (Accession Number...,,,,
649,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,"No: 1965,0930.895",,,,Fragment of a terracotta stamp. There is no ha...,,,,...,Previously unregistered.,,Greek and Roman,,19650930.895,,,,,19650930895


In [171]:
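# Some items in the Greek collection have no registration number, so create_index gives them the placeholder id 'nan'. Assign them synthetic ids built from their original row index (deterministic, not random).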

def make_ids(index):
    """Return a placeholder id: the prefix "greek" followed by ``str(index)``."""
    return f"greek{index!s}"

# reset_index() moves the original row labels into an "index" column, which
# is what the placeholder ids are built from. Reassigning (instead of
# inplace=True) avoids mutating the filtered frame in place, and the guard
# keeps the cell safe to re-run without stacking a second index column.
if "index" not in greek_filtered.columns:
    greek_filtered = greek_filtered.reset_index()

# create_index turns a missing 'Reg number' into the literal string 'nan';
# only those rows receive a generated id, every other id is left untouched.
missing_id = greek_filtered["id"] == "nan"
greek_filtered.loc[missing_id, "id"] = greek_filtered.loc[missing_id, "index"].map(make_ids)
greek_filtered


Unnamed: 0,index,Image,Object type,Museum number,Title,Denomination,Escapement,Description,Producer name,School/style,...,Acq notes (acq),Acq notes (exc),Dept,BM/Big number,Reg number,Add ids,Cat no,Banknote serial number,Joined objects,id
0,0,https://media.britishmuseum.org/media/Reposito...,altar; base,"No: 1816,0610.330",,,,Fragment from angle of Parian marble small alt...,,,...,,,Greek and Roman,,18160610.330,,,,,18160610330
1,1,https://media.britishmuseum.org/media/Reposito...,amphora,"No: 1895,1020.2",,,,Miniature terracotta flask.\r\nAn amphoriskos ...,,,...,,,Greek and Roman,,18951020.2,,,,,189510202
2,5,https://media.britishmuseum.org/media/Reposito...,amulet; figure,No: null,,,,Mould-made amuletic figure in glazed compositi...,,,...,,Excavated: 1884-1885. Donated by Egypt Explora...,External,,,"Miscellaneous number: 9,9,86,76 (Accession Num...",,,,greek5
3,6,https://media.britishmuseum.org/media/Reposito...,amulet; figure,No: null,,,,Mould-made amuletic figure in glazed compositi...,,,...,,,External,,,Miscellaneous number: H1051.2 (Accession Number),,,,greek6
4,7,https://media.britishmuseum.org/media/Reposito...,amulet; figure,No: null,,,,Solid cast amuletic figure of human-shaped god...,,,...,,Excavated in 1884. Donated by Egypt Exploratio...,External,,,Miscellaneous number: H1034.1 (Accession Number),,,,greek7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,646,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,No: null,,,,"Egyptian terracotta circular stamp, sometimes ...",,,...,,Excavated: 1884-1885. EEF 1886,External,,,Miscellaneous number: AN1886.517 (Accession Nu...,,,,greek646
540,647,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,No: null,,,,"Egyptian terracotta circular stamp, sometimes ...",,,...,,Excavated: 1884-1885. EEF 1886,External,,,Miscellaneous number: AN1886.513 (Accession Nu...,,,,greek647
541,648,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,No: null,,,,"Egyptian terracotta circular stamp, sometimes ...",,,...,,Excavated 1884-1885. 1885: excavated by Willia...,External,,,Miscellaneous number: 86.666 (Accession Number...,,,,greek648
542,649,https://media.britishmuseum.org/media/Reposito...,vessel-stamp,"No: 1965,0930.895",,,,Fragment of a terracotta stamp. There is no ha...,,,...,Previously unregistered.,,Greek and Roman,,19650930.895,,,,,19650930895


In [174]:
# Count how often each id occurs; a count above 1 would indicate a duplicate.
# NOTE(review): the execution counts show this cell was last run after the
# de-duplication cell that follows it, so the output reflects the
# de-duplicated frame — confirm the intended cell order.
greek_filtered['id'].value_counts()

greek370        1
19170701125     1
191104161       1
greek76         1
18591226184     1
               ..
1907051949      1
188604011566    1
1907051945      1
19040204397     1
18561004152     1
Name: id, Length: 543, dtype: int64

In [173]:
# Keep only the first row for each id so that ids are unique in the export.
greek_filtered = greek_filtered.drop_duplicates(subset=["id"], keep="first")

In [175]:
# Save the filtered Greek dataset as greek.csv.
greek_filtered.to_csv("greek.csv")