In [194]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import unicodedata
import re

In [195]:
df = pd.read_csv("../sources/export_06_18_24.csv", encoding='latin-1')

In [196]:
rf = pd.read_csv("../sources/lml-reconciled.csv", encoding='latin-1')

In [197]:
df['Secondary Catalog Number'] = df['Secondary Catalog Number'].fillna('')

In [198]:
rf['Secondary Catalog Number'] = rf['Secondary Catalog Number'].fillna('')

In [199]:
def get_catalog_number(smp):
    pcn = str(smp['Catalog Number'])
    scn = smp.fillna('')['Secondary Catalog Number']

    return pcn + scn

In [200]:
df['cat'] = [get_catalog_number(df.iloc[i]) for i in range(len(df))]

In [201]:
rf['cat'] = [get_catalog_number(rf.iloc[i]) for i in range(len(rf))]

In [202]:
rf = rf[['cat','Manufacturer Equivalent IDs','Brand Equivalent IDs']]

In [203]:
rf = rf.drop_duplicates()

In [204]:
df = df.join(rf.set_index('cat'),'cat')

In [205]:
df.ProcessName = df.ProcessName.apply(lambda x:x.strip('\t'))

In [206]:
df.Link = df.Link.apply(lambda x: 'http://vocab.getty.edu/' + "/".join(x.strip("#").split("/")[-2:]))

In [207]:
df.columns = ['photoid','year','circa','catalog','catalog2','man','bran','storfor','xd','gd','cd','td','s','omit',
              'locbox','locbag','expos','backp','processname','processlink','processaata','cat','manid','branid']

In [208]:
df = df[['cat','man','bran','year','circa','xd','gd','cd','td','s','storfor','omit',
        'locbox','locbag','expos','backp','processname','processlink','processaata','cat','manid','branid']]

In [209]:
# Autotype: http://viaf.org/viaf/131563321
# Foton: http://www.wikidata.org/entity/Q115823390

In [210]:
df.manid.loc[df.man=='Autotype'] = 'http://viaf.org/viaf/131563321'
df.manid.loc[df.man=='Foton'] = 'http://www.wikidata.org/entity/Q115823390'

In [211]:
df.manid.loc[df.man=='Kodak London'] = 'http://viaf.org/viaf/150021959'

In [212]:
mans = df.man.unique()

In [213]:
def to_filename(name):

    if pd.isna(name):
        return None

    name = name.lower().strip()
    
    # Normalize Unicode characters to their closest ASCII representation
    normalized_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    # Remove problematic characters
    safe_name = re.sub(r"[\/\\\:\*\?\"\<\>\|\#\%\&\{\}\\\<\>\*\?\$\/\'\"\:\@\+\`|=!]", "", normalized_name)
    # Replace spaces (and any residual space-like characters) with underscores
    safe_name = re.sub(r"\s+", "_", safe_name)
    # Append the .json extension
    return safe_name

In [214]:
df['mansafe'] = df.man.apply(to_filename)

In [215]:
len(df.mansafe.unique())==len(df.man.unique())

True

In [217]:
brans = df.bran.unique()

In [218]:
df['bransafe'] = df.bran.apply(to_filename)

In [219]:
len(df.bransafe.unique())==len(df.bran.unique())

True

In [221]:
df.to_pickle('reconc.pkl')

In [239]:
df.columns

Index(['cat', 'man', 'bran', 'year', 'circa', 'xd', 'gd', 'cd', 'td', 's',
       'storfor', 'omit', 'locbox', 'locbag', 'expos', 'backp', 'processname',
       'processlink', 'processaata', 'cat', 'manid', 'branid', 'mansafe',
       'bransafe'],
      dtype='object')

In [241]:
df

Unnamed: 0,cat,man,bran,year,circa,xd,gd,cd,td,s,...,expos,backp,processname,processlink,processaata,cat.1,manid,branid,mansafe,bransafe
0,996,Kodak,Solio,1890,1,[texture unspecified],[gloss unspecified],[base color unspecified],[weight unspecified],[not specified],...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,996,http://www.wikidata.org/entity/Q486269,,kodak,solio
1,5443,DuVoll's Paper,Halo Bome,1890,1,[texture unspecified],[gloss unspecified],[base color unspecified],[weight unspecified],[not specified],...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,5443,,,duvolls_paper,halo_bome
2,366,American Aristotype,Aristo,1896,0,[texture unspecified],[gloss unspecified],[base color unspecified],[weight unspecified],[not specified],...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,366,http://viaf.org/viaf/131668213,,american_aristotype,aristo
3,1452,American Self-Toning,Self-Toning,1898,1,Linen,[gloss unspecified],[base color unspecified],[weight unspecified],[not specified],...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,1452,,,american_self-toning,self-toning
4,5563,Kodak,Solio,1898,0,[texture unspecified],Glossy,Pearly White,[weight unspecified],Pensé (Velox),...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,5563,http://www.wikidata.org/entity/Q486269,,kodak,solio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7169,5245,Foma,Fomatone MG,2006,1,[texture unspecified],Matte,Chamois,Natural,542,...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,5245,http://www.wikidata.org/entity/Q1435608,,foma,fomatone_mg
7170,5417,Slavich Photographic Paper,Unibrom,2006,0,Smooth,Glossy,[base color unspecified],Medium Weight,[not specified],...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,5417,,,slavich_photographic_paper,unibrom
7171,2993,Kodak,Polymax Fine Art,2008,0,Smooth,Glossy,White,Double Weight,F,...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,2993,http://www.wikidata.org/entity/Q486269,,kodak,polymax_fine_art
7172,5023,Foma,Fomatone MG,2010,1,[texture unspecified],[gloss unspecified],[base color unspecified],[weight unspecified],[not specified],...,,,gelatin silver process,http://vocab.getty.edu/aat/300139114,300139114,5023,http://www.wikidata.org/entity/Q1435608,,foma,fomatone_mg
