In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import unicodedata
import re

In [2]:
# Read the main export, decoding the file as Latin-1.
df = pd.read_csv(
    "../sources/export_07_11_24.csv",
    encoding='latin-1',
)

In [3]:
# Read the reconciliation table, decoding the file as Latin-1.
rf = pd.read_csv(
    "../sources/lml-reconciled.csv",
    encoding='latin-1',
)

In [4]:
# Missing secondary catalog numbers become empty strings so they can be
# concatenated onto the primary catalog number later.
df['Secondary Catalog Number'] = (
    df['Secondary Catalog Number'].fillna(value='')
)

In [5]:
# Same treatment for the reconciliation table: missing secondary catalog
# numbers become empty strings.
rf['Secondary Catalog Number'] = (
    rf['Secondary Catalog Number'].fillna(value='')
)

In [6]:
def get_catalog_number(smp):
    """Build the combined catalog key for one row.

    Parameters
    ----------
    smp : pandas.Series (or any mapping)
        A single row exposing 'Catalog Number' and
        'Secondary Catalog Number'.

    Returns
    -------
    str
        ``str(primary)`` followed by the secondary number; a missing
        secondary number contributes nothing.
    """
    pcn = str(smp['Catalog Number'])

    # Inspect only the field we need instead of running fillna over the whole
    # row, and coerce to str so a non-string secondary number (e.g. an int)
    # cannot break the concatenation.
    scn = smp['Secondary Catalog Number']
    scn = '' if pd.isna(scn) else str(scn)

    return pcn + scn

In [7]:
# Combined catalog key for every row of the export.
df['cat'] = [get_catalog_number(row) for _, row in df.iterrows()]

In [8]:
# Combined catalog key for every row of the reconciliation table.
rf['cat'] = [get_catalog_number(row) for _, row in rf.iterrows()]

In [9]:
# Keep only the join key and the two identifier columns.
rf = rf.loc[
    :,
    ['cat', 'Manufacturer Equivalent IDs', 'Brand Equivalent IDs'],
]

In [10]:
# Collapse fully identical rows, keeping the first occurrence.
rf = rf.drop_duplicates(keep='first')

In [11]:
# Left-join the identifier columns onto the export via the combined catalog key.
df = df.join(
    rf.set_index('cat'),
    on='cat',
)

In [12]:
#df.ProcessName = df.ProcessName.apply(lambda x:x.strip('\t'))

In [13]:
#df.Link = df.Link.apply(lambda x: 'http://vocab.getty.edu/' + "/".join(x.strip("#").split("/")[-2:]))

In [14]:
# objectid is the absolute value of PhotoID rendered as text.
df['objectid'] = [str(abs(photo_id)) for photo_id in df['PhotoID']]

In [15]:
# Display the current column labels and their order.
df.columns

Index(['PhotoID', 'Year', 'DateUncertain', 'Catalog Number',
       'Secondary Catalog Number', 'Manufacturer', 'Brand', 'Format',
       'Texture2', 'Reflectance2', 'BaseColor2', 'Weight2',
       'SurfaceDesignation2', 'Omit_from_Paperbase', 'LocationBox',
       'LocationBag', 'Exposure', 'BackprintingPrimary', 'Postcards', 'Toner',
       'Resin_coated?', 'ProcessName', 'Link', 'AATA_ID', 'cat',
       'Manufacturer Equivalent IDs', 'Brand Equivalent IDs', 'objectid'],
      dtype='object')

In [16]:
# Rename columns by name rather than by position, so a change in the export's
# column order cannot silently mislabel data. errors='raise' makes a missing
# source column fail loudly instead of being skipped.
df = df.rename(
    columns={
        'PhotoID': 'photoid',
        'Year': 'year',
        'DateUncertain': 'circa',
        'Catalog Number': 'catalog',
        'Secondary Catalog Number': 'catalog2',
        'Manufacturer': 'man',
        'Brand': 'bran',
        'Format': 'storfor',
        'Texture2': 'xd',
        'Reflectance2': 'gd',
        'BaseColor2': 'cd',
        'Weight2': 'td',
        'SurfaceDesignation2': 's',
        'Omit_from_Paperbase': 'omit',
        'LocationBox': 'locbox',
        'LocationBag': 'locbag',
        'Exposure': 'expos',
        'BackprintingPrimary': 'backp',
        'Postcards': 'postcards',
        'Toner': 'toner',
        'Resin_coated?': 'rc',
        'ProcessName': 'processname',
        'Link': 'processlink',
        'AATA_ID': 'processaata',
        'Manufacturer Equivalent IDs': 'manid',
        'Brand Equivalent IDs': 'branid',
    },
    errors='raise',
)

In [17]:
# Narrow the frame to the columns used from here on, in output order.
df = df.loc[
    :,
    [
        'objectid', 'cat', 'man', 'bran', 'year', 'circa',
        'xd', 'gd', 'cd', 'td', 's', 'storfor', 'omit',
        'locbox', 'locbag', 'expos', 'backp', 'manid', 'branid',
    ],
]

In [18]:
# Hard-coded manufacturer identifiers.
# A single .loc call with (row mask, column) writes into df itself. The former
# chained form `df.manid.loc[mask] = ...` assigns into an intermediate Series,
# which does not propagate to df when pandas Copy-on-Write is active, and the
# warning about it is hidden by the blanket warnings filter at the top.
df.loc[df.man == 'Autotype', 'manid'] = 'http://viaf.org/viaf/131563321'
df.loc[df.man == 'Foton', 'manid'] = 'http://www.wikidata.org/entity/Q115823390'

In [19]:
# Hard-coded identifier for 'Kodak London'. Uses one .loc call on df so the
# write reaches the frame even under pandas Copy-on-Write; the chained
# `df.manid.loc[...] = ...` form does not.
df.loc[df.man == 'Kodak London', 'manid'] = 'http://viaf.org/viaf/150021959'

In [20]:
def parse_set(st):
    """Collapse a set of candidate values into its single non-missing member.

    Parameters
    ----------
    st : set
        Values gathered for one group; may contain missing markers
        (NaN / None) alongside real values.

    Returns
    -------
    object or None
        The only non-missing member, or None when every member is missing
        or the set is empty.

    Raises
    ------
    AssertionError
        If the set contains more than one non-missing value.
    """
    if len(st) == 0:
        # Kept from the original behaviour: report the empty set, don't fail.
        print('err')
        return None

    nonnans = [item for item in st if not pd.isna(item)]
    if not nonnans:
        # All members are missing. Distinct NaN objects can make such a set
        # larger than one element, so handle this before the uniqueness check.
        return None

    assert len(nonnans) == 1, f'conflicting values in set: {nonnans!r}'
    return nonnans[0]

#### filling in manids and branids

In [21]:
# Collect the manid values seen for each manufacturer, reduce each set to a
# single value with parse_set, then write that value to every row of the
# manufacturer.
tmp = df[['man', 'manid']].groupby('man').agg(set)
tmp['manid'] = tmp['manid'].apply(parse_set)
d = tmp['manid'].to_dict()
df['manid'] = [d[man_name] for man_name in df['man']]

In [22]:
# Collect the branid values seen for each brand, reduce each set to a single
# value with parse_set, then write that value to every row of the brand.
tmp = df[['bran', 'branid']].groupby('bran').agg(set)
tmp['branid'] = tmp['branid'].apply(parse_set)
d = tmp['branid'].to_dict()
df['branid'] = [d[brand_name] for brand_name in df['bran']]

In [23]:
def to_filename(name):
    """Turn a label into an ASCII string usable as a file name.

    Missing input (NaN / None) yields None. Otherwise the text is trimmed,
    decomposed with NFKD and stripped of non-ASCII characters, '&' is
    spelled out as 'and', the listed punctuation characters are removed,
    and each whitespace run becomes a single underscore.
    """
    if pd.isna(name):
        return None

    # Decompose accented characters, then drop whatever is not plain ASCII.
    ascii_only = (
        unicodedata.normalize('NFKD', name.strip())
        .encode('ascii', 'ignore')
        .decode('ascii')
    )

    spelled_out = ascii_only.replace('&', 'and')

    without_punct = re.sub(
        r"[\/\\\:\*\?\"\<\>\|\#\%\&\{\}\\\<\>\*\?\$\/\'\"\:\@\+\`|=!]",
        "",
        spelled_out,
    )

    return re.sub(r"\s+", "_", without_punct)

In [24]:
# Filename-safe manufacturer names. The comparison shows whether sanitising
# kept the number of distinct values unchanged (missing values included).
df['mansafe'] = df['man'].apply(to_filename)
df['mansafe'].nunique(dropna=False) == df['man'].nunique(dropna=False)

True

In [25]:
# Filename-safe brand names. The comparison shows whether sanitising kept the
# number of distinct values unchanged (missing values included).
df['bransafe'] = df['bran'].apply(to_filename)
df['bransafe'].nunique(dropna=False) == df['bran'].nunique(dropna=False)

True

In [26]:
# Filename-safe form of the xd column. The comparison shows whether sanitising
# kept the number of distinct values unchanged (missing values included).
df['xdsafe'] = df['xd'].apply(to_filename)
df['xdsafe'].nunique(dropna=False) == df['xd'].nunique(dropna=False)

True

In [27]:
# Filename-safe form of the gd column. The comparison shows whether sanitising
# kept the number of distinct values unchanged (missing values included).
df['gdsafe'] = df['gd'].apply(to_filename)
df['gdsafe'].nunique(dropna=False) == df['gd'].nunique(dropna=False)

True

In [28]:
# Filename-safe form of the cd column. The comparison shows whether sanitising
# kept the number of distinct values unchanged (missing values included).
df['cdsafe'] = df['cd'].apply(to_filename)
df['cdsafe'].nunique(dropna=False) == df['cd'].nunique(dropna=False)

True

In [29]:
# Filename-safe form of the td column. The comparison shows whether sanitising
# kept the number of distinct values unchanged (missing values included).
df['tdsafe'] = df['td'].apply(to_filename)
df['tdsafe'].nunique(dropna=False) == df['td'].nunique(dropna=False)

True

In [30]:
# Filename-safe form of the s column. The comparison shows whether sanitising
# kept the number of distinct values unchanged (missing values included).
df['ssafe'] = df['s'].apply(to_filename)
df['ssafe'].nunique(dropna=False) == df['s'].nunique(dropna=False)

True

In [31]:
# Filename-safe form of the backp column. The comparison shows whether
# sanitising kept the number of distinct values unchanged (missing values
# included).
df['backpsafe'] = df['backp'].apply(to_filename)
df['backpsafe'].nunique(dropna=False) == df['backp'].nunique(dropna=False)

True

In [32]:
#df.processlink.loc[df.processlink=='http://vocab.getty.edu/vow/AATFullDisplay?find=&logic=AND&note=&page=1&subjectid=300149146'] = 'http://vocab.getty.edu/aat/300149146'

In [33]:
# Persist the reconciled frame as a pickle in the working directory.
df.to_pickle(path='reconc.pkl')

In [34]:
# Also write the reconciled frame as CSV, without the index column.
df.to_csv(
    path_or_buf='reconc.csv',
    index=False,
)