In [1]:
import pandas as pd
import glob,os
import shutil
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the source records from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load 'db.pkl' from a trusted source.
smp = pd.read_pickle('db.pkl')

In [3]:
# Derive the record id as the absolute value of each PhotoID.
smp['id'] = smp['PhotoID'].apply(abs)

In [4]:
# Source columns to carry forward, in the order they are later renamed positionally.
keepcols = [
    'id', 'catalog', 'Manufacturer', 'Brand',
    'Year', 'DateUncertain', 'imfiles_md5', 'sb',
]

In [5]:
# Keep only the columns of interest. Take an explicit copy so the column
# assignments in the following cells operate on an independent frame rather
# than on a possible view of the original (avoids SettingWithCopy issues).
smp = smp[keepcols].copy()

In [6]:
# Rename the retained columns positionally (same order as keepcols).
smp.columns = [
    'ObjectNumber',    # was 'id'
    'CatalogNumber',   # was 'catalog'
    'Manufacturer',
    'Brand',
    'Year',
    'DateUncertain',
    'Images',          # was 'imfiles_md5'
    'IsSample',        # was 'sb'
]

In [7]:
# Constant credit line applied to every record.
smp['CreditLine'] = 'Lens Media Lab Collection'

In [8]:
# Classify each record from its IsSample flag. Values are iterated positionally
# instead of looked up with .loc[i] per row, so this also works when the index
# contains duplicate labels and avoids one label lookup per record.
# `== True` is kept deliberately so truthy-but-not-True values (e.g. NaN) map to 'package'.
smp['Classification'] = [
    'photographic paper sample' if is_sample == True else 'photographic paper package'
    for is_sample in smp['IsSample']
]

In [9]:
# Constant department name applied to every record.
smp['Department'] = "Lens Media Lab"

In [10]:
# Constant institution name applied to every record.
smp['Institution'] = "Institute for the Preservation of Cultural Heritage"

In [11]:
# Build the display date: prefix 'ca. ' when DateUncertain equals 1, otherwise
# use the year as-is. The two columns are walked in lockstep by position rather
# than with per-row .loc[i] lookups, which fail on a non-unique index.
smp['Date'] = [
    f'ca. {year}' if uncertain == 1 else str(year)
    for year, uncertain in zip(smp['Year'], smp['DateUncertain'])
]

In [12]:
# Human-readable label: "<Manufacturer> <Brand> <Date>". Columns are zipped by
# position instead of using per-row .loc[i] lookups, which fail on a non-unique index.
smp['ObjectLabel'] = [
    f'{manufacturer} {brand} {date}'
    for manufacturer, brand, date in zip(smp['Manufacturer'], smp['Brand'], smp['Date'])
]

In [13]:
# Constant rights statement and license URL applied to every record.
# NOTE(review): the attribution text says "public domain" while the license URL
# points to CC BY 2.0 — confirm which one is intended.
smp['Attribution'] = "Data provided about the Lens Media Lab Collection are public domain. Rights restrictions may apply to collection objects or images of those objects."
smp['License'] = "https://creativecommons.org/licenses/by/2.0/"

In [14]:
# DateUncertain has been folded into the Date column; drop it from the output.
smp = smp.drop(columns='DateUncertain')

In [15]:
# IsSample has been folded into the Classification column; drop it from the output.
smp = smp.drop(columns='IsSample')

In [16]:
# Year has been folded into the Date column; drop it from the output.
smp = smp.drop(columns='Year')

# Object Images

In [17]:
# Index label of the first record whose catalog number is '4959'.
is_4959 = smp['CatalogNumber'] == '4959'
idx = smp.index[is_4959][0]

In [18]:
# Remove the '4959c' image(s) from that record's image list.
# Written back with DataFrame.at (a single, non-chained setter): the chained
# form `smp.Images.loc[idx] = ...` may assign into a temporary Series and leave
# `smp` unchanged (always the case under pandas Copy-on-Write).
smp.at[idx, 'Images'] = [
    item for item in smp.at[idx, 'Images']
    if '4959c' not in item['filename']
]

In [19]:
def check_pattern(filenames):
    """Return True when `filenames` form a simple single-object image set.

    Every name must look like '<digits>.jpg' or '<digits><v|b|t|l|r>.jpg',
    all names must share the same digit sequence, and the bare
    '<digits>.jpg' file must be present. Otherwise return False.
    """
    name_re = re.compile(r'^(\d+)([vbtlr])?\.jpg$')

    hits = [name_re.match(name) for name in filenames]
    if any(hit is None for hit in hits):
        return False  # at least one name does not follow the convention

    stems = {hit.group(1) for hit in hits}
    if len(stems) != 1:
        return False  # zero names, or more than one numeric sequence

    (stem,) = stems
    # The un-suffixed image must be part of the set.
    return f'{stem}.jpg' in filenames

In [20]:
def check_advanced_pattern(filenames):
    """Return True when `filenames` form a package set with exactly one item image.

    Every name must look like '<digits>.jpg', '<digits><a-z>.jpg',
    '<digits>_.jpg' or '<digits>_<a-z>.jpg'; all names must share the same
    digit sequence; the bare '<digits>.jpg' must be present; and exactly one
    name must contain an underscore. Otherwise return False.
    """
    name_re = re.compile(r'^(\d+)(?:_)?([a-z])?\.jpg$')

    hits = [name_re.match(name) for name in filenames]
    if any(hit is None for hit in hits):
        return False  # at least one name does not follow the convention

    stems = {hit.group(1) for hit in hits}
    if len(stems) != 1:
        return False  # zero names, or more than one numeric sequence

    (stem,) = stems
    if f'{stem}.jpg' not in filenames:
        return False  # the un-suffixed image is missing

    # Exactly one underscore-bearing filename is allowed.
    with_underscore = [name for name in filenames if '_' in name]
    return len(with_underscore) == 1

In [21]:
def process_normal_imdict(imdict):
    """Annotate one image dict of a simple image set, in place, and return it.

    Sets 'id' to the filename without its last four characters (the
    extension), then 'label', 'rank' and 'object_thumbnail_flag' according to
    the view suffix: no suffix -> recto (rank 1, thumbnail), v -> verso (2),
    l -> left (3), r -> right (4), t -> top (5), b -> bottom (6).

    The suffix is matched case-insensitively because the caller validates the
    *lowercased* filenames; previously a name such as '123V.JPG' passed
    validation but received no label or rank. 'id' keeps the original case.
    """
    fbase = imdict['filename'][:-4]
    imdict['id'] = fbase

    if fbase.isdigit():
        imdict['label'] = 'recto'
        imdict['rank'] = 1
        imdict['object_thumbnail_flag'] = 1
        return imdict

    suffix_text = fbase.lower()
    # Checked in this order with later matches overriding earlier ones,
    # mirroring the precedence of the original sequence of independent ifs.
    views = (
        ('b', 'bottom', 6),
        ('t', 'top', 5),
        ('r', 'right', 4),
        ('l', 'left', 3),
        ('v', 'verso', 2),
    )
    for letter, label, rank in views:
        if letter in suffix_text:
            imdict['label'] = label
            imdict['rank'] = rank
            imdict['object_thumbnail_flag'] = 0

    return imdict

In [22]:
def process_advanced_imdict(imdict):
    """Annotate one image dict of a package-plus-item image set, in place, and return it.

    Sets 'id' to the filename without its last four characters. A name
    containing '_' is the item image ('item package', rank 1, thumbnail).
    All other names describe the containing package: no suffix -> recto (2),
    v -> verso (3), l -> left (4), r -> right (5), t -> top (6), b -> bottom (7).

    Suffix letters are matched case-insensitively because the caller validates
    the *lowercased* filenames; previously an upper-case suffix passed
    validation but received no label or rank. 'id' keeps the original case.
    """
    fbase = imdict['filename'][:-4]
    imdict['id'] = fbase

    if '_' in fbase:
        imdict['label'] = 'item package'
        imdict['rank'] = 1
        imdict['object_thumbnail_flag'] = 1
        return imdict

    if fbase.isdigit():
        imdict['label'] = 'containing package recto'
        imdict['rank'] = 2
        imdict['object_thumbnail_flag'] = 0

    suffix_text = fbase.lower()
    # Checked in this order with later matches overriding earlier ones,
    # mirroring the precedence of the original sequence of independent ifs.
    views = (
        ('b', 'containing package bottom', 7),
        ('t', 'containing package top', 6),
        ('r', 'containing package right', 5),
        ('l', 'containing package left', 4),
        ('v', 'containing package verso', 3),
    )
    for letter, label, rank in views:
        if letter in suffix_text:
            imdict['label'] = label
            imdict['rank'] = rank
            imdict['object_thumbnail_flag'] = 0

    return imdict

In [23]:
def process_3194(imdict):
    """Annotate an image dict for the '3194_' special case, in place, and return it.

    'id' becomes the filename without its last four characters. A name
    containing '_001' is the recto (rank 1, thumbnail); otherwise a name
    containing '_002' is the verso (rank 2). Any other name only gets 'id'.
    """
    stem = imdict['filename'][:-4]
    imdict['id'] = stem

    sides = (
        ('_001', 'recto', 1, 1),
        ('_002', 'verso', 2, 0),
    )
    for marker, side, order, is_thumbnail in sides:
        if marker in stem:
            imdict['label'] = side
            imdict['rank'] = order
            imdict['object_thumbnail_flag'] = is_thumbnail
            break  # first matching marker wins

    return imdict

In [24]:
def process_5363(imdict):
    """Annotate an image dict for the 5363-style special case, in place, and return it.

    'id' becomes the filename without its last four characters. Names
    containing '_' are item images: with a 'v' -> 'item verso' (rank 2),
    otherwise 'item recto' (rank 1, thumbnail). Names without '_' describe the
    containing package: digits only -> recto (3), v -> verso (4), b -> bottom (5).

    Letters are matched case-insensitively because the caller selects this
    handler from *lowercased* filenames; previously an upper-case 'V' on an
    item image was labelled 'item recto'. 'id' keeps the original case.
    """
    fbase = imdict['filename'][:-4]
    imdict['id'] = fbase
    key = fbase.lower()

    if '_' in key:
        if 'v' in key:
            imdict['label'] = 'item verso'
            imdict['rank'] = 2
            imdict['object_thumbnail_flag'] = 0
        else:
            imdict['label'] = 'item recto'
            imdict['rank'] = 1
            imdict['object_thumbnail_flag'] = 1
        return imdict

    if fbase.isdigit():
        imdict['label'] = 'containing package recto'
        imdict['rank'] = 3
        imdict['object_thumbnail_flag'] = 0

    # Later matches override earlier ones (bottom first, then verso).
    for letter, label, rank in (
        ('b', 'containing package bottom', 5),
        ('v', 'containing package verso', 4),
    ):
        if letter in key:
            imdict['label'] = label
            imdict['rank'] = rank
            imdict['object_thumbnail_flag'] = 0

    return imdict

In [25]:
def process_1747(imdict, rank=None):
    """Annotate an image dict for the 1747-style special case, in place, and return it.

    'id' becomes the filename without its last four characters.

    Package images (no '_' in the name): digits only -> 'package recto'
    (rank 1, thumbnail), v -> 'package verso' (2), b -> 'package bottom' (3).

    When `rank` is given, the image is treated as an item image and that rank
    is used: a 'v' in the name -> 'item verso', otherwise 'item recto'; neither
    is the thumbnail. This overrides any package label set above.

    Letters are matched case-insensitively because the caller selects this
    handler from *lowercased* filenames. 'id' keeps the original case.
    """
    fbase = imdict['filename'][:-4]
    imdict['id'] = fbase
    key = fbase.lower()
    is_package_image = '_' not in key

    if fbase.isdigit():
        imdict['label'] = 'package recto'
        imdict['rank'] = 1
        imdict['object_thumbnail_flag'] = 1
    if is_package_image and 'v' in key:
        imdict['label'] = 'package verso'
        imdict['rank'] = 2
        imdict['object_thumbnail_flag'] = 0
    if is_package_image and 'b' in key:
        imdict['label'] = 'package bottom'
        imdict['rank'] = 3
        imdict['object_thumbnail_flag'] = 0
    if rank is not None:
        imdict['label'] = 'item verso' if 'v' in key else 'item recto'
        imdict['rank'] = rank
        imdict['object_thumbnail_flag'] = 0

    return imdict

In [26]:
def process_487x(imdict):
    """Annotate an image dict for the fallback (487x-style) case, in place, and return it.

    'id' becomes the filename without its last four characters. Digits only ->
    'package' (rank 1, thumbnail); a name containing '_a' -> 'item' (rank 2);
    a name containing '_b' -> 'item' (rank 3, overriding '_a' if both occur).
    Names matching none of these only receive 'id'.

    The '_a' / '_b' markers are matched case-insensitively because the caller
    selects handlers from *lowercased* filenames. 'id' keeps the original case.
    """
    fbase = imdict['filename'][:-4]
    imdict['id'] = fbase
    key = fbase.lower()

    if fbase.isdigit():
        imdict['label'] = 'package'
        imdict['rank'] = 1
        imdict['object_thumbnail_flag'] = 1

    # Later matches override earlier ones.
    for marker, rank in (('_a', 2), ('_b', 3)):
        if marker in key:
            imdict['label'] = 'item'
            imdict['rank'] = rank
            imdict['object_thumbnail_flag'] = 0

    return imdict

In [27]:
def process_images_entry(entry):
    """Annotate the image record(s) of one object and return them as a list.

    A single dict is treated as a lone sample image: it is annotated in place
    as 'sample recto' (rank 1, thumbnail) and returned wrapped in a list.

    A list of dicts is dispatched on its lowercased filenames: the simple
    pattern, then the package-plus-item pattern, then the hard-coded special
    cases ('3194_', '5363' / four '4901' files, '1747' / more than four '4901'
    files), and finally the 487x-style fallback. Each dict is annotated in
    place by the selected handler.

    Any other input type yields None.
    """
    if isinstance(entry, dict):
        entry['id'] = entry['filename'][:-4]
        entry['label'] = 'sample recto'
        entry['rank'] = 1
        entry['object_thumbnail_flag'] = 1
        return [entry]

    if not isinstance(entry, list):
        return None

    names = [img['filename'].lower() for img in entry]

    if check_pattern(names):
        return [process_normal_imdict(img) for img in entry]
    if check_advanced_pattern(names):
        return [process_advanced_imdict(img) for img in entry]

    def every_name_has(token):
        # True when `token` occurs in each lowercased filename.
        return all(token in name for name in names)

    if every_name_has('3194_'):
        return [process_3194(img) for img in entry]

    if every_name_has('5363') or (every_name_has('4901') and len(names) == 4):
        return [process_5363(img) for img in entry]

    if every_name_has('1747') or (every_name_has('4901') and len(names) > 4):
        annotated = []
        item_rank = 3  # package images use ranks 1-3; item images continue from 4
        for img in entry:
            if '_' in img['filename']:
                item_rank += 1
                annotated.append(process_1747(img, rank=item_rank))
            else:
                annotated.append(process_1747(img))
        return annotated

    return [process_487x(img) for img in entry]

In [28]:
# Annotate every record's image entry; the result is always a list (or None for unexpected types).
smp['ImageAssets'] = list(map(process_images_entry, smp['Images']))

In [29]:
# Make image filenames unique across the whole collection. When a filename has
# already been used, append '_copy<N>' to it and save a copy of the original
# image under the new name in the copies directory.

# NOTE(review): absolute, machine-specific paths — consider making these configurable.
IMG_DIR = '/Users/damoncrockett/lml2lux/img'
COPY_DIR = '/Users/damoncrockett/lml2lux/img_copies'

fs = set()  # every filename assigned so far

# Iterate the column's values directly: a per-label `.loc[i]` lookup returns a
# Series (and trips the assert) when the index has duplicate labels.
for image_assets in smp['ImageAssets']:
    assert isinstance(image_assets, list)

    for image_asset in image_assets:
        f = image_asset['filename']

        # If the filename is already taken, find the first free '_copy<N>' variant.
        original_filename, ext = os.path.splitext(f)
        new_filename = f
        counter = 0
        while new_filename in fs:
            counter += 1
            new_filename = f"{original_filename}_copy{counter}{ext}"

        fs.add(new_filename)

        # The dict lives inside the list stored in the DataFrame cell, so this
        # in-place update is all that is needed; the former chained write-back
        # (`smp.ImageAssets.loc[i] = image_assets`) was redundant.
        image_asset['filename'] = new_filename

        original_path = os.path.join(IMG_DIR, f)
        new_path = os.path.join(IMG_DIR, new_filename)
        copy_path = os.path.join(COPY_DIR, new_filename)

        # Only renamed duplicates lack a file at new_path; copy those.
        if not os.path.isfile(new_path):
            os.makedirs(COPY_DIR, exist_ok=True)  # ensure the target directory exists
            shutil.copy(original_path, copy_path)

In [30]:
# The raw image column has been superseded by ImageAssets; drop it from the output.
smp = smp.drop(columns='Images')

In [31]:
# One plain dict per record, ready for JSON serialization.
d = smp.to_dict(orient='records')

In [32]:
import json

In [33]:
# Write the records as JSON Lines: one JSON object per line.
with open('lml_w_copies.jsonl', 'w') as out:
    for record in d:
        out.write(json.dumps(record))
        out.write('\n')