# Ingest and process data

This notebook loads the data that was stored on Box and converts it to a dataframe and a dataset with a common format across all collections.

It also provides functionality for downloading images.

In [12]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up before each cell runs without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
from pathlib import Path

from tools.weaving_tools import BTCollection, NMSCollection, SMGCollection, VACollection

In [None]:
# Ingest the raw NMS exports (every file matching data/NMS_*.*) and save the
# result to data/NMS.csv.
# Path.glob yields matches in arbitrary, filesystem-dependent order, so the
# list is sorted to make the ingest order identical from run to run.
nms_source_files = sorted(Path('data').glob('NMS_*.*'))
nms = NMSCollection(device='mps', img_folder='nms_imgs')
nms.load_original_csvs(files=nms_source_files)
nms.save_csv('data/NMS.csv')

In [None]:
# Build the SMG collection from the JSON export, then write it out as CSV.
smg = SMGCollection(
    img_folder='smg_imgs',
    device='mps',
)
smg.load_from_json('data/smg_objects_06_06_2022.json')
smg.save_csv('data/SMG.csv')

In [None]:
# Build the BT collection from the XML catalogue, then write it out as CSV.
bt = BTCollection(
    img_folder='bt_imgs',
    device='mps',
)
bt.load_from_xml('data/bt_catalogue.xml')
bt.save_csv('data/BT.csv')

# Download images

In [None]:
# Fetch images for the SMG records stored in data/SMG.csv (img_folder: smg_imgs).
# The object gets its own name instead of the generic `collection`, which this
# notebook later binds to the Chroma collection; sharing one name for both
# meant that re-running this cell out of order broke the database cells.
smg_downloads = SMGCollection(img_folder=Path('smg_imgs'))
smg_downloads.load_from_csv('data/SMG.csv')
# NOTE(review): the meaning of `n` is defined in tools.weaving_tools - confirm.
smg_downloads.fetch_images(n=5)
# Left disabled by the original author; presumably re-enable to persist any
# changes fetch_images makes to the dataframe - TODO confirm.
# smg_downloads.save_csv('data/SMG.csv')

In [None]:
# Fetch images for the BT records stored in data/BT.csv (img_folder: bt_imgs).
# The object gets its own name instead of the generic `collection`, which this
# notebook later binds to the Chroma collection; sharing one name for both
# meant that re-running this cell out of order broke the database cells.
bt_downloads = BTCollection(img_folder=Path('bt_imgs'))
bt_downloads.load_from_csv('data/BT.csv')
# NOTE(review): the meaning of `n` is defined in tools.weaving_tools - confirm.
bt_downloads.fetch_images(n=5)


In [None]:
# Fetch images for the NMS records stored in data/NMS.csv (img_folder: nms_imgs).
# The object gets its own name instead of the generic `collection`, which this
# notebook later binds to the Chroma collection; sharing one name for both
# meant that re-running this cell out of order broke the database cells.
nms_downloads = NMSCollection(img_folder=Path('nms_imgs'))
nms_downloads.load_from_csv('data/NMS.csv')
# NOTE(review): the meaning of `n` is defined in tools.weaving_tools - confirm.
nms_downloads.fetch_images(n=5)


# Create vector database

## Vectorize image and text in each collection

In [14]:
# Checkpoint name passed to vectorize_collection for every collection below.
clip_ckpt = 'clip-ViT-B-32'  # previously tried alternative: 'openai/clip-vit-base-patch32'

In [16]:
# Reload the NMS collection from its CSV and embed its images.
nms = NMSCollection(
    img_folder='nms_imgs',
    device='mps',
)
nms.load_from_csv('data/NMS.csv')
# Each modality entry pairs a dataframe column with a modality name
# (presumably (column, modality) - confirm against tools.weaving_tools).
nms.vectorize_collection(
    clip_ckpt,
    modalities=[('img_path', 'image')],
)

Vectorizing image


  0%|          | 0/4794 [00:00<?, ?ex/s]

  0%|          | 0/4794 [00:00<?, ?ex/s]

In [18]:
# Reload the SMG collection from its CSV and embed its images.
smg = SMGCollection(device='mps', img_folder='smg_imgs')
smg.load_from_csv('data/SMG.csv')
# TEMPORARY, for testing purposes (remove later): work on a 5% sample only.
# random_state pins the sample so that re-runs embed the same rows;
# reset_index() renumbers the rows and keeps the previous labels in a new column.
smg.df = smg.df.sample(frac=0.05, random_state=42).reset_index()
smg.vectorize_collection(clip_ckpt, modalities=[('img_path', 'image')])

Vectorizing image


  0%|          | 0/2284 [00:00<?, ?ex/s]

  0%|          | 0/2284 [00:00<?, ?ex/s]

In [20]:
# Reload the BT collection from its CSV and embed its images.
bt = BTCollection(
    img_folder='bt_imgs',
    device='mps',
)
bt.load_from_csv('data/BT.csv')
# Each modality entry pairs a dataframe column with a modality name
# (presumably (column, modality) - confirm against tools.weaving_tools).
bt.vectorize_collection(
    clip_ckpt,
    modalities=[('img_path', 'image')],
)

Vectorizing image


  0%|          | 0/1106 [00:00<?, ?ex/s]

  0%|          | 0/1106 [00:00<?, ?ex/s]

In [23]:
# Load the VA collection from its CSV and embed its images.
# NOTE(review): no cell visible in this notebook produces data/VA.csv -
# confirm where that file comes from.
va = VACollection(device='mps', img_folder='va_imgs')
va.load_from_csv('data/VA.csv')
# TEMPORARY, for testing purposes (remove later): work on a 30% sample only.
# random_state pins the sample so that re-runs embed the same rows;
# reset_index() renumbers the rows and keeps the previous labels in a new column.
va.df = va.df.sample(frac=0.3, random_state=42).reset_index()
va.vectorize_collection(clip_ckpt, modalities=[('img_path', 'image')])

(5877, 9)
Vectorizing image


  0%|          | 0/3679 [00:00<?, ?ex/s]

  0%|          | 0/3679 [00:00<?, ?ex/s]

## Add URLs

In [24]:
# Build each VA image URL as <base_url><img_loc><postfix>.
base_url = 'https://framemark.vam.ac.uk/collections/'
postfix = '/full/600,/0/default.jpg'
# Map over the img_loc column instead of apply(axis=1): the same f-string is
# evaluated per value, without building a row Series for every record.
va.df['img_url'] = va.df['img_loc'].map(lambda loc: f'{base_url}{loc}{postfix}')

In [26]:
# Spot-check: display the URL generated for the first row.
va.df['img_url'].iloc[0]

'https://framemark.vam.ac.uk/collections/2021MY1152/full/600,/0/default.jpg'

In [27]:
# Build each SMG image URL as <base_url><img_loc>.
smg_base_url = 'https://coimages.sciencemuseumgroup.org.uk/images/'
# The constant base_url column is still written, as before.
smg.df['base_url'] = smg_base_url
# Map over the img_loc column instead of apply(axis=1): the same f-string is
# evaluated per value, without building a row Series for every record.
smg.df['img_url'] = smg.df['img_loc'].map(lambda loc: f'{smg_base_url}{loc}')

In [29]:
# Spot-check: display the URL generated for the first row.
smg.df['img_url'].iloc[0]

'https://coimages.sciencemuseumgroup.org.uk/images/395/653/medium_SMG00158262.jpg'

In [32]:
# Build each NMS image URL as <base_url><img_loc>, stripped of surrounding whitespace.
nms_base_url = 'https://www.nms.ac.uk/search.axd?command=getcontent&server=Detail&value='
# The constant base_url column is still written, as before.
nms.df['base_url'] = nms_base_url
# Map over the img_loc column instead of apply(axis=1): the same f-string and
# strip() are evaluated per value, without building a row Series for every record.
nms.df['img_url'] = nms.df['img_loc'].map(lambda loc: f'{nms_base_url}{loc}'.strip())

In [33]:
# Spot-check: display the URL generated for the first row.
nms.df['img_url'].iloc[0]

'https://www.nms.ac.uk/search.axd?command=getcontent&server=Detail&value=PF1047791'

In [34]:
# Build each BT image URL as <base_url><img_loc>.
bt_base_url = 'http://www.digitalarchives.bt.com/CalmView/GetImage.ashx?db=Catalog&type=default&fname='
# The constant base_url column is still written, as before.
bt.df['base_url'] = bt_base_url
# Map over the img_loc column instead of apply(axis=1): the same f-string is
# evaluated per value, without building a row Series for every record.
bt.df['img_url'] = bt.df['img_loc'].map(lambda loc: f'{bt_base_url}{loc}')

In [36]:
# Spot-check: display the URL generated for the first row.
bt.df['img_url'].iloc[0]

'http://www.digitalarchives.bt.com/CalmView/GetImage.ashx?db=Catalog&type=default&fname=TCB_473_P07760.jpg'

# Create Vector Database

In [37]:
import chromadb

In [38]:
# Create the Chroma client; path="ce_vector_db" is the directory handed to
# PersistentClient for its on-disk storage.
client = chromadb.PersistentClient(path="ce_vector_db")

In [49]:
# Start from a clean slate: drop any existing "congruence_engine" collection.
# Deliberately best-effort - if the delete fails (e.g. because the collection
# does not exist yet) the error is printed and the notebook carries on.
try:
    client.delete_collection(name="congruence_engine")
except Exception as e:
    print(e)
    

Collection congruence_engine does not exist.


In [50]:
# Fetch the "congruence_engine" collection, creating it if it is missing.
# The "hnsw:space" metadata entry requests cosine distance for the index.
collection = client.get_or_create_collection(
    name="congruence_engine",
    metadata={"hnsw:space": "cosine"},
)

In [51]:
# Add the embeddings of every collection to the Chroma collection.
# Only the 'image' modality is listed; it is the only one vectorised above.
for coll in (nms, bt, smg, va):
    for mod in ('image',):
        coll.add_embeddings_to_database(collection, mod)

In [52]:
# Sanity check: display the number of entries now stored in the collection.
collection.count()

11863

## Fin.