In [2]:
from pymilvus import Collection, connections, db
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pandas as pd
import numpy as np
import h5py
import time
import os

# Milvus connection settings are read from the environment so that no
# credentials are stored in the notebook. A missing variable raises KeyError.
HOST, PORT, USER, PASS = (
    os.environ[name]
    for name in ('MILVUS_HOST', 'MILVUS_PORT', 'MILVUS_USER', 'MILVUS_PASS')
)

Start by loading the reference table and cleaning up some of the columns we want to use. We also need to generate tokens from the ai_description column to support hybrid query experiments.

In [3]:
# Read the tab-separated photo metadata table into a DataFrame and preview it.
df = pd.read_csv(
    'data/photos.tsv000',
    delimiter='\t',
)
df.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
0,wud-eV6Vpwo,https://unsplash.com/photos/wud-eV6Vpwo,https://images.unsplash.com/photo-143924685475...,2015-08-10 22:48:30.841999,t,4273,2392,1.79,,sergio_rola,...,,,7515660,42055,silhouette of structure under red sky,,,,,LJMymdi{1IWo}Gj[w^WVICS#bbS2
1,psIMdj26lgw,https://unsplash.com/photos/psIMdj26lgw,https://images.unsplash.com/photo-144077331099...,2015-08-28 14:49:40.016052,t,3872,2176,1.78,,xcvii,...,,,1814817,5893,selective focus photography of black animal ne...,,,,,"LKKd}R^,bJD%~q4Txu%N%gxuD$xu"
2,2EDjes2hlZo,https://unsplash.com/photos/2EDjes2hlZo,https://images.unsplash.com/photo-144683489809...,2015-11-06 18:36:17.334458,t,2560,1707,1.5,Sunset reflection over river,imthinhvu,...,,,2708347,12420,photo of body body of water during golden hour,,,,,LeI{]g9u9u%1?KV@s8R-EAf#t5aL
3,WN8kSLy8KMQ,https://unsplash.com/photos/WN8kSLy8KMQ,https://images.unsplash.com/photo-144530812443...,2015-10-20 02:29:20.267471,t,2288,1520,1.51,Hiking The Mountains,bettenz,...,,,1616448,9773,green leafed trees between two rock formations,Zion National Park,37.250981,-112.950525,65.07215,LoDv=$sjD$bc.AV@ROWCtSn~s:Rj
4,QAXDmkU60OU,https://unsplash.com/photos/QAXDmkU60OU,https://images.unsplash.com/photo-144196149785...,2015-09-11 08:51:54.202624,t,2048,1371,1.49,,j,...,,,983884,9410,landscape photography of snow covered mountain...,,,,,LUIPMT9F%LoIBax]Rkj]Aet7Rjj[


In [4]:
# Counters: missing values become 0 and the column is cast to int.
for count_col in ('stats_views', 'stats_downloads'):
    df[count_col] = df[count_col].fillna(0).astype(int)

# Free-text fields: missing values become empty strings.
for text_col in ('photo_description', 'ai_description'):
    df[text_col] = df[text_col].fillna('').astype(str)

# Whitespace-split the AI description into a list of tokens per row.
df['tokens'] = df['ai_description'].str.split()
df.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,tokens
0,wud-eV6Vpwo,https://unsplash.com/photos/wud-eV6Vpwo,https://images.unsplash.com/photo-143924685475...,2015-08-10 22:48:30.841999,t,4273,2392,1.79,,sergio_rola,...,,7515660,42055,silhouette of structure under red sky,,,,,LJMymdi{1IWo}Gj[w^WVICS#bbS2,"[silhouette, of, structure, under, red, sky]"
1,psIMdj26lgw,https://unsplash.com/photos/psIMdj26lgw,https://images.unsplash.com/photo-144077331099...,2015-08-28 14:49:40.016052,t,3872,2176,1.78,,xcvii,...,,1814817,5893,selective focus photography of black animal ne...,,,,,"LKKd}R^,bJD%~q4Txu%N%gxuD$xu","[selective, focus, photography, of, black, ani..."
2,2EDjes2hlZo,https://unsplash.com/photos/2EDjes2hlZo,https://images.unsplash.com/photo-144683489809...,2015-11-06 18:36:17.334458,t,2560,1707,1.5,Sunset reflection over river,imthinhvu,...,,2708347,12420,photo of body body of water during golden hour,,,,,LeI{]g9u9u%1?KV@s8R-EAf#t5aL,"[photo, of, body, body, of, water, during, gol..."
3,WN8kSLy8KMQ,https://unsplash.com/photos/WN8kSLy8KMQ,https://images.unsplash.com/photo-144530812443...,2015-10-20 02:29:20.267471,t,2288,1520,1.51,Hiking The Mountains,bettenz,...,,1616448,9773,green leafed trees between two rock formations,Zion National Park,37.250981,-112.950525,65.07215,LoDv=$sjD$bc.AV@ROWCtSn~s:Rj,"[green, leafed, trees, between, two, rock, for..."
4,QAXDmkU60OU,https://unsplash.com/photos/QAXDmkU60OU,https://images.unsplash.com/photo-144196149785...,2015-09-11 08:51:54.202624,t,2048,1371,1.49,,j,...,,983884,9410,landscape photography of snow covered mountain...,,,,,LUIPMT9F%LoIBax]Rkj]Aet7Rjj[,"[landscape, photography, of, snow, covered, mo..."


We also need to align these data elements to the image vectors we created in 3-encode-images.ipynb. Load the HDF5 file with h5py, then merge the two dataframes to ensure ordering remains the same.

In [5]:
# Read the image vectors and their file names from the 'unsplash-512' group.
# File names are stored as bytes, so each one is decoded to str.
with h5py.File('data/unsplash-lite.hdf5', 'r') as f:
    group = f['unsplash-512']
    vectors = np.array(group['vectors'])
    fnames = np.array([raw.decode('utf-8') for raw in tqdm(group['fnames'])])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24999/24999 [00:01<00:00, 18582.92it/s]


In [6]:
# One row per encoded image, in the same order as `fnames`.
df_vect = pd.DataFrame({'fname': fnames})

# photo_id is the file name without its directory or extension. Using os.path
# avoids hard-coding a 4-character extension such as '.png'.
df_vect['photo_id'] = df_vect['fname'].map(
    lambda fname: os.path.splitext(os.path.basename(fname))[0]
)
df_vect.head()

Unnamed: 0,fname,photo_id
0,data/unsplash-512/M34IAftLLsY.png,M34IAftLLsY
1,data/unsplash-512/0ZHvH8DITFA.png,0ZHvH8DITFA
2,data/unsplash-512/c3qvF4WR0Rw.png,c3qvF4WR0Rw
3,data/unsplash-512/xXa4h--mrbM.png,xXa4h--mrbM
4,data/unsplash-512/TwFZBS0vuV0.png,TwFZBS0vuV0


In [7]:
# Left-merge the metadata onto the vector file names so that `df` keeps the
# row order of `df_vect`.
# - Dropping an existing `fname` column keeps the cell re-runnable; without it
#   a second run would produce fname_x / fname_y columns instead of `fname`.
# - validate='many_to_one' raises MergeError if a photo_id appears more than
#   once in the metadata, which would add rows and break the row alignment.
df = df_vect.merge(
    df.drop(columns=['fname'], errors='ignore'),
    on='photo_id',
    how='left',
    validate='many_to_one',
)
df.head()

Unnamed: 0,fname,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,...,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,tokens
0,data/unsplash-512/M34IAftLLsY.png,M34IAftLLsY,https://unsplash.com/photos/M34IAftLLsY,https://images.unsplash.com/photo-158015067449...,2020-01-27 18:45:03.038748,t,5472,3648,1.5,Street Photography. Coral Gables. FL.,...,,454820,2300,man in black t-shirt riding on bicycle,,,,,LeLnY[Eg}Y-A;}xFxZs:OXbHRkWV,"[man, in, black, t-shirt, riding, on, bicycle]"
1,data/unsplash-512/0ZHvH8DITFA.png,0ZHvH8DITFA,https://unsplash.com/photos/0ZHvH8DITFA,https://images.unsplash.com/photo-157895114146...,2020-01-13 21:36:13.695572,t,7360,4912,1.5,"Two waterfalls in Haifoss, Iceland.",...,,575745,4324,a large waterfall with a large amount of water...,,,,,LSE{U#oy4oM_?w%1M{Rj%MoIWCWr,"[a, large, waterfall, with, a, large, amount, ..."
2,data/unsplash-512/c3qvF4WR0Rw.png,c3qvF4WR0Rw,https://unsplash.com/photos/c3qvF4WR0Rw,https://images.unsplash.com/photo-144598606556...,2015-10-27 22:48:00.761634,t,4347,2865,1.52,Delicate flower on the ground,...,,3657652,16675,yellow petaled flower on gray brick floor,,,,,LFK9Y$0h0o%E$SNFxrai9Ps*?TIb,"[yellow, petaled, flower, on, gray, brick, floor]"
3,data/unsplash-512/xXa4h--mrbM.png,xXa4h--mrbM,https://unsplash.com/photos/xXa4h--mrbM,https://images.unsplash.com/photo-156933405103...,2019-09-24 14:08:09.099339,t,5397,3602,1.5,Cacabelos Spain Lightleaks,...,,1059665,5693,field of green trees,,,,,"LuI3,jVEX.0y%1oJR,R*R*fks:jZ","[field, of, green, trees]"
4,data/unsplash-512/TwFZBS0vuV0.png,TwFZBS0vuV0,https://unsplash.com/photos/TwFZBS0vuV0,https://images.unsplash.com/photo-158342657393...,2020-03-05 16:44:05.731578,t,2574,3504,0.73,Elephant with his owner. www.msblifestyle.com,...,,372842,2449,woman in black jacket standing beside elephant...,,,,,LDCr.hxGIpE10foeNGt7I:E2-o%1,"[woman, in, black, jacket, standing, beside, e..."


Prep the data for insertion. This is based on the CollectionSchema we defined in 4-create-collection.ipynb. This easily fits in RAM, but we would probably want a more robust (i.e., less memory-intensive) process for inserting data in a production setting. Note that this data structure is columnar.

In [8]:
# Columnar payload: one entry per field, every entry covering all rows.
# NOTE(review): the field order presumably has to match the CollectionSchema
# defined in 4-create-collection.ipynb — confirm before reordering.
scalar_fields = (
    'photo_id',
    'photographer_username',
    'photo_submitted_at',
    'stats_views',
    'stats_downloads',
    'photo_description',
    'ai_description',
)
data = [df[field].values for field in scalar_fields]
data.append(vectors)
data.append(df['fname'].values)
# Each row's token list is wrapped in a dict under the 'tokens' key.
data.append([{'tokens': row_tokens} for row_tokens in df['tokens']])

Now we can open a connection and start uploading data. Each upload must be smaller than 1024MB (a gRPC API limitation), so we'll chunk through the data with a batch size of 1000.

In [9]:
# Connect to Milvus using the settings read from the environment.
milvus_settings = dict(host=HOST, port=PORT, user=USER, password=PASS)
conn = connections.connect(**milvus_settings)

In [10]:
def get_batches(data, batchsize=1000):
    """Split columnar data into row-aligned batches.

    Args:
        data: Sequence of equally long, sliceable columns (lists, NumPy
            arrays, ...). Column k of every batch is a slice of ``data[k]``.
        batchsize: Maximum number of rows per batch; must be positive.

    Returns:
        A list of batches, each a list holding one slice per column. The
        last batch may contain fewer than ``batchsize`` rows. ``data`` with
        no columns, or with zero rows, yields an empty list.

    Raises:
        ValueError: If ``batchsize`` is not positive, or if the columns do
            not all have the same length.
    """
    if batchsize <= 0:
        raise ValueError(f'batchsize must be positive, got {batchsize}')
    if len(data) == 0:
        return []

    rowcount = len(data[0])
    # Columns of different lengths would silently yield misaligned rows.
    if any(len(col) != rowcount for col in data):
        raise ValueError('all columns must have the same number of rows')

    return [
        [col[start:start + batchsize] for col in data]
        for start in range(0, rowcount, batchsize)
    ]

def upload_data_batch(data, collection_name='unsplash_lite'):
    """Insert one batch of columnar data into the named collection.

    Args:
        data: The batch to insert, passed unchanged to ``Collection.insert``.
        collection_name: Name of the target collection.

    Returns:
        Whatever ``Collection.insert`` returns for this batch.
    """
    return Collection(collection_name).insert(data)

data_batches = get_batches(data, batchsize=1000)

# Insert the batches concurrently; executor.map returns the responses in the
# same order as the batches. The map iterator has no length, so `total` is
# passed explicitly to let tqdm draw a real progress bar.
with ThreadPoolExecutor(max_workers=8) as executor:
    response = list(tqdm(
        executor.map(upload_data_batch, data_batches),
        total=len(data_batches),
    ))

25it [00:02, 12.48it/s]


Finally, we can create an Index. Let's stick with FLAT on the float vector (image) field and use Cosine similarity as the distance/similarity metric.

In [11]:
collection = Collection('unsplash_lite')

# FLAT index with the COSINE metric on the "image" field; no extra
# index parameters are supplied.
index_params = dict(
    index_type="FLAT",
    metric_type="COSINE",
    params={},
)

collection.create_index(
    field_name="image",
    index_name="image_cosine",
    index_params=index_params,
)

Status(code=0, message=)