In [2]:
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType
from pymilvus import connections, db
import pandas as pd
import os

# Milvus connection settings are read from the environment so that no
# credentials are written into the notebook. A missing variable raises KeyError.
HOST, PORT, USER, PASS = (
    os.environ[var_name]
    for var_name in ('MILVUS_HOST', 'MILVUS_PORT', 'MILVUS_USER', 'MILVUS_PASS')
)

Start by loading the reference table so we can make decisions about our intended schema.

In [3]:
# The photo reference table is a tab-separated file; load it and preview
# the first rows to inform the schema choices that follow.
photos_tsv = 'data/photos.tsv000'
df = pd.read_csv(photos_tsv, sep='\t')
df.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
0,wud-eV6Vpwo,https://unsplash.com/photos/wud-eV6Vpwo,https://images.unsplash.com/photo-143924685475...,2015-08-10 22:48:30.841999,t,4273,2392,1.79,,sergio_rola,...,,,7515660,42055,silhouette of structure under red sky,,,,,LJMymdi{1IWo}Gj[w^WVICS#bbS2
1,psIMdj26lgw,https://unsplash.com/photos/psIMdj26lgw,https://images.unsplash.com/photo-144077331099...,2015-08-28 14:49:40.016052,t,3872,2176,1.78,,xcvii,...,,,1814817,5893,selective focus photography of black animal ne...,,,,,"LKKd}R^,bJD%~q4Txu%N%gxuD$xu"
2,2EDjes2hlZo,https://unsplash.com/photos/2EDjes2hlZo,https://images.unsplash.com/photo-144683489809...,2015-11-06 18:36:17.334458,t,2560,1707,1.5,Sunset reflection over river,imthinhvu,...,,,2708347,12420,photo of body body of water during golden hour,,,,,LeI{]g9u9u%1?KV@s8R-EAf#t5aL
3,WN8kSLy8KMQ,https://unsplash.com/photos/WN8kSLy8KMQ,https://images.unsplash.com/photo-144530812443...,2015-10-20 02:29:20.267471,t,2288,1520,1.51,Hiking The Mountains,bettenz,...,,,1616448,9773,green leafed trees between two rock formations,Zion National Park,37.250981,-112.950525,65.07215,LoDv=$sjD$bc.AV@ROWCtSn~s:Rj
4,QAXDmkU60OU,https://unsplash.com/photos/QAXDmkU60OU,https://images.unsplash.com/photo-144196149785...,2015-09-11 08:51:54.202624,t,2048,1371,1.49,,j,...,,,983884,9410,landscape photography of snow covered mountain...,,,,,LUIPMT9F%LoIBax]Rkj]Aet7Rjj[


Based on the above, we can define a Collection Schema. We want to keep the following columns:
- photo_id (primary key)
- photographer_username
- photo_submitted_at
- stats_views
- stats_downloads
- photo_description
- ai_description
- image (vector)

We'll also create two new columns:
- fname (image location on disk)
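- tokens (tokenized version of the ai_description, for hybrid search testing)

Note: the field definitions below declare a JSON field named `image_meta` and no `tokens` field, so this list and the schema should be reconciled.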

Start by defining each field and its datatype.

In [4]:
# Primary key of the collection: the photo identifier, a string of at most
# 16 characters.
photo_id = FieldSchema("photo_id", DataType.VARCHAR, is_primary=True, max_length=16)

# Photographer's username (up to 200 characters), with a placeholder name
# declared as the field's default value.
photographer_username = FieldSchema(
    "photographer_username",
    DataType.VARCHAR,
    default_value="xXx_Zaphod_Beeblebrox_xXx",
    max_length=200,
)

# Submission timestamp, kept as text (up to 200 characters) rather than a
# numeric type; the default is the Unix epoch.
# NOTE(review): the rows previewed from the TSV separate date and time with a
# space, while this default uses "T" -- confirm which format gets inserted.
photo_submitted_at = FieldSchema(
    "photo_submitted_at",
    DataType.VARCHAR,
    default_value="1970-01-01T00:00:00.000000",
    max_length=200,
)

# View count as a 64-bit integer, with a declared default of 0.
stats_views = FieldSchema("stats_views", DataType.INT64, default_value=0)

# Download count as a 64-bit integer, with a declared default of 0.
stats_downloads = FieldSchema("stats_downloads", DataType.INT64, default_value=0)

# Photo description text (up to 2000 characters); the declared default is the
# empty string. The previewed table shows this column is often missing.
photo_description = FieldSchema(
    "photo_description",
    DataType.VARCHAR,
    default_value="",
    max_length=2000,
)

# Text from the ai_description column (up to 200 characters); the declared
# default is the empty string.
ai_description = FieldSchema(
    "ai_description",
    DataType.VARCHAR,
    default_value="",
    max_length=200,
)

# The vector field of the collection: a 512-dimensional float vector per image.
image = FieldSchema("image", DataType.FLOAT_VECTOR, dim=512)

# New column: location of the image file on disk (up to 128 characters), with
# the empty string as the declared default.
fname = FieldSchema("fname", DataType.VARCHAR, default_value="", max_length=128)

# JSON-typed field for per-image metadata; no default value is declared for it.
image_meta = FieldSchema("image_meta", DataType.JSON)

Now combine them into a CollectionSchema.

In [5]:
# Gather the field definitions in the order they should appear in the schema.
unsplash_fields = [
    photo_id,
    photographer_username,
    photo_submitted_at,
    stats_views,
    stats_downloads,
    photo_description,
    ai_description,
    image,
    fname,
    image_meta,
]

# Dynamic fields are switched off for this collection.
schema = CollectionSchema(
    unsplash_fields,
    description="Unsplash Lite - 25K images.",
    enable_dynamic_field=False,
)

With the Collection Schema defined, we can open a connection to Milvus and create the Collection.

In [6]:
# Connect to Milvus with the settings read from the environment. No alias is
# passed here; the Collection created next refers to the 'default' connection.
# NOTE(review): connect() appears to return None in pymilvus, so `conn` likely
# holds nothing useful -- confirm before relying on it.
conn = connections.connect(host=HOST, port=PORT, user=USER, password=PASS)

In [7]:
# Bind the schema to a collection named 'unsplash_lite' over the 'default'
# connection.
collection = Collection(name='unsplash_lite', schema=schema, using='default')

We can check the status of this collection and schema by accessing methods and attributes of the collection object.

In [8]:
# Bare expression so the notebook displays the collection's is_empty attribute.
collection.is_empty

True

In [9]:
# Display the schema held by the collection object, for comparison with the
# field definitions above.
collection.schema

{'auto_id': False, 'description': 'Unsplash Lite - 25K images.', 'fields': [{'name': 'photo_id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 16}, 'is_primary': True, 'auto_id': False}, {'name': 'photographer_username', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 200}}, {'name': 'photo_submitted_at', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 200}}, {'name': 'stats_views', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'stats_downloads', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'photo_description', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2000}}, {'name': 'ai_description', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 200}}, {'name': 'image', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}, {'name': 'fname', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params':

We are now ready to insert data into the Collection.