In [3]:
!pip -q install towhee towhee.models

In [4]:
from towhee import pipeline

In [5]:
# Load the prebuilt 'image-embedding' pipeline by name.
p = pipeline('image-embedding')

Cloning the repo: towhee/image-embedding-resnet50... Be patient and waiting printing 'Successfully'.
Successfully clone the repo: towhee/image-embedding-resnet50.
Do you want to install ruamel.yaml? [y/n]: y
ruamel.yaml installed successfully!


In [6]:
# Run the pipeline on a sample image URL and keep the result for inspection.
output = p('https://raw.githubusercontent.com/towhee-io/towhee/main/docs/02-Getting%20Started/towhee.jpeg')

Cloning the repo: towhee/image-decoder... Be patient and waiting printing 'Successfully'.
Successfully clone the repo: towhee/image-decoder.
Cloning the repo: towhee/timm-image-embedding... Be patient and waiting printing 'Successfully'.
Successfully clone the repo: towhee/timm-image-embedding.


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1_0-14fe96d1.pth" to /root/.cache/torch/hub/checkpoints/resnet50_a1_0-14fe96d1.pth


In [7]:
# Display the value returned by the pipeline call.
output

array([0.05727836, 0.06637038, 0.        , ..., 0.08792703, 0.3107774 ,
       0.        ], dtype=float32)

In [9]:
from towhee import DataCollection

In [10]:
# DataCollection over the integers 0..4.
dc = DataCollection.range(5)

In [12]:
# Add one to every element, double it, then materialize the result as a list.
dc.map(lambda n: n + 1).map(lambda n: n * 2).to_list()

[2, 4, 6, 8, 10]

In [14]:
def is_prime(x):
  """Return True if ``x`` is a prime number, False otherwise.

  Values less than or equal to 1 are never prime. Trial division stops once
  the candidate divisor squared exceeds ``x``: any composite number has a
  divisor no larger than its square root.

  Args:
    x: The number to test.

  Returns:
    bool: True when no integer in ``[2, sqrt(x)]`` divides ``x``.
  """
  if x <= 1:
    return False
  i = 2
  # Compare i*i against x in integer arithmetic rather than computing x/2 as a
  # float: float division overflows for very large ints and loses precision.
  while i * i <= x:
    if x % i == 0:
      return False
    i += 1
  return True


In [15]:
# Quick sanity check of is_prime on a known prime.
is_prime(5)

True

In [13]:
!pip -q install pandas scikit-learn scikit-learn opencv-python ipython matplotlib

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.6 MB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m25.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
# Wrap a plain Python iterator over 0..9 in a DataCollection.
# NOTE(review): being iterator-backed, this is presumably single-pass — confirm before reusing `dc`.
dc = DataCollection(iter(range(10)))

In [20]:
# Apply is_prime to every element and collect the booleans into a list.
dc.map(is_prime).to_list()

[False, False, True, True, False, True, False, True, False, False]

In [21]:
# Primes below 10 whose last decimal digit is 3, converted to strings.
dk = (
    DataCollection.range(10)
    .filter(is_prime)
    .filter(lambda n: n % 10 == 3)
    .map(str)
)

In [22]:
# Materialize the filtered/mapped collection as a list.
dk.to_list()

['3']

Pythonic Method-Chaining Style API: Designed to behave like a Python list or iterator, DataCollection is easy for Python users to understand and is compatible with most popular data science toolkits. Function/operator invocations can be chained one after another, keeping your code clean and fluent.

Exception-Safe Execution: DataCollection provides exception-safe execution, which allows the function/operator invocation chain to continue executing when an exception occurs. Data scientists can put an exception receiver at the tail of the pipeline, processing and analyzing the exceptions as data rather than as errors.

Feature-Rich Operator Repository: There are various pre-defined operators on the Towhee hub, which cover the most popular deep learning models in computer vision, NLP, and voice processing. Using these operators in the data processing pipeline can significantly accelerate your work.

In [27]:
# Download the example dataset archive (-L follows redirects, -O keeps the remote file name).
! curl -L https://github.com/towhee-io/examples/releases/download/data/reverse_image_search.zip -O
# Extract it quietly (-q), overwriting existing files without prompting (-o).
! unzip -q -o reverse_image_search.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  119M  100  119M    0     0  97.8M      0  0:00:01  0:00:01 --:--:--  123M


In [26]:
import csv
from glob import glob
from pathlib import Path
from statistics import mean

from towhee.dc2 import pipe, ops, DataCollection


# Towhee parameters
MODEL = 'resnet50'  # model name handed to ops.image_embedding.timm
DEVICE = None # if None, use default device (cuda is enabled if available)

# Image sources: either a path to a CSV file (header row is skipped and the image
# path is read from the second column, index 1) OR a glob pattern of image paths.
INSERT_SRC = 'reverse_image_search.csv'
QUERY_SRC = './test/*/*.JPEG'

In [63]:
# Milvus parameters
# NOTE(review): HOST is an instance-specific endpoint; consider loading it from configuration.
HOST = 'in01-a6ee5ad60466a21.aws-us-east-2.vectordb.zillizcloud.com'
PORT = '19530'
TOPK = 10  # presumably the number of neighbours to return per query — not used in the cells shown here
DIM = 2048 # dimension of embedding extracted by MODEL
COLLECTION_NAME = 'reverse_image_search'
# INDEX_TYPE and METRIC_TYPE are read as globals by create_milvus_collection.
INDEX_TYPE = 'IVF_FLAT'
METRIC_TYPE = 'L2'


In [28]:
# Load image path
def load_image(x):
    """Yield image paths from a CSV file or from a glob pattern.

    Args:
        x: Either a path ending in ``'csv'`` or a glob pattern. For a CSV, the
            first row is treated as a header and the image path is taken from
            the second column (index 1) of each remaining row.

    Yields:
        str: One image path per CSV data row, or per file matching the pattern.

    Raises:
        IndexError: If a non-empty CSV row has fewer than two columns.
    """
    if x.endswith('csv'):
        # newline='' lets the csv module handle line endings and embedded newlines itself.
        with open(x, newline='') as f:
            reader = csv.reader(f)
            # Skip the header. The default keeps an empty file from raising
            # StopIteration, which inside a generator surfaces as RuntimeError.
            next(reader, None)
            for item in reader:
                if not item:
                    # csv.reader yields [] for blank lines (e.g. a trailing newline).
                    continue
                yield item[1]
    else:
        yield from glob(x)

In [29]:
# Embedding pipeline: source -> image paths -> decoded images -> embedding vectors.
p_embed = (
    pipe.input('src')
        # Expand one source (CSV path or glob pattern) into many image paths.
        .flat_map('src', 'img_path', load_image)
        # Decode each image path into an image.
        .map('img_path', 'img', ops.image_decode())
        # Embed each image with the timm model named by MODEL on DEVICE.
        .map('img', 'vec', ops.image_embedding.timm(model_name=MODEL, device=DEVICE))
)

In [30]:
# Expose the path, decoded image and embedding as pipeline outputs so they can be inspected.
p_display = p_embed.output('img_path', 'img', 'vec')

Cloning the repo: towhee/image-decode... Be patient and waiting printing 'Successfully'.
Successfully clone the repo: towhee/image-decode.
Cloning the repo: image-embedding/timm... Be patient and waiting printing 'Successfully'.
Successfully clone the repo: image-embedding/timm.


In [31]:
# Run the display pipeline on the goldfish test images and render the result as a table.
DataCollection(p_display('./test/goldfish/*.JPEG')).show()

Cloning the repo: image-decode/cv2... Be patient and waiting printing 'Successfully'.
Successfully clone the repo: image-decode/cv2.


img_path,img,vec
./test/goldfish/n01443537_3883.JPEG,,"[0.0, 0.0, 0.0, ...] shape=(2048,)"


In [None]:
!pip install pymilvus

In [34]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

In [35]:
# Create milvus collection (delete first if exists)
def create_milvus_collection(collection_name, dim):
    """Create a fresh, indexed Milvus collection for image embeddings.

    An existing collection with the same name is dropped first. The schema has
    a VARCHAR primary key ``path`` and a FLOAT_VECTOR field ``embedding``; the
    index on ``embedding`` uses the module-level METRIC_TYPE and INDEX_TYPE.

    Args:
        collection_name: Name of the collection to (re)create.
        dim: Dimension of the ``embedding`` vector field.

    Returns:
        The newly created ``Collection``.
    """
    already_exists = utility.has_collection(collection_name)
    if already_exists:
        utility.drop_collection(collection_name)

    path_field = FieldSchema(
        name='path',
        dtype=DataType.VARCHAR,
        description='path to image',
        max_length=500,
        is_primary=True,
        auto_id=False,
    )
    embedding_field = FieldSchema(
        name='embedding',
        dtype=DataType.FLOAT_VECTOR,
        description='image embedding vectors',
        dim=dim,
    )
    new_collection = Collection(
        name=collection_name,
        schema=CollectionSchema(
            fields=[path_field, embedding_field],
            description='reverse image search',
        ),
    )

    new_collection.create_index(
        field_name='embedding',
        index_params={
            'metric_type': METRIC_TYPE,
            'index_type': INDEX_TYPE,
            'params': {"nlist": 2048},
        },
    )
    return new_collection

In [64]:
import os
from getpass import getpass

# Credentials must not live in the notebook: read them from the environment and
# fall back to an interactive prompt. The password that was previously committed
# here should be considered leaked and rotated.
# The connection is registered under the 'default' alias because the later
# Collection(..., using='default') and utility.* calls look it up under that name.
# NOTE(review): managed endpoints may also require TLS (secure=True) — confirm
# against the provider's connection guide.
connections.connect(
    'default',
    host=HOST,
    port=PORT,
    user=os.environ.get('MILVUS_USER', 'db_admin'),
    password=os.environ.get('MILVUS_PASSWORD') or getpass('Milvus password: '),
)

MilvusException: ignored

In [41]:
from pymilvus import CollectionSchema, FieldSchema, DataType

# Schema for a toy "book" collection: an INT64 primary key, a short VARCHAR
# title, an INT64 word count and a 2-dimensional float vector.

# Primary key.
book_id = FieldSchema(
  name="book_id", 
  dtype=DataType.INT64, 
  is_primary=True, 
)

# Title, limited to 200 characters.
book_name = FieldSchema(
  name="book_name", 
  dtype=DataType.VARCHAR, 
  max_length=200,
)

# Scalar attribute.
word_count = FieldSchema(
  name="word_count", 
  dtype=DataType.INT64,  
)

# Vector field; this is the field that is indexed and searched later on.
book_intro = FieldSchema(
  name="book_intro", 
  dtype=DataType.FLOAT_VECTOR, 
  dim=2
)

# The field order here is the column order expected when inserting data.
schema = CollectionSchema(
  fields=[book_id, book_name, word_count, book_intro], 
  description="Test book search"
)

collection_name = "book"

In [42]:
# Create the "book" collection with the schema above, split across two shards.
# NOTE(review): `using` must name the alias that was passed to connections.connect — verify.
collection = Collection(
    name=collection_name, 
    schema=schema, 
    using='default', 
    shards_num=2,
    )

In [43]:
import random
# Column-oriented sample rows: 2000 ids, their string forms, 2000 word counts
# starting at 10000, and 2000 random 2-d vectors.
data = [
  list(range(2000)),
  list(map(str, range(2000))),
  list(range(10000, 12000)),
  [[random.random(), random.random()] for _ in range(2000)],
]

In [44]:
from pymilvus import Collection
# Re-open the "book" collection by name and insert the column-oriented `data`
# (one inner list per schema field, in the order the fields were declared).
collection = Collection("book")      # Get an existing collection.
mr = collection.insert(data)  # the insert result is kept in `mr`

In [45]:
# Index settings for the vector field: IVF_FLAT with 1024 clusters, L2 distance.
index_params = dict(
  metric_type="L2",
  index_type="IVF_FLAT",
  params=dict(nlist=1024),
)

In [None]:
from pymilvus import Collection
# Build the vector index on the "book_intro" field using index_params.
collection = Collection("book")      # Get an existing collection.
collection.create_index(
  field_name="book_intro", 
  index_params=index_params
)

In [None]:
# Load the collection before searching it; it is released again in a later cell.
collection = Collection("book")      # Get an existing collection.
collection.load()

In [None]:
# Query-time settings: L2 distance, probing 10 clusters per search.
search_params = dict(metric_type="L2", params=dict(nprobe=10))

In [None]:
# Search "book_intro" for the 10 nearest neighbours of a single query vector,
# with no scalar filter and strong consistency.
results = collection.search(
    data=[[0.1, 0.2]],
    anns_field="book_intro",
    param=search_params,
    limit=10,
    expr=None,
    consistency_level="Strong",
)

In [None]:
# Only the last expression of a cell is displayed, so the ids were being silently
# dropped. Pair each hit id with its distance for the first (only) query vector.
list(zip(results[0].ids, results[0].distances))

In [None]:
# Release the collection now that the search is finished.
collection.release()

### Trying out LangChain

In [50]:
!pip -q install langchain openai tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.7 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m27.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [49]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Milvus
from langchain.document_loaders import TextLoader

# Load the source text file as LangChain documents.
# NOTE(review): absolute path — this file must exist at /content/ in the runtime; consider a configurable data directory.
loader = TextLoader('/content/vim_play.txt')
documents = loader.load()

In [51]:
import os
from getpass import getpass

# API keys must never be committed to a notebook. Use the key already present in
# the environment, otherwise prompt for it without echoing. The key that was
# previously hardcoded here should be considered leaked and revoked.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [52]:
# Split the loaded documents into chunks (chunk_size=1000, no overlap between chunks).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Embedding client; presumably picks up OPENAI_API_KEY from the environment — TODO confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed the first five chunks and store them in Milvus through LangChain.
# NOTE(review): this host differs from HOST in the Milvus-parameters cell and no
# credentials are passed — confirm this is the intended instance.
vector_db = Milvus.from_documents(
    docs[:5],
    embeddings,
    connection_args={"host":"in01-5c3eacf2535a4de.aws-us-east-2.vectordb.zillizcloud.com" ,
                     "port": "19530"},
)