# Embeddings laden

In [1]:
import gensim

In [2]:
from gensim.models import KeyedVectors

In [3]:
# Load the pretrained word2vec vectors from the binary file.
w2v = KeyedVectors.load_word2vec_format(
    "newsticker.bin",
    binary=True,
)

In [4]:
# Number of entries in the loaded embedding (displayed as the cell output).
len(w2v)

393997

In [5]:
# Dimensionality of the embedding vectors.
w2v.vector_size

300

In [6]:
# Sanity check: words most similar to "linux", with their similarity scores.
w2v.most_similar("linux")

[('linux_bsd', 0.5779598355293274),
 ('unix-derivate', 0.5751985311508179),
 ('openvms', 0.5634273290634155),
 ('suse_downloadseite', 0.5558210015296936),
 ('motif', 0.5551407933235168),
 ('gnu_linux', 0.5539237856864929),
 ('linux-variant', 0.5514055490493774),
 ('unix-ähnlich_betriebssystem', 0.5510704517364502),
 ('linux-distribution', 0.5510043501853943),
 ('unix-derivat', 0.5471871495246887)]

# Qdrant

In [22]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, FieldCondition, \
                                 MatchValue, OptimizersConfigDiff, Filter, \
                                 PayloadSchemaType, BinaryQuantization, BinaryQuantizationConfig

# Client for the local Qdrant instance; gRPC is preferred over REST.
qc = QdrantClient(
    host="localhost",
    port=6333,
    grpc_port=6334,
    prefer_grpc=True,
    timeout=6000,
)

In [None]:
# (Re)create the "newsticker" collection with cosine-distance vectors whose
# size matches the loaded embeddings.
vector_params = VectorParams(size=w2v.vector_size, distance=Distance.COSINE)
optimizer_settings = OptimizersConfigDiff(memmap_threshold=0)
qc.recreate_collection(
    collection_name="newsticker",
    vectors_config=vector_params,
    optimizers_config=optimizer_settings,
    on_disk_payload=False,
)

In [None]:
%%time
# One payload dict per vector, in the same order as w2v.vectors.
word_payloads = [{"word": token} for token in w2v.index_to_key]
qc.upload_collection(
    collection_name="newsticker",
    vectors=w2v.vectors,
    payload=word_payloads,
    ids=None,  # vector ids will be assigned automatically
)

In [10]:
%%time
# Look up the point for "linux" via its payload. The query vector is all
# zeros; the payload filter restricts the hits to points with that word.
zero_query = [0.0] * w2v.vector_size
linux_filter = Filter(
    must=[FieldCondition(key="word", match=MatchValue(value="linux"))]
)
r = qc.search(
    collection_name="newsticker",
    query_vector=zero_query,
    query_filter=linux_filter,
    with_vectors=True,
    limit=10,
)

CPU times: user 6.39 ms, sys: 1.22 ms, total: 7.61 ms
Wall time: 37.9 ms


In [13]:
# Add a keyword index on the "word" payload field of the collection.
qc.create_payload_index(
    collection_name="newsticker",
    field_name="word",
    field_schema=PayloadSchemaType.KEYWORD,
)

UpdateResult(operation_id=18472, status=<UpdateStatus.COMPLETED: 'completed'>)

In [14]:
%%time
# Repeat the filtered lookup for "linux", now that the payload index exists.
word_condition = FieldCondition(key="word", match=MatchValue(value="linux"))
r = qc.search(
    collection_name="newsticker",
    query_vector=[0.0] * w2v.vector_size,
    query_filter=Filter(must=[word_condition]),
    with_vectors=True,
    limit=10,
)

CPU times: user 3.14 ms, sys: 0 ns, total: 3.14 ms
Wall time: 34.3 ms


In [15]:
%%time
# Nearest-neighbour query using the vector of the first hit in `r`;
# vectors are not requested for the results.
r2 = qc.search(
    collection_name="newsticker",
    query_vector=r[0].vector,
    limit=10,
    with_vectors=False,
)

CPU times: user 1.52 ms, sys: 738 µs, total: 2.26 ms
Wall time: 3.15 ms


In [17]:
# Show each neighbour as [score, word].
[[hit.score, hit.payload["word"]] for hit in r2]

[[0.9999999403953552, 'linux'],
 [0.5779598951339722, 'linux_bsd'],
 [0.5751985907554626, 'unix-derivate'],
 [0.5634274482727051, 'openvms'],
 [0.5558210611343384, 'suse_downloadseite'],
 [0.5551407337188721, 'motif'],
 [0.5539238452911377, 'gnu_linux'],
 [0.5514057874679565, 'linux-variant'],
 [0.5510704517364502, 'unix-ähnlich_betriebssystem'],
 [0.5510043501853943, 'linux-distribution']]

In [24]:
%%time
# Enable binary quantization on the collection (always_ram=True).
binary_quantization = BinaryQuantization(
    binary=BinaryQuantizationConfig(always_ram=True)
)
qc.update_collection(
    collection_name="newsticker",
    quantization_config=binary_quantization,
)

CPU times: user 2.79 ms, sys: 1.29 ms, total: 4.08 ms
Wall time: 1.72 s


True

In [25]:
from tqdm.auto import tqdm

# Benchmark: fetch every word's point through the payload filter.
# The all-zero query vector is loop-invariant, so build it once instead of
# allocating a new list on each iteration.
zero_query = [0.0] * w2v.vector_size
for w in tqdm(w2v.index_to_key, total=len(w2v.index_to_key)):
    r = qc.search(
        collection_name="newsticker",
        query_vector=zero_query,
        with_vectors=True,
        limit=10,
        query_filter=Filter(
            must=[FieldCondition(key="word", match=MatchValue(value=w))]
        ),
    )

  0%|          | 0/393997 [00:00<?, ?it/s]

In [50]:
# Benchmark: run a 10-nearest-neighbour search for every stored vector.
for v in tqdm(w2v.vectors, total=len(w2v.vectors)):
    r = qc.search(
        collection_name="newsticker",
        query_vector=v,
        limit=10,
        with_vectors=True,
    )

  0%|          | 0/393997 [00:00<?, ?it/s]

# Postgres

In [None]:
# SQLAlchemy engine for the local "newsticker" database.
# NOTE(review): `psql` is rebound to a psycopg connection in the following
# cell, so this engine appears unused in the visible notebook — confirm whether
# this cell can be dropped.
from sqlalchemy import create_engine
psql = create_engine('postgresql:///newsticker')

In [29]:
# Open a psycopg connection to the "newsticker" database; the cells below
# run their statements on it and commit explicitly.
import psycopg
psql = psycopg.connect("dbname=newsticker")

In [30]:
# Vector-Extension und Tabelle anlegen
psql.execute("CREATE EXTENSION vector")        
psql.execute("CREATE TABLE embeddings (word TEXT NOT NULL PRIMARY KEY, embedding VECTOR(300))")
psql.commit()

In [31]:
from tqdm.auto import tqdm

# Insert every word with its vector, serialized as a pgvector text literal
# ("[x1, x2, ...]").
# .tolist() yields plain Python floats. str(list(ndarray)) would instead embed
# the NumPy scalar repr, which under NumPy >= 2 is "np.float32(...)" and is
# not a valid vector literal.
for word in tqdm(w2v.index_to_key, total=len(w2v)):
    psql.execute(
        "INSERT INTO embeddings (word, embedding) VALUES (%s, %s)",
        (word, str(w2v[word].tolist())),
    )
psql.commit()

  0%|          | 0/393997 [00:00<?, ?it/s]

In [32]:
%%time
# Fetch the stored row (word, embedding) for "linux".
linux_cursor = psql.execute(
    "SELECT * FROM embeddings WHERE word=%s",
    ("linux",),
)
r = linux_cursor.fetchone()
r

CPU times: user 3.38 ms, sys: 0 ns, total: 3.38 ms
Wall time: 221 ms


('linux',
 '[0.9589558,0.19947146,-0.4096641,0.017722009,-1.248387,-0.5330077,1.075491,-0.06663409,-0.35737067,-0.5795995,0.110878274,0.23789768,-0.63666457,-0.5599161,0.24903178,-1.0020274,0.69734097,-0.23216513,0.84816086,-0.80528027,-0.8973152,0.52024066,0.23125485,-1.0612818,-1.0891576,-1.1822853,-1.281176,0.50714666,0.99066544,0.15231892,-0.83849996,0.07902937,1.1686418,0.018616887,0.7951601,-0.9614738,0.57500136,0.82044953,-0.3075975,-1.0482547,0.30146852,-1.6498418,-0.026289744,-0.043323234,0.45177528,0.029446198,-0.6337627,0.31244415,0.59559476,-0.23203942,-0.5385304,0.24233422,0.12640923,-0.620869,-0.28922755,0.09376014,-1.6695409,-0.17653067,0.34263787,-0.7501536,0.7521111,0.43047214,-0.3684961,-0.38368693,-1.3479874,0.35782832,0.32592556,0.3600103,0.13949881,0.38488752,-0.07509829,0.71128356,-0.35133773,-0.6554507,-1.0370524,0.37548238,-1.1510168,-1.1007159,-0.5090775,1.6262338,-0.6459884,-0.21367016,0.11266727,1.1709536,-0.9748765,0.059127357,0.9341988,1.4555093,0.58458704,

In [33]:
%%time
# Ten closest words to the embedding fetched in `r`, ordered by the
# `<=>` distance operator.
neighbour_cursor = psql.execute(
    "SELECT word FROM embeddings ORDER BY embedding <=> %s LIMIT 10",
    (r[1],),
)
r2 = neighbour_cursor.fetchall()
r2

CPU times: user 2.41 ms, sys: 0 ns, total: 2.41 ms
Wall time: 287 ms


[('linux',),
 ('linux_bsd',),
 ('unix-derivate',),
 ('openvms',),
 ('suse_downloadseite',),
 ('motif',),
 ('gnu_linux',),
 ('linux-variant',),
 ('unix-ähnlich_betriebssystem',),
 ('linux-distribution',)]

In [34]:
%%time
# Build an HNSW index over the embedding column using cosine ops, then commit.
psql.execute(
    "CREATE INDEX ON embeddings USING hnsw (embedding vector_cosine_ops)"
)
psql.commit()

CPU times: user 1.82 s, sys: 708 ms, total: 2.52 s
Wall time: 35min 30s


In [35]:
# Benchmark: look up every word's row by primary key.
# Bind the loop variable `w`. The original bound `word`, a leftover from the
# insert cell's loop, so every iteration queried the same word.
for w in tqdm(w2v.index_to_key, total=len(w2v.index_to_key)):
    r = psql.execute("SELECT * FROM embeddings WHERE word=%s", (w,)).fetchone()

  0%|          | 0/393997 [00:00<?, ?it/s]

In [51]:
# Benchmark: run a 10-nearest-neighbour query for every stored vector.
for v in tqdm(w2v.vectors, total=len(w2v.vectors)):
    vector_literal = str(v.tolist())
    r2 = psql.execute(
        "SELECT * FROM embeddings ORDER BY embedding <=> %s LIMIT 10",
        (vector_literal,),
    ).fetchall()

  0%|          | 0/393997 [00:00<?, ?it/s]

In [None]:
# Close the database connection once the experiments are done.
psql.close()