# Putting everything together

In the previous two steps we retrieved the raw data and assessed its quality. Now, moving towards a hypothetical deployment, we switch over to a vector database to store the embeddings and their related metadata.

In [None]:
# required libraries
from github import Github
import os
import pandas as pd
import numpy as np
import pickle

# Qdrant client to access the vector DB
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# Some of the pandas code in this notebook is not optimal yet; ignore pandas'
# PerformanceWarning for now.
# NOTE: the filter below only targets the pd.errors.PerformanceWarning category.
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

## Baseline data and model we use to perform similarity search

In [None]:
# model used to encode the embeddings
# NOTE: make sure it's the same model used in the previous steps
model = SentenceTransformer('all-MiniLM-L6-v2')

# load the data we prepared in the previous step
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced by a trusted source.
# The context managers close the file handles as soon as the data is read
# (the bare open(...) calls left them open until garbage collection).
with open("../data/raw/labels.pkl", 'rb') as labels_file:
    unique_labels = pickle.load(labels_file)
with open("../data/deploy/issues.pkl", 'rb') as issues_file:
    deployment_issues = pickle.load(issues_file)

## Create DB collection and initialise client

In [None]:
# for now the database lives in memory; we can easily switch to Qdrant Cloud
# or a self-managed instance later
qdrant = QdrantClient(":memory:")

# (re)create the collection on the database;
# the vector size is dictated by the embedding model and has to match it
embedding_dim = model.get_sentence_embedding_dimension()
qdrant.recreate_collection(
    collection_name="github_issues",
    vectors_config=models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
)


## Store the newly computed embeddings together with the metadata

In [None]:
# defaultdict is not used in this cell; the import is left untouched in case
# other cells rely on it
from collections import OrderedDict, defaultdict

# Encode all issue bodies with a single batched call instead of invoking
# model.encode() once per row inside the comprehension; encode() accepts a
# list of sentences and returns one embedding per input, in input order.
issue_vectors = model.encode(deployment_issues["clean_body"].tolist())

# One record per issue: the DataFrame index becomes the point id, the
# embedding is the vector and the complete row is stored as payload.
qdrant.upload_records(
    collection_name="github_issues",
    records=[
        models.Record(
            id=idx,
            vector=vector.tolist(),
            payload=row.to_dict(into=OrderedDict)
        ) for (idx, row), vector in zip(deployment_issues.iterrows(), issue_vectors)
    ]
)

## Query the vector database

In [None]:
# embed the free-text query with the same model and ask the collection for
# at most 5 matching issues
query_text = "I get the following error when using Quarkus dev mode"
hits = qdrant.search(
    collection_name="github_issues",
    query_vector=model.encode(query_text).tolist(),
    limit=5,
)

# print title, labels and score of every returned match
for match in hits:
    issue = match.payload
    print(issue['title'], "labels:", issue['labels'], "score:", match.score)