In [37]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from transformers import pipeline, __version__ as tfs_version

# Display the installed transformers version (imported above as tfs_version).
tfs_version

'4.37.0.dev0'

In [38]:
# Feature-extraction pipeline backed by the CodeBERT base checkpoint.
pipe = pipeline(
    task="feature-extraction",
    model="microsoft/codebert-base",
)

In [39]:
# Client for the Qdrant instance on localhost:6333. The very large timeout
# keeps slow requests from being cut off.
qdrant = QdrantClient(
    location="localhost",
    port=6333,
    timeout=100_000,
)

In [40]:
# Load the table of code snippets together with their stored embedding vectors.
df = pd.read_parquet(path="output_data/dataframe_text_embeddings.parquet")

In [41]:
# Store the dates as plain strings (they are later written into point payloads),
# then preview the first rows.
date_strings = df["date"].astype("str")
df["date"] = date_strings
df.head(n=5)

Unnamed: 0,text,vector,file_name,author,subject,code_language,lines_of_code,date
0,class Solution(object):\n def fibonacci(sel...,"[-0.20388200879096985, 0.006657123565673828, -...",ClimbingStairs.py,Alejandro Pérez García,Bases de Datos y Administración,Python,15,2023-10-03 09:15:00
1,"def calculate_area(height, l, r):\n return ...","[-0.17179635167121887, -0.12407292425632477, -...",Containers.py,Ana Martínez Fernández,Programación Orientada a Objetos (POO),Python,28,2023-10-09 14:30:00
2,class Solution(object):\n def containsDupli...,"[-0.24347487092018127, -0.02925797551870346, -...",ContainsDuplicate.py,Juan García Sánchez,Desarrollo Web Avanzado,Python,13,2023-10-15 18:45:00
3,"class Solution:\n def rob(self, nums) -> in...","[-0.11747059971094131, -0.04676730930805206, -...",HouseRobberDynProg.py,Ana Martínez Fernández,Programación en Sistemas Embebidos,Python,15,2023-10-21 11:20:00
4,# Definition for singly-linked list.\n# class ...,"[-0.21408772468566895, -0.02248210459947586, -...",LinkedListCycle.py,Ana Martínez Fernández,Desarrollo Web Avanzado,Python,24,2023-11-05 20:00:00


In [42]:
# Name of the Qdrant collection used by the cells below.
collection_name = "code"
# Embedding dimensionality, read from the first row's "vector" entry.
# The column is looked up by name rather than with `df.iloc[0][1]`: integer
# keys on a Series indexed by column names rely on a positional fallback that
# pandas deprecates, and the result silently depended on column order.
vector_len = len(df["vector"].iloc[0])
vector_len

768

In [43]:
# Recreate the collection with vectors of the detected size, compared by
# cosine distance, using 6 shards and a replication factor of 2.
embedding_params = VectorParams(size=vector_len, distance=Distance.COSINE)
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=embedding_params,
    shard_number=6,
    replication_factor=2,
)

True

In [44]:
# Number of rows sent to Qdrant per upsert request.
batch_size = 100
row_count = len(df.index)
# Columns copied into each point's payload, in the order they are written.
payload_fields = [
    "text",
    "author",
    "subject",
    "code_language",
    "lines_of_code",
    "date",
    "file_name",
]

for i in range(0, row_count, batch_size):
    # The last batch may be shorter than batch_size.
    pos_final = min(i + batch_size, row_count)
    chunk = df.iloc[i:pos_final]

    # One point per row: the DataFrame index label becomes the point id.
    batch_points = []
    for idx, row in chunk.iterrows():
        payload = {field: row[field] for field in payload_fields}
        batch_points.append(
            PointStruct(
                id=idx,
                vector=row["vector"].tolist(),
                payload=payload,
            )
        )

    qdrant.upsert(collection_name=collection_name, points=batch_points)

In [45]:
# Inspect the collection's status, point counts and configuration.
qdrant.get_collection(collection_name)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=13, indexed_vectors_count=0, points_count=13, segments_count=48, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=6, replication_factor=2, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})

In [46]:
# Use the embedding stored at index label 0 as the query vector, converted
# to a plain Python list.
search_vector = df.loc[0, "vector"].tolist()

In [47]:
# Similarity search limited to a single result; keep that hit and show it
# as a plain dict.
search_hits = qdrant.search(
    collection_name=collection_name,
    query_vector=search_vector,
    limit=1,
)
scored_point = search_hits[0]
scored_point.model_dump()

{'id': 0,
 'version': 0,
 'score': 1.0,
 'payload': {'author': 'Alejandro Pérez García',
  'code_language': 'Python',
  'date': '2023-10-03 09:15:00',
  'file_name': 'ClimbingStairs.py',
  'lines_of_code': 15,
  'subject': 'Bases de Datos y Administración',
  'text': 'class Solution(object):\n    def fibonacci(self, n):\n        """\n        :type n: int\n        :rtype: int\n        """\n        if n == 0 or n == 1:\n            return 1\n        prev, curr = 1, 1\n        for _ in range(2, n + 1):\n            temp = curr\n            curr = prev + curr\n            prev = temp\n        return curr\n'},
 'vector': None}

In [48]:
# print() shows the stored snippet with real line breaks instead of "\n" escapes.
print(scored_point.payload["text"])

class Solution(object):
    def fibonacci(self, n):
        """
        :type n: int
        :rtype: int
        """
        if n == 0 or n == 1:
            return 1
        prev, curr = 1, 1
        for _ in range(2, n + 1):
            temp = curr
            curr = prev + curr
            prev = temp
        return curr

