In [32]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from transformers import pipeline, __version__ as tfs_version

# Show the installed transformers version (imported above as tfs_version)
# so the environment used to produce the embeddings is recorded in the output.
tfs_version

'4.29.2'

In [2]:
# Connection and model settings gathered in one place so they are easy to tune.
EMBEDDING_MODEL = "microsoft/codebert-base"
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Feature-extraction pipeline backed by the CodeBERT checkpoint.
pipe = pipeline("feature-extraction", model=EMBEDDING_MODEL)

# Client for the Qdrant instance that stores the code embeddings.
qdrant = QdrantClient(QDRANT_HOST, port=QDRANT_PORT)

In [29]:
# Parquet file holding the code snippets, their embeddings and metadata.
embeddings_path = "output_data/dataframe_text_embeddings.parquet"
df = pd.read_parquet(embeddings_path)

In [13]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,text,vector,author,subject,code_language,lines_of_code,date
0,class Solution(object):\n def climbStairs(s...,"[-0.21761193871498108, -0.006907954812049866, ...",Ana Martínez Fernández,Desarrollo Web Avanzado,Python,15,2023-10-03 09:15:00
1,"def calculate_area(height, l, r):\n return ...","[-0.17671486735343933, -0.11769969761371613, -...",Ana Martínez Fernández,Estructuras de Datos y Algoritmos,Python,28,2023-10-09 14:30:00
2,"class Solution:\n def rob(self, nums) -> in...","[-0.11747059971094131, -0.04676730930805206, -...",Ana Martínez Fernández,Estructuras de Datos y Algoritmos,Python,15,2023-10-15 18:45:00
3,# Definition for singly-linked list.\n# class ...,"[-0.21408772468566895, -0.02248210459947586, -...",Alejandro Pérez García,Bases de Datos y Administración,Python,24,2023-10-21 11:20:00
4,class Solution:\n def longestPalindrome(sel...,"[-0.35301244258880615, -0.0692468136548996, -0...",María Rodríguez López,Programación Orientada a Objetos (POO),Python,28,2023-11-05 20:00:00


In [14]:
# Name of the Qdrant collection that will hold the code embeddings.
collection_name = "code"

# Length of a single embedding, taken from the first row. The column is
# selected by name rather than by position so the lookup does not depend on
# column order and does not rely on integer keys being treated as positions
# on a label-indexed Series.
vector_len = len(df["vector"].iloc[0])
vector_len

768

In [15]:
# (Re)create the target collection with cosine distance.
# The vector size is the embedding length plus one extra dimension that
# carries the snippet's lines_of_code value.
try:
    qdrant.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_len + 1,  # +1 slot for lines of code
            distance=Distance.COSINE,
        ),
        replication_factor=2,
        shard_number=6,
    )
except Exception as exc:
    # Stay best-effort (do not abort the notebook), but never fail silently:
    # a swallowed error here would only resurface later as a confusing
    # failure when upserting into a collection that was not created.
    print(f"Could not recreate collection {collection_name!r}: {exc!r}")

In [30]:
# Upload the dataframe to Qdrant in fixed-size batches.
batch_size = 100
total_rows = len(df.index)

# Metadata columns copied verbatim into each point's payload.
payload_fields = (
    "text",
    "author",
    "subject",
    "code_language",
    "lines_of_code",
    "date",
)

for start in range(0, total_rows, batch_size):
    stop = min(start + batch_size, total_rows)
    batch = df.iloc[start:stop]

    points = []
    for idx, row in batch.iterrows():
        # Stored vector = embedding followed by the lines_of_code value.
        # NOTE(review): lines_of_code is appended unscaled; confirm that its
        # magnitude relative to the embedding components is intended.
        full_vector = row["vector"].tolist() + [row["lines_of_code"]]
        payload = {field: row[field] for field in payload_fields}
        points.append(PointStruct(id=idx, vector=full_vector, payload=payload))

    qdrant.upsert(collection_name=collection_name, points=points)

In [18]:
# Inspect the collection (status, point counts, vector config) after the upload.
qdrant.get_collection(collection_name=collection_name)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=12, indexed_vectors_count=0, points_count=12, segments_count=48, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=769, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=6, replication_factor=2, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})

In [27]:
# Build the query from the row labelled 0: its embedding followed by that
# row's own lines_of_code, the same layout used for the stored points.
# Reading the value from the row avoids a hard-coded constant that would go
# stale if the data changes.
query_row = df.loc[0]
search_vector = query_row["vector"].tolist() + [int(query_row["lines_of_code"])]

In [31]:
# Ask Qdrant for the single closest point to search_vector and show it.
hits = qdrant.search(
    collection_name=collection_name,
    query_vector=search_vector,
    limit=1,
)
scored_point = hits[0]
scored_point.model_dump()

{'id': 0,
 'version': 3,
 'score': 0.99999976,
 'payload': {'author': 'Ana Martínez Fernández',
  'code_language': 'Python',
  'date': '2023-10-03T09:15:00',
  'lines_of_code': 15,
  'subject': 'Desarrollo Web Avanzado',
  'text': 'class Solution(object):\n    def climbStairs(self, n):\n        """\n        :type n: int\n        :rtype: int\n        """\n        if n == 0 or n == 1:\n            return 1\n        prev, curr = 1, 1\n        for _ in range(2, n + 1):\n            temp = curr\n            curr = prev + curr\n            prev = temp\n        return curr\n'},
 'vector': None}

In [10]:
# Print the matched snippet so its newlines render as real line breaks.
matched_code = scored_point.payload["text"]
print(matched_code)

class Solution(object):
    def climbStairs(self, n):
        """
        :type n: int
        :rtype: int
        """
        if n == 0 or n == 1:
            return 1
        prev, curr = 1, 1
        for _ in range(2, n + 1):
            temp = curr
            curr = prev + curr
            prev = temp
        return curr

