In [2]:
import json
import logging
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from pathlib import Path
from typing import Any, Dict, List

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import SparseVectorParams, Modifier
from qdrant_client.models import Distance, VectorParams
from dotenv import load_dotenv
from fastembed import SparseTextEmbedding

from src.config.settings import Config
# from notebooks.parse_md_to_json_old import parse_entry_v1

load_dotenv()
openai_client = OpenAI()
config = Config()

LEVEL = 1

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

# prefer_grpc is set True to avoid timeout error
client = QdrantClient(
    host=config.qdrant_host,
    port=config.qdrant_port,
    # prefer_grpc=True
)

bm25_embedding_model = SparseTextEmbedding(config.sparse_embedding_model)

In [3]:
def get_embedding(text: str) -> List[float]:
    """Generate embedding vector from OpenAI."""
    try:
        response = openai_client.embeddings.create(
            model= config.embedding_model,
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return [0] * 1536  # Return zero vector on error

def create_qdrant_collection(collection_name: str) -> None:
    """Create a Qdrant collection if it doesn't exist."""
    # List existing collections
    # Create a collection if it doesn't exist
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                config.embedding_model: VectorParams(
                    size=1536,
                    distance=Distance.COSINE,
                    on_disk=True
                ),
            },
            sparse_vectors_config={
                config.sparse_embedding_model: SparseVectorParams(modifier=Modifier.IDF) # INFO has GRPC version for Modifier
            },
            # INFO Set up a quantization for Droplet due to lack of RAM
            # INFO Check out https://qdrant.tech/documentation/guides/optimize/ for additional information
            # quantization_config=models.ScalarQuantization(
            #     scalar=models.ScalarQuantizationConfig(
            #         type=models.ScalarType.INT8,
            #         always_ram=True,
            #     ),
            # ) if quantization else None
        )
        logger.info(f"Collection {collection_name} created")
    else:
        logger.info(f"Collection {collection_name} already exists")

### For V0 grammar (in JSON formats)

In [None]:
def reformat_for_embedding(entry: dict) -> str:
    """
    Reformat a single JSON entry into a single string for embedding.
    """
    parts = []

    # Include grammar names if available
    if "grammar_name_kr" in entry:
        parts.append(f"НАЗВАНИЕ НА КОРЕЙСКОМ: {entry['grammar_name_kr']}")
    if "grammar_name_rus" in entry:
        parts.append(f"НАЗВАНИЕ НА РУССКОМ: {entry['grammar_name_rus']}")

    # Include level information (optional)
    level_mapping = {
        1: "Начинающий",
        2: "Базовый",
        3: "Средний",
        4: "Выше среднего",
        5: "Продвинутый",
        6: "Экспертный"
    }

    if "level" in entry:
        level_value = entry.get("level")
        level_name = level_mapping.get(level_value, f"Level {level_value}")
        parts.append(f"Level: {level_name} ({level_value})")

    # Append description
    if "description" in entry and entry["description"]:
        parts.append(f"ОПИСАНИЕ: {entry['description']}")

    # Append usage form
    if "usage_form" in entry and entry["usage_form"]:
        parts.append(f"ФОРМА: {entry['usage_form']}")

    # Append examples
    if "examples" in entry and entry["examples"]:
        for idx, example in enumerate(entry["examples"], start=1):
            korean = example.get("korean", "")
            russian = example.get("russian", "")
            parts.append(f"ПРИМЕР {idx}: НА КОРЕЙСКОМ: {korean} | НА РУССКОМ: {russian}")

    # Append notes
    if "notes" in entry and entry["notes"]:
        # Join notes with a semicolon for clarity
        notes_combined = "; ".join(entry["notes"])
        parts.append(f"ПРИМЕЧАНИЯ: {notes_combined}")

    # TODO: Add irregular verbs examples
    # Combine all parts into one final string separated by newlines
    return "\n".join(parts)


def load_json_entries(dir_path: str) -> List[Dict[str, Any]]:
    """Load all JSON grammar entries from a directory."""
    entries = []
    path = Path(dir_path)

    # If path is a file, and it's a combined JSON file
    if path.is_file() and path.name.endswith('.json'):
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            if isinstance(data, list):
                return data
            else:
                return [data]

    return entries

In [None]:
COLLECTION_NAME = config.qdrant_collection_name
create_qdrant_collection(COLLECTION_NAME)

all_entries_json_file = Path("data/grammar-level-1/entries.json")

if all_entries_json_file.exists():
    entries = load_json_entries(str(all_entries_json_file))
    print(f"{len(entries)} grammar entries to upload")
else:
    print("Please run parse_md_to_json.py first to generate JSON files.")
    exit()
    
# Generate embeddings and create points
points = []
for i, entry in enumerate(entries):

    formatted_entry = reformat_for_embedding(entry)
    vector = get_embedding(formatted_entry)
    sparse_vector = next(bm25_embedding_model.embed(formatted_entry)).as_object()
    
    points.append(models.PointStruct(
        id=i,
        vector={
            config.embedding_model: vector,
            config.sparse_embedding_model: sparse_vector
        },
        payload=entry
    ))
    
print(f"Generated {len(points)} points")

### For V1 grammars if stored in MD format

In [None]:
def reformat_for_embedding(entry: dict) -> str:
    return f"Грамматика {entry['grammar_name_kr']} - {entry['grammar_name_rus']}: {entry['description']}"

def load_md_entries(dir_path: Path) -> List[str]:
    """Load all MD grammar entries from a directory"""
    content_list = [file.read_text(encoding='utf-8') for file in dir_path.glob("*.md")]
    return content_list

In [None]:
COLLECTION_NAME = config.qdrant_collection_name_v2
create_qdrant_collection(COLLECTION_NAME)

all_entries_md_folder = Path("data/grammar-level-1/entries_md/")

if all_entries_md_folder.exists():
    entries = load_md_entries(all_entries_md_folder)
    print(f"{len(entries)} grammar entries to upload")
else:
    print("Please run parse_md_to_json.py first to generate JSON files.")
    exit()
    
# Generate embeddings and create points
points = []
for i, entry in enumerate(entries):

    parsed_entry = parse_entry_v1(entry) # Create disctionary 
    formatted_entry = reformat_for_embedding(parsed_entry) # Select only grammar name and description for embedding

    vector = get_embedding(formatted_entry)
    sparse_vector = next(bm25_embedding_model.embed(formatted_entry)).as_object()
    
    grammar_name = f"{parsed_entry['grammar_name_kr']} - {parsed_entry['grammar_name_rus']}"
    payload = {
        "grammar_name": grammar_name,
        "level" : LEVEL,
        "content": entry
    }
    
    points.append(models.PointStruct(
        id=i,
        vector={
            config.embedding_model: vector,
            config.sparse_embedding_model: sparse_vector
        },
        payload=payload
    ))
    
print(f"Generated {len(points)} points")

## For V2 grammar points from CSV clean grammars

In [152]:
import pandas as pd

clean_grammars = pd.read_pickle("../data/grammar-level-1/v2/grammar_list_clean_word2md.pkl")
clean_grammars.head()

Unnamed: 0_level_0,grammar_name_kr,grammar_name_rus,level,related_grammars,content
grammar_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
602036e488ddf85722f218e49087c45a,이/가,именительный падеж,1,"[은/는, 께서]",**Описание:**\nЧастицы **이/가** обозначают **им...
bd5253ce8e0b11a7ee24bce4850cb389,와/과,"«и», перечисление существительных",1,"[하고, (이)랑]",**Описание:**\nИспользуется для перечисления п...
27bcb1cd7ba042c6f7214fdd1ea1cf6c,와/과,"«с», совместное действие",1,"[하고, (이)랑]","**Описание:**\nУказывает на лицо или объект, с..."
2749970a6a7f10f83726d189e561fe78,까지,«до»,1,"[부터, 에서 «из»]",**Описание:**\nЧастица **까지** используется для...
c7c87e5469bf9ac9161ae90d5bf3e702,께서,именительный падеж (вежл.),1,"[이/가, 께, 께서는, -(으)시-, -(으)세요]",**Описание:**\nЭто **вежливая форма** именител...


In [None]:
def parse_entry_for_embedding(entry) -> str:
    """
    Convert a single row of grammar dataframe into a string for embedding
    """
    grammar_name_kr = entry["grammar_name_kr"]
    grammar_name_rus = entry["grammar_name_rus"]
    content_lines = entry["content"].split("**Форма:**")[0].replace("\n", " ").strip()

    text_to_embed = f"Название грамматики: {grammar_name_kr} - {grammar_name_rus}\n{content_lines}"
    return text_to_embed

def create_qdrant_point(entry: pd.Series) -> models.PointStruct:

    text_to_embed = parse_entry_for_embedding(entry)

    vector = get_embedding(text_to_embed)
    sparse_vector = next(bm25_embedding_model.embed(text_to_embed)).as_object()

    payload = {
        "grammar_name_kr": entry["grammar_name_kr"],
        "grammar_name_rus": entry["grammar_name_rus"],
        "level" : LEVEL,
        "related_grammars" : entry["related_grammars"],
        "content": entry["content"]
    }

    return models.PointStruct(
        id=entry.name,
        vector={
            config.embedding_model: vector,
            config.sparse_embedding_model: sparse_vector
        },
        payload=payload
    )

In [183]:
entry = clean_grammars.iloc[0]
create_qdrant_point(entry)

PointStruct(id='602036e488ddf85722f218e49087c45a', vector={'text-embedding-3-small': [-0.003266491461545229, 0.01400352455675602, -0.06431248039007187, 0.008063995279371738, 0.02653089351952076, 0.00947531871497631, -0.025453699752688408, 0.002423686906695366, 0.017394691705703735, -0.002977245021611452, 0.0020721026230603456, -0.008383164182305336, 0.017644042149186134, -0.04047457501292229, -0.01367438118904829, 0.012437602505087852, -0.019369546324014664, -0.013584615662693977, -0.016736404970288277, 0.03967665135860443, 0.0137242516502738, -0.003004673635587096, 0.010063786059617996, -0.0019848300144076347, 0.00741069670766592, -0.010791889391839504, 0.007724878378212452, 0.02669047750532627, 0.046917788684368134, 0.0026431153528392315, 0.043566521257162094, -0.014851315878331661, 0.01598835363984108, -0.03975644335150719, -0.00511667225509882, 0.010821811854839325, -0.013734225183725357, 0.03315364196896553, -0.01890076883137226, -0.021045181900262833, 0.006338489707559347, 0.0219

In [185]:
points_series = clean_grammars.apply(create_qdrant_point, axis=1)
points = points_series.to_list()

print(f"Generated {len(points)} points")

## Ingest points to the vector database

In [201]:
COLLECTION_NAME = config.qdrant_collection_name_v2
create_qdrant_collection(COLLECTION_NAME)

client.upsert(
    collection_name=COLLECTION_NAME,
    points=points
)

print(f"Upload complete. {len(points)} entries added to {COLLECTION_NAME} collection.")
print("You can now query the collection using the Qdrant client.")

Collection korean_grammar_v2 already exists
Collection korean_grammar_v2 already exists
Collection korean_grammar_v2 already exists
Collection korean_grammar_v2 already exists
Collection korean_grammar_v2 already exists
Collection korean_grammar_v2 already exists


## Testing out the Retrieval

In [41]:
from sentence_transformers import CrossEncoder
from qdrant_client.http.models import Prefetch, SparseVector, FusionQuery, Fusion


bm_threshold = 0
vector_threshold = 0
retrieve_top_k: int = 15
rerank_top_k: int = 5

# Set the necessary deps
sparse_embedding = SparseTextEmbedding(model_name=config.sparse_embedding_model)
openai_client = OpenAI()
qdrant_client = QdrantClient(
    host=config.qdrant_host,
    port=config.qdrant_port,
)
reranking_model = CrossEncoder(config.reranking_model)

In [68]:
search_query = "까지"

# Create embeddings
vector_query = openai_client.embeddings.create(model=config.embedding_model, input=search_query)
vector_query = vector_query.data[0].embedding

sparse_vector_query = next(sparse_embedding.query_embed(search_query))
sparse_vector_query = SparseVector(**sparse_vector_query.as_object())


# Set up the Hybrid search prefetches
bm_25_prefetch = Prefetch(
    query=sparse_vector_query,
    using=config.sparse_embedding_model,
    limit=retrieve_top_k,
    score_threshold=bm_threshold,
)

dense_prefetch = Prefetch(
    query=vector_query,
    using=config.embedding_model,
    limit=retrieve_top_k,
    score_threshold=vector_threshold,
)

In [69]:
from pprint import pprint

# Use hybrid search with bm25 amd OpenAI embeddings with RRF
hits = qdrant_client.query_points(
    collection_name=config.qdrant_collection_name_v2,
    prefetch=[bm_25_prefetch, dense_prefetch],
    query=FusionQuery(fusion=Fusion.RRF),
    with_payload=True,
).points

pprint(f"Received {len(hits)} results from Qdrant.")

'Received 10 results from Qdrant.'


In [74]:
from src.schemas.schemas import RetrievedDoc, GrammarEntryV2

# Convert to schema objects
docs = [
    RetrievedDoc(
        id=hit.id,
        content=GrammarEntryV2(**hit.payload),
        score=hit.score,
    )
    for hit in hits
]

# cross_input = []
# for doc in docs:
#     doc_data = doc.content.grammar_name_kr
#     cross_input.append([search_query, doc_data])
#
# scores = reranking_model.predict(cross_input)
#
# # Add cross-encoder scores to docs
# for idx in range(len(scores)):
#     docs[idx].cross_score = float(scores[idx])
#     pprint(f"Document {idx} reranking: {docs[idx].score:.4f} -> {scores[idx]:.4f}")
#
# # Sort by cross-encoder score
# reranked_docs = sorted(docs, key=lambda x: x.cross_score, reverse=True)

# result = [doc.content for doc in reranked_docs[:rerank_top_k]]

result = [doc.content for doc in docs]

In [75]:
for i, doc in enumerate(docs):
    pprint(f"{doc.content.grammar_name_kr} - {doc.content.grammar_name_rus}: {doc.score}")

'까지 - «до»: 1.0'
'-기 전에 - «до того как…»: 0.33333334'
'V + -는 동안(에) - «в течение…, пока…»: 0.25'
'V + -(으)ㄴ 지 - «с тех пор как», «после того как прошло…»: 0.2'
'부터 - «с (какого-то времени)»: 0.16666667'
'-다가 - «в то время как», «пока», «и вдруг»: 0.14285715'
'에서부터 - «с», «из»: 0.125'
'~(으)ㄴ 후에 - «после того как»: 0.11111111'
'께 - дательный падеж (вежл.): 0.1'
'~고 있다 - длительность происходящего: 0.09090909'


In [76]:
# for i, doc in enumerate(reranked_docs):
#     pprint(f"{doc.content.grammar_name_kr} - {doc.content.grammar_name_rus}: {doc.cross_score:.3f}")

In [77]:
result[2]

GrammarEntryV2(grammar_name_kr='V + -는 동안(에)', grammar_name_rus='«в течение…, пока…»', level=1, content='**Описание:**\nГрамматическая конструкция **-는 동안(에)** указывает на одновременность действий или событий, происходящих в течение определённого времени. Часто переводится как «пока», «в то время как», «в течение». Если используется с существительным, то употребляется форма **N + 동안(에)**.\n\n**Форма:**\n\nНастоящее время:\n\nоснова глагола + -는 동안(에)\n\nсуществительное + 동안(에)\n\nЧастица **에** не обязательна и может опускаться.\n\n**Примеры:**\n**밥을 먹는 동안** TV를 봤어요.\nСмотрел телевизор, пока ел.\n\n**비가 오는 동안에** 우리는 카페에 있었어요.\nПока шёл дождь, мы были в кафе.\n\n**휴가 동안** 여행을 많이 했어요.\nВо время отпуска я много путешествовал.\n\n**수업 시간 동안** 조용히 해야 해요.\nВо время урока нужно соблюдать тишину.\n\n**Использование с нерегулярными глаголами:**\n**만들다 (делать)** → **만드는 동안(에)** (пока делает)\n**듣다 (слушать)** → **듣는 동안(에)** (пока слушает)\n**돕다 (помогать)** → **돕는 동안(에)** (пока помогает)\n**덥다 

In [78]:
len(result)

10