In [1]:
import requests
import json
import pandas as pd
import numpy as np
import faiss
import re
from pymystem3 import Mystem
from rank_bm25 import BM25Okapi

In [2]:
mystem = Mystem()
with open("russian.txt", "r", encoding="utf-8") as f:
    stop_words = set([line.strip().lower() for line in f if line.strip()])

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^а-я\s]', ' ', text)
    lemmas = mystem.lemmatize(text)
    words = [w for w in lemmas if w.strip() and w not in stop_words]
    return words

In [3]:
API_URL = "https://llm.t1v.scibox.tech/v1/embeddings"
API_KEY = "sk-h7YcuRfbDw_2fPiuZzz06w"
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

In [None]:
df = pd.read_csv(r"D:\Code\HAHATON\new_hahaton.csv")

In [None]:
if 'processed_question' not in df.columns:
    df['processed_question'] = df['question'].apply(preprocess_text)

In [None]:
embeddings_array = np.array(df['emb_question'].tolist(), dtype='float32')
d = embeddings_array.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings_array)

In [None]:
user_query = "А если я живу за границей, смогу ли я хоть как-то зарегистрироваться?"

In [None]:
data = {"model": "bge-m3", "input": user_query}
response = requests.post(API_URL, headers=headers, json=data)
vect = np.array(response.json()["data"][0]["embedding"], dtype='float32').reshape(1, -1)

k = 10
distances, indices = index.search(vect, k)
faiss_scores = 1 / (1 + distances[0]) 
faiss_results = df.iloc[indices[0]].copy()
faiss_results['score'] = faiss_scores

In [None]:
bm25_corpus = df['processed_question'].tolist()
bm25 = BM25Okapi(bm25_corpus)
query_words = preprocess_text(user_query)
bm25_scores = bm25.get_scores(query_words)

bm25_results = df.copy()
bm25_results['score'] = bm25_scores

In [None]:
alpha = 0.7  # вес FAISS
beta = 0.3   # вес BM25

combined = df.copy()
combined['combined_score'] = alpha * faiss_results['score'] + beta * bm25_results['score']

In [None]:
final_results = combined.sort_values(by='combined_score', ascending=False).head(10)
for idx, row in final_results.iterrows():
    print(f"Category: {row['category']}, Question: {row['question']}, Combined score: {row['combined_score']:.4f}")

KeyError: 'emb_question'