In [1]:
!pip install nltk sentence-transformers emoji langdetect

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-no

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string
import emoji
from sentence_transformers import SentenceTransformer
from langdetect import detect, DetectorFactory
from collections import Counter

DetectorFactory.seed = 0

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stemmer_ru = SnowballStemmer('russian')
stemmer_en = SnowballStemmer('english')

stop_words_russian = set(stopwords.words('russian'))
stop_words_english = set(stopwords.words('english'))

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def preprocess_text(text, lang):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = emoji.demojize(text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    if lang == 'russian':
        stemmer = stemmer_ru
        stop_words = stop_words_russian
    else:
        stemmer = stemmer_en
        stop_words = stop_words_english

    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and len(word) > 1]

    normalized_text = ' '.join(tokens)
    return normalized_text

def get_embedding(text):
    return model.encode(text)

def generate_hashtags(text):
    words = text.split()
    hashtags = ['#' + word for word in words if len(word) > 1]
    return hashtags

def process_query(query):
    lang = detect(query)
    normalized_query = preprocess_text(query, lang=lang)
    embedding = get_embedding(normalized_query)
    hashtags = generate_hashtags(normalized_query)

    return {
        'original_query': query,
        'embedding': embedding,
        'hashtags': hashtags
    }

In [12]:
if __name__ == "__main__":
    user_query = "Пример пользовательского запроса с эмодзи 😊 и другими символами!!!"
    result = process_query(user_query)

    print("Оригинальный запрос:", result['original_query'])
    print("Эмбеддинг запроса:", result['embedding'])
    print("Сгенерированные хэштеги:", result['hashtags'])

Оригинальный запрос: Пример пользовательского запроса с эмодзи 😊 и другими символами!!!
Эмбеддинг запроса: [-4.31871414e-02  9.40642506e-02  7.78497290e-03 -2.47389656e-02
 -3.11436169e-02 -1.78893581e-02  1.29410326e-01  1.89602189e-02
 -9.10590217e-02  1.89000349e-02  8.67307261e-02 -3.76375467e-02
  8.42680782e-02 -2.20839493e-02  4.70088683e-02 -3.09688840e-02
 -6.79204566e-03  3.49285081e-02  1.47510683e-02  4.01400998e-02
  7.46454298e-03 -1.51913082e-02  4.64295745e-02 -2.60953419e-02
 -5.83557636e-02  3.15307677e-02  5.37789539e-02  3.40656824e-02
  3.88011821e-02  2.84297112e-02  6.58702552e-02 -1.95414480e-02
  1.30273804e-01 -2.69172713e-02 -4.73894477e-02  3.63585120e-03
 -6.13963939e-02 -7.23956227e-02  7.51107989e-04  6.23711720e-02
 -7.55563229e-02 -5.70931472e-02 -1.07405148e-01  3.39920595e-02
 -1.84150634e-03  1.12112416e-02 -2.42493022e-03  5.04862182e-02
  5.25302887e-02 -6.56340346e-02 -1.13313003e-02  6.41928194e-03
 -6.78869933e-02 -4.10430133e-02 -4.40421887e-02