In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string
import emoji
from sentence_transformers import SentenceTransformer
from langdetect import detect, DetectorFactory

In [None]:
# Fix the langdetect seed so repeated runs detect the same language for the
# same input.
DetectorFactory.seed = 0

# NLTK resources used by the cells below. Newer NLTK releases load the
# 'punkt_tab' tokenizer tables in word_tokenize instead of the pickled 'punkt'
# models, so both are requested to keep the notebook runnable on a fresh
# kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# `lemmatizer` is not referenced by the functions visible in this notebook;
# it is kept in case other cells rely on it.
lemmatizer = WordNetLemmatizer()
stemmer_ru = SnowballStemmer('russian')
stemmer_en = SnowballStemmer('english')

# Sets give O(1) membership checks during token filtering.
stop_words_russian = set(stopwords.words('russian'))
stop_words_english = set(stopwords.words('english'))

# NOTE(review): this checkpoint's model card describes English training data;
# Russian queries may embed poorly -- confirm whether a multilingual model is
# wanted here.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
def preprocess_text(text, lang):
    """Normalize ``text`` into a space-joined string of stemmed tokens.

    Steps, in order: strip ASCII punctuation, replace emoji via
    ``emoji.demojize``, lower-case, tokenize with ``nltk.word_tokenize``,
    drop stop words and one-character tokens, and stem what remains.

    Args:
        text: Raw input string.
        lang: Language selector. ``'russian'`` or the ISO 639-1 code ``'ru'``
            (the form langdetect's ``detect`` returns) selects the Russian
            stemmer and stop-word list; any other value falls back to the
            English ones.

    Returns:
        The surviving stems joined by single spaces.
    """
    # Only ASCII punctuation (string.punctuation) is removed here.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # demojize runs after punctuation stripping, so whatever delimiters it
    # inserts around emoji names are not removed by the step above.
    text = emoji.demojize(text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Accept both the language name and the two-letter code: previously only
    # 'russian' matched, so a detector result of 'ru' silently used the
    # English resources.
    if lang in ('russian', 'ru'):
        stemmer = stemmer_ru
        stop_words = stop_words_russian
    else:
        stemmer = stemmer_en
        stop_words = stop_words_english

    # Stop words are matched against the unstemmed token.
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and len(word) > 1]

    normalized_text = ' '.join(tokens)
    return normalized_text

def get_embedding(text):
    """Encode ``text`` with the module-level sentence-transformer ``model``.

    Returns whatever ``model.encode`` produces for the given input.
    """
    embedding = model.encode(text)
    return embedding

def generate_hashtags(text):
    """Turn each whitespace-separated word of ``text`` into a ``#word`` tag.

    Words of length one are skipped. Order is preserved and duplicates are
    kept.

    Args:
        text: String to split on whitespace.

    Returns:
        List of hashtag strings, one per qualifying word.
    """
    tags = []
    for token in text.split():
        if len(token) <= 1:
            continue
        tags.append('#' + token)
    return tags

def process_query(query):
    """Detect the query language, normalize it, and derive embedding and hashtags.

    Args:
        query: Raw user query string.

    Returns:
        dict with keys:
            'original_query': the input string, unchanged.
            'embedding': result of ``get_embedding`` on the normalized query.
            'hashtags': result of ``generate_hashtags`` on the normalized query.
    """
    # langdetect reports ISO 639-1 codes ('ru', 'en', ...), while
    # preprocess_text selects its resources by language name. Passing the raw
    # code meant Russian queries never reached the Russian branch. Codes
    # without a mapping are passed through unchanged.
    code_to_name = {'ru': 'russian', 'en': 'english'}
    detected_code = detect(query)
    lang = code_to_name.get(detected_code, detected_code)

    normalized_query = preprocess_text(query, lang=lang)
    embedding = get_embedding(normalized_query)
    hashtags = generate_hashtags(normalized_query)

    return {
        'original_query': query,
        'embedding': embedding,
        'hashtags': hashtags
    }

In [None]:
if __name__ == "__main__":
    # Demo: run the full pipeline on a sample Russian query containing an
    # emoji and trailing punctuation, then print each field of the result.
    user_query = "Пример пользовательского запроса с эмодзи 😊 и другими символами!!!"
    result = process_query(user_query)

    report = (
        ("Original query: ", 'original_query'),
        ("Query embedding: ", 'embedding'),
        ("Generated hashtags: ", 'hashtags'),
    )
    for label, key in report:
        print(label, result[key])