In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sturub/automarkup.parquet
/kaggle/input/sturub/sample_submission.csv
/kaggle/input/sturub/videos.parquet
/kaggle/input/sturub/manualmarkup.csv
/kaggle/input/sturub/features.parquet
/kaggle/input/sturub/Rutube.xlsx
/kaggle/input/sturub/requirements.txt
/kaggle/input/sturub/baseline.ipynb
/kaggle/input/sturub/features/features_nov.parquet
/kaggle/input/sturub/features/features_sep.parquet
/kaggle/input/sturub/features/features_may.parquet
/kaggle/input/sturub/features/features_jun.parquet
/kaggle/input/sturub/features/features_aug.parquet
/kaggle/input/sturub/features/features_oct.parquet
/kaggle/input/sturub/features/features_jul.parquet
/kaggle/input/sturub/test_data/test_data/sample_submission.csv
/kaggle/input/sturub/test_data/test_data/submission_queries.csv
/kaggle/input/sturub/metric/metric/sample_submission.csv
/kaggle/input/sturub/metric/metric/script.py
/kaggle/input/sturub/metric/metric/ground_truth.csv


In [None]:
!pip install rank_bm25 pyaspeller transliterate pymorphy2 fastparquet hnswlib sentence_transformers

In [3]:
import json
from sklearn.metrics import ndcg_score
import pyarrow.parquet as pq
import time
import os
import hnswlib
import pickle
import polars as pl
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from scipy.sparse import hstack
import cudf
#from cuml.neighbors import NearestNeighbors
import re
from sklearn.preprocessing import MaxAbsScaler
from pyaspeller import YandexSpeller
from sklearn.decomposition import TruncatedSVD, PCA
from transliterate import translit
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, top_k_accuracy_score



In [4]:
# Other tables shipped with the dataset (not loaded in this cell):
# automarkup = pd.read_parquet('/kaggle/input/sturub/automarkup.parquet')
# sample_submission = pd.read_csv('/kaggle/input/sturub/sample_submission.csv')
# manualmarkup = pd.read_csv('/kaggle/input/sturub/manualmarkup.csv')

# Read the video catalogue and keep a reproducible 300k-row sample (seed 42, no replacement).
videos = (
    pd.read_parquet('/kaggle/input/sturub/videos.parquet', engine='fastparquet')
    .sample(n=300_000, replace=False, random_state=42)
)

In [5]:
# Location for a serialized hnswlib index; nothing in the visible cells reads or writes it yet.
index_path = './hnswlib.index'
#index = hnswlib.Index(space = 'cosine', dim = 768)

In [6]:
from sentence_transformers import SentenceTransformer, util

# Multilingual sentence encoder used to embed both video titles and search queries.
model_name = 'quora-distilbert-multilingual'
model = SentenceTransformer(model_name, device='cuda')
# Ask the model for its output dimension instead of hard-coding it, so the ANN
# index stays consistent if model_name is changed.
embedding_size = model.get_sentence_embedding_dimension()

Downloading .gitattributes:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/447 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [7]:
class RutubeSearchModel:
    """Two-stage video search: dense nearest-neighbour retrieval re-ranked with BM25.

    ``fit`` embeds lower-cased video titles with the module-level sentence encoder
    ``model`` and stores them in an hnswlib cosine index. ``predict_front``
    spell-corrects a query, retrieves nearest titles from the index and re-orders
    them by BM25 score against the corrected query.

    Relies on the module-level names ``model`` and ``embedding_size``.
    """

    # Minimum number of ANN candidates handed to the BM25 re-ranker.
    _MIN_CANDIDATES = 10

    # Matches any single Cyrillic (Russian alphabet) character.
    _RUSSIAN_CHAR_RE = re.compile('[а-яА-ЯёЁ]')

    # QWERTY key -> character at the same position on the Russian (ЙЦУКЕН) layout.
    _LAYOUT_MAP = {'q': 'й','w': 'ц','e': 'у','r': 'к', 't': 'е',
                   'y': 'н','u': 'г','i': 'ш','o': 'щ', 'p': 'з',
                   '[': 'х',']': 'ъ','a': 'ф','s': 'ы', 'd': 'в',
                   'f': 'а','g': 'п','h': 'р','j': 'о','k': 'л',
                   'l': 'д',';': 'ж',"'": 'э','z': 'я','x': 'ч',
                   'c': 'с','v': 'м','b': 'и','n': 'т','m': 'ь',
                   ',': 'б','.': 'ю','/': '.' }

    def __init__(self, top_k: int) -> None:
        """Create an unfitted search model.

        Args:
            top_k: number of results ``predict_front`` returns per query.
        """
        self._vec = model
        self._knn = hnswlib.Index(space = 'cosine', dim = embedding_size)
        self.speller = YandexSpeller()
        self._top_k = top_k

    def fit(self, videos: pd.DataFrame):
        """Embed the video titles and build the nearest-neighbour index.

        Args:
            videos: frame with at least ``video_id``, ``video_title``,
                ``channel_title`` and ``v_category`` columns. It is not modified.

        Returns:
            ``self``, to allow call chaining.
        """
        # assign() builds a new frame, so the caller's DataFrame does not
        # silently gain a 'search_tag' column.
        videos = videos.assign(
            search_tag=videos['video_title'].astype(str) + " " + videos['channel_title'].astype(str) + " " + videos['v_category'].astype(str)
        )
        videos = self.text_preprocessing(videos)
        self._videos = videos[['video_id','video_title']]

        # NOTE(review): only the title is embedded; 'search_tag' is prepared above
        # but not used for retrieval. Kept as-is so results do not change.
        corpus = videos['video_title'].str.lower().values

        self._mx_videos = self._vec.encode(corpus, show_progress_bar=True, convert_to_numpy=True)

        # Start from a fresh index so fit() can be re-run on the same instance
        # (init_index on an already initialised hnswlib index raises).
        self._knn = hnswlib.Index(space = 'cosine', dim = self._mx_videos.shape[1])
        self._knn.init_index(max_elements = len(self._mx_videos), ef_construction = 400, M = 64)
        self._knn.add_items(self._mx_videos, list(range(len(self._mx_videos))))
        # ef must not be smaller than the number of neighbours requested at query time.
        self._knn.set_ef(max(50, self._top_k))
        return self

    def predict_front(self, test):
        """Return the ``top_k`` best videos for a single query string.

        Args:
            test: raw user query.

        Returns:
            DataFrame with ``video_id``, lower-cased ``video_title``, ``neighbors``
            (distance reported by the ANN index) and ``bm25_score``, ordered by
            descending BM25 score; ties are broken by ascending distance.
        """
        # One spell-check round trip per query; the corrected text is used for
        # both the embedding and the BM25 query.
        query = self.speller.spelled(test)
        query_vec = self._vec.encode(query)

        # Retrieve at least top_k candidates, but never more than the index holds.
        n_candidates = min(max(self._top_k, self._MIN_CANDIDATES), len(self._videos))
        indices, distances = self._knn.knn_query(query_vec, k=n_candidates)

        # copy() so the new columns are written to an independent frame and not
        # to a slice of self._videos.
        search = self._videos.iloc[indices.flatten()].copy()
        search['neighbors'] = distances.flatten()
        search['video_title'] = search['video_title'].str.lower()

        tokenized_corpus = [doc.split(" ") for doc in search['video_title'].tolist()]
        bm25 = BM25Okapi(tokenized_corpus)
        search['bm25_score'] = bm25.get_scores(query.lower().split(" "))

        # BM25 is frequently 0 for every candidate; falling back to the embedding
        # distance keeps the order deterministic and meaningful.
        ranked = search.sort_values(by=['bm25_score', 'neighbors'], ascending=[False, True])
        return ranked.head(self._top_k)

    def predict_submission(self, test):
        """Run ``predict_front`` for every query in ``test``.

        Args:
            test: DataFrame with a ``query`` column.

        Returns:
            DataFrame with columns ``query`` and ``video_id``, one row per
            (query, retrieved video) pair.
        """
        sub = []
        # Iterate the frame that was passed in rather than a notebook global.
        for query in tqdm(test['query']):
            for video_id in self.predict_front(query)['video_id'].tolist():
                sub.append([query, video_id])
        return pd.DataFrame(sub, columns = ['query', 'video_id'])

    def text_preprocessing(self,videos):
        """Lower-case the ``search_tag`` column in place and return the same frame."""
        videos['search_tag'] = videos['search_tag'].str.lower()
        return videos

    def parse_russian_words(self, text):
        """Transliterate ``text`` into Cyrillic using ``translit(text, 'ru')``."""
        # When transliteration changes nothing the result equals the input, so
        # no separate "unchanged" branch is needed.
        return translit(text, 'ru')

    def find_foreign_layout_characters(self,text):
        """Map text typed on a QWERTY layout to the Russian-layout characters.

        Spaces are kept. Characters present in the layout map are replaced
        (case-insensitively, output is lower-case). Every other character
        (digits, Cyrillic letters, other punctuation) is dropped.
        """
        converted = []
        for char in text:
            if char == ' ':
                converted.append(char)
            else:
                mapped = self._LAYOUT_MAP.get(char.lower())
                if mapped is not None:
                    converted.append(mapped)
        return ''.join(converted)

    def has_russian_words(self, text):
        """Return True if ``text`` contains at least one Cyrillic character."""
        return bool(self._RUSSIAN_CHAR_RE.search(text))

In [8]:
# Search model that returns the 5 best-ranked videos per query.
model_search = RutubeSearchModel(top_k=5)

In [9]:
%%time
# Embeds every sampled title and builds the ANN index; the recorded run took about 10 minutes of wall time.
model_search.fit(videos)

Batches:   0%|          | 0/9375 [00:00<?, ?it/s]

CPU times: user 27min 39s, sys: 11 s, total: 27min 50s
Wall time: 10min 7s


<__main__.RutubeSearchModel at 0x7ac57d815990>

In [10]:
%%time
# Manual check: multi-word Russian query.
model_search.predict_front('моя фамилия шилов')

моя фамилия шилов


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 81 ms, sys: 3.01 ms, total: 84 ms
Wall time: 1.35 s


Unnamed: 0,video_id,video_title,neighbors,bm25_score
4246582,video_33711482,про имя ярослав,0.02603,0.0
28255032,video_20341816,мари краймбрери - я хотела твою фамилию,0.026541,0.0
27048975,video_27110808,"как маму зовут? оля, а папу? батя.",0.028095,0.0
10107610,video_10942561,імена (лукянець),0.029783,0.0
15745837,video_24478326,мне зовут егорка,0.032541,0.0


In [12]:
%%time
# Manual check: single-word Russian query.
model_search.predict_front('война')

война


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 72.6 ms, sys: 2.99 ms, total: 75.6 ms
Wall time: 1.22 s


Unnamed: 0,video_id,video_title,neighbors,bm25_score
21556498,video_24319409,как на войне,0.031671,0.0
18759629,video_640446,world war 3,0.044468,0.0
15086712,video_10014524,vader - this is the war,0.047791,0.0
33482741,video_1486268,world war 2. бой #76.,0.051213,0.0
13615648,video_26586590,когда мы были на войне,0.055232,0.0


In [14]:
%%time
# Manual check: Latin-script query.
model_search.predict_front('linkin park')

linkin park


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 78.6 ms, sys: 4.01 ms, total: 82.6 ms
Wall time: 1.22 s


Unnamed: 0,video_id,video_title,neighbors,bm25_score
17464364,video_5851374,linkin park - crawling,0.104087,0.909446
22159963,video_7782088,in the end - linkin park,0.032844,0.767837
12480052,video_11516579,linkin park - somewhere i belong,0.042492,0.767837
17253207,video_31570778,linkin park - across the line [1 hour],0.098576,0.664386
135224,video_5857906,tema park vialand,0.095783,0.536984


In [18]:
%%time
# Manual check: two-word Russian query.
model_search.predict_front('будущее близко')

будущее близко


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 81.2 ms, sys: 3 ms, total: 84.2 ms
Wall time: 1.22 s


Unnamed: 0,video_id,video_title,neighbors,bm25_score
8012118,video_28765120,конструируем будущее,0.053432,0.459656
14895664,video_1903450,"будущее в паре, на ближайшее будущее",0.019138,0.432617
19290865,video_11128887,будущее за нами,0.018028,0.39754
8630107,video_2721163,будущее - прошлое,0.055548,0.39754
24079339,video_3412695,события ближайшего будущего 🩶,0.015714,0.0


In [None]:
# The manual markup table is not loaded by any earlier cell (its read_csv is
# commented out), so load it here to keep the notebook runnable from a fresh kernel.
manualmarkup = pd.read_csv('/kaggle/input/sturub/manualmarkup.csv')

# Validation set: (query, video_id) pairs whose label is "exact".
val_df = manualmarkup.loc[manualmarkup['sentiment']=="exact", ['query','video_id']]

In [None]:
# Retrieve videos for the validation queries; result columns are 'query' and 'video_id'.
sub_df = model_search.predict_submission(val_df)