In [None]:
# Mount Google Drive at /content/drive (Colab-only API); later cells change into
# a directory under this mount point and keep the embedding cache there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy import sparse

from catboost import CatBoostClassifier




In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/avarar

/content/drive/MyDrive/Colab Notebooks/avarar


In [None]:
!pip install catboost



In [None]:


from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy import sparse

from catboost import CatBoostClassifier

In [None]:
!pip -q install -U sentence-transformers catboost pyarrow


In [None]:
!pip -q install -U fastparquet


In [None]:
import pandas as pd

# Label column; it is requested only from the training file.
TARGET = "item_contact"

# Columns requested from both parquet files.
cols = [
    # identifiers
    "query_id",
    "item_id",
    # free text
    "query_text",
    "item_title",
    "item_description",
    # categorical ids
    "query_cat",
    "query_mcat",
    "query_loc",
    "item_cat_id",
    "item_mcat_id",
    "item_loc",
    # numeric signals
    "price",
    "item_query_click_conv",
]

# Only the listed columns are read from each file.
train = pd.read_parquet(
    "train-dset.parquet",
    engine="fastparquet",
    columns=[*cols, TARGET],
)
test = pd.read_parquet(
    "test-dset-small.parquet",
    engine="fastparquet",
    columns=cols,
)

print(train.shape, test.shape)
train.head()


(7781790, 14) (335348, 13)


Unnamed: 0,query_id,item_id,query_text,item_title,item_description,query_cat,query_mcat,query_loc,item_cat_id,item_mcat_id,item_loc,price,item_query_click_conv,item_contact
0,4,7349717282,ботинки детские zara 21,Ботинки детские Zara,Новые полуботинки фирмы Zara. \nразмеры 21 сте...,29.0,38.0,624480.0,29,2179540,638660,500.0,-1.0,0.0
1,4,7519735286,ботинки детские zara 21,Детские ботинки Zara унисекс,"Крутые ботинки, в отличном состоянии",29.0,38.0,624480.0,29,2179540,637640,250.0,-1.0,0.0
2,4,4384449104,ботинки детские zara 21,Ботинки детские zara,Челси димесезонные Zara \nВ идеальном состояни...,29.0,38.0,624480.0,29,2179540,623880,1500.0,-1.0,0.0
3,4,7283365509,ботинки детские zara 21,Детские ботиночки Zara 21 размер,АВИТО ДОСТАВКА .21 РАЗМЕР.,29.0,38.0,624480.0,29,2179540,628530,220.0,-1.0,0.0
4,4,4452768560,ботинки детские zara 21,Детские ботиночки zara размер 21,Детские ботинки Zara \nРазмер 21 - 13 см\nСост...,29.0,38.0,624480.0,29,2179540,637640,1648.0,-1.0,1.0


In [None]:
# Free-text columns; prepare() fills their missing values with "" and casts to str.
text_cols = [
    "query_text",
    "item_title",
    "item_description",
]

# Categorical id columns; prepare() casts them to str and they are passed to
# CatBoost as cat_features.
cat_cols = [
    "query_cat",
    "query_mcat",
    "query_loc",
    "item_cat_id",
    "item_mcat_id",
    "item_loc",
]

In [None]:
# Column that identifies a query; used for per-query price statistics in prepare(),
# for the grouped CV split, and as the CatBoost group_id.
GROUP  = "query_id"

In [None]:
import numpy as np

In [None]:


def _cat_ids_to_str(values: pd.Series) -> pd.Series:
    """Render a categorical id column as strings with one spelling per id.

    Integer-valued ids are written as integer text ("29" for both 29 and 29.0),
    so a float-typed column and an int-typed column holding the same id compare
    equal. Missing values become "NA". A column that contains non-numeric or
    non-integer values falls back to ``fillna("NA").astype(str)``.
    """
    numeric = pd.to_numeric(values, errors="coerce")
    # Every non-missing value must have parsed, otherwise the column is not purely numeric.
    all_parsed = bool((numeric.notna() == values.notna()).all())
    if all_parsed and bool((numeric.dropna() % 1 == 0).all()):
        as_text = numeric.fillna(0).astype(np.int64).astype(str)
        return as_text.where(numeric.notna(), "NA")
    return values.fillna("NA").astype(str)


def prepare(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with cleaned columns and derived features.

    Reads the module-level ``text_cols``, ``cat_cols`` and ``GROUP``.

    Steps:
      * text columns: missing -> "", cast to str;
      * categorical id columns: normalised to strings via ``_cat_ids_to_str``
        (a plain ``astype(str)`` turns the float query-side ids into "29.0" and the
        int item-side ids into "29", which made every ``*_match`` flag 0);
      * ``price``: coerced to numeric, missing -> 0;
      * ``item_query_click_conv``: negative and missing values are flagged in
        ``conv_missing`` and replaced by the median of the remaining values of
        the frame passed in (0 if there are none);
      * ``cat_match`` / ``mcat_match`` / ``loc_match``: query-side id equals item-side id;
      * ``log_price``, per-``GROUP`` ``price_rank_pct`` and ``price_z``;
      * ``q_len`` / ``t_len`` / ``d_len``: character lengths of the text columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns named above; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns (cleaned) plus the derived ones.
    """
    df = df.copy()

    for c in text_cols:
        df[c] = df[c].fillna("").astype(str)

    for c in cat_cols:
        df[c] = _cat_ids_to_str(df[c])

    df["price"] = pd.to_numeric(df["price"], errors="coerce").fillna(0).astype(np.float32)

    conv = pd.to_numeric(df["item_query_click_conv"], errors="coerce")
    # Values that are negative or unparseable/missing are all imputed below,
    # so all of them are flagged.
    conv_unknown = conv.isna() | (conv < 0)
    df["conv_missing"] = conv_unknown.astype(np.int8)
    conv = conv.mask(conv_unknown)
    med = conv.median()
    df["item_query_click_conv"] = conv.fillna(med if np.isfinite(med) else 0).astype(np.float32)

    df["cat_match"]  = (df["query_cat"]  == df["item_cat_id"]).astype(np.int8)
    df["mcat_match"] = (df["query_mcat"] == df["item_mcat_id"]).astype(np.int8)
    df["loc_match"]  = (df["query_loc"]  == df["item_loc"]).astype(np.int8)

    df["log_price"] = np.log1p(df["price"].clip(lower=0)).astype(np.float32)
    g = df.groupby(GROUP)["price"]
    df["price_rank_pct"] = g.rank(pct=True).astype(np.float32)

    mu = g.transform("mean").astype(np.float32)
    # A zero std (all prices in the group equal) would divide by zero; treat it like
    # the undefined std of a single-row group and let price_z fall back to 0.
    sd = g.transform("std").replace(0, np.nan).astype(np.float32)
    df["price_z"] = ((df["price"] - mu) / sd).fillna(0).astype(np.float32)

    df["q_len"] = df["query_text"].str.len().astype(np.int32)
    df["t_len"] = df["item_title"].str.len().astype(np.int32)
    df["d_len"] = df["item_description"].str.len().astype(np.int32)

    return df


In [None]:
# Clean both splits and add the derived features. prepare() works on a copy;
# each name is rebound to the prepared frame right after its own call.
train = prepare(train)
test  = prepare(test)

In [None]:
# Share of training rows that the most frequent items should cover.
TARGET_SHARE = 0.80

# Items ordered by how many training rows they appear in, with the running share of rows.
vc = train["item_id"].value_counts()
cum_share = vc.cumsum().div(len(train))

# Smallest number of top items whose cumulative share reaches TARGET_SHARE.
K = 1 + int(np.searchsorted(cum_share.to_numpy(), TARGET_SHARE))

print("K_for_share", TARGET_SHARE, "=", K)
print("topK covers:", float(cum_share.iloc[K - 1]))

# Top-K frequent items plus every item that has at least one positive label.
freq_items = vc.index[:K].to_numpy()
pos_items = train.loc[train[TARGET].eq(1), "item_id"].unique()

keep_items = np.union1d(freq_items, pos_items)
print("keep_items unique:", len(keep_items))


K_for_share 0.8 = 4216700
topK covers: 0.8
keep_items unique: 4277139


In [None]:
from sentence_transformers import SentenceTransformer
import torch, os

# Sentence-embedding checkpoint used for both query and item texts.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Use the GPU when torch reports one, otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
pop_model = SentenceTransformer(MODEL_NAME, device=device)

# Directory on the mounted Drive where the embedding memmaps are stored.
CACHE_DIR = "/content/drive/MyDrive/avito_emb_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Number of description characters appended to the title when building item_text.
DESC_MAX_CHARS = 300

# One row per query id across train and test (first occurrence is kept).
q_all = pd.concat([
    train[[GROUP, "query_text"]],
    test[[GROUP, "query_text"]],
], axis=0).drop_duplicates(GROUP).reset_index(drop=True)

# How many items to take from the frequency ranking and from the
# "has a real conversion value" ranking.
TOPK_FREQ = 1_200_000
TOPK_CONV = 300_000

pos_items = train.loc[train[TARGET] == 1, "item_id"].unique()
freq_items = train["item_id"].value_counts().head(TOPK_FREQ).index.values
# prepare() has already replaced negative/missing item_query_click_conv values with
# the median, so a `>= 0` test on the column itself is true for every row and would
# just reproduce the top of the frequency ranking. The conv_missing flag written by
# prepare() still records which rows carried a real value.
conv_items = (
    train.loc[train["conv_missing"] == 0, "item_id"]
    .value_counts().head(TOPK_CONV).index.values
)

# NOTE(review): pos_items is chosen using the label, and membership in i_all later
# decides has_sim / sim_q_item for training rows, while every test item is always
# included below. Confirm this does not leak the target into those features.
keep_items = np.unique(np.concatenate([pos_items, freq_items, conv_items]))

# All test items plus the selected training items, one row per item id
# (test rows come first, so their text wins on duplicates).
i_all = pd.concat([
    test[["item_id","item_title","item_description"]],
    train.loc[train["item_id"].isin(keep_items), ["item_id","item_title","item_description"]],
], axis=0).drop_duplicates("item_id").reset_index(drop=True)

print("i_all unique items:", len(i_all))
# Text that gets embedded for an item: title, a separator, and a truncated description.
i_all["item_text"] = (
    i_all["item_title"].astype(str)
    + " [SEP] "
    + i_all["item_description"].astype(str).str.slice(0, DESC_MAX_CHARS)
)



i_all unique items: 1636533


In [None]:
def encode_to_memmap_series(text_series: pd.Series, out_path, batch_size=128, dtype=np.float16):
    """Encode texts with the module-level ``pop_model`` into an on-disk memmap.

    The embeddings are stored in ``out_path`` as an ``(n, dim)`` array of ``dtype``;
    ``dim`` is stored next to it in ``out_path + ".meta.npy"``.

    Cache handling:
      * an existing cache is reused only if its file size equals
        ``n * dim * itemsize``; otherwise it is treated as stale and rebuilt;
      * a rebuild writes to ``out_path + ".tmp"`` and is moved into place (and the
        meta file written) only after every batch has been encoded, so an
        interrupted run cannot leave a half-filled file that looks complete.
        Caches left behind by an interrupted run of an earlier version of this
        function have the full size and cannot be told apart from complete ones.

    Parameters
    ----------
    text_series : pd.Series
        Texts to encode, in the row order of the returned array.
    out_path : str
        Path of the memmap file.
    batch_size : int
        Number of texts passed to ``pop_model.encode`` per call.
    dtype : numpy dtype
        Storage dtype of the memmap.

    Returns
    -------
    np.memmap
        Read-only ``(len(text_series), dim)`` view of the cached embeddings.

    Raises
    ------
    ValueError
        If ``text_series`` is empty or ``batch_size`` is not positive.
    """
    n = len(text_series)
    if n == 0:
        raise ValueError("encode_to_memmap_series: text_series is empty")
    if batch_size < 1:
        raise ValueError("encode_to_memmap_series: batch_size must be >= 1")

    meta_path = out_path + ".meta.npy"
    itemsize = np.dtype(dtype).itemsize

    if os.path.exists(out_path) and os.path.exists(meta_path):
        dim = int(np.load(meta_path))
        if os.path.getsize(out_path) == n * dim * itemsize:
            return np.memmap(out_path, mode="r", dtype=dtype, shape=(n, dim))
        print(f"stale embedding cache at {out_path} (size does not match n={n}); re-encoding")

    # Remove a leftover meta file first so a crash during the rebuild cannot pair
    # it with an out-of-date data file.
    if os.path.exists(meta_path):
        os.remove(meta_path)

    tmp_path = out_path + ".tmp"
    mm = None
    dim = None
    start = 0
    while start < n:
        end = min(start + batch_size, n)
        emb = pop_model.encode(
            text_series.iloc[start:end].tolist(),
            batch_size=batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=(start == 0),
        )
        if mm is None:
            # The embedding width is only known after the first batch.
            dim = emb.shape[1]
            mm = np.memmap(tmp_path, mode="w+", dtype=dtype, shape=(n, dim))
        mm[start:end] = emb.astype(dtype)
        start = end

    mm.flush()
    del mm  # release the writable mapping before moving the file
    os.replace(tmp_path, out_path)
    np.save(meta_path, np.array(dim, dtype=np.int32))
    return np.memmap(out_path, mode="r", dtype=dtype, shape=(n, dim))


In [None]:
# Last path component of the model name, used in cache file names.
MODEL_TAG = MODEL_NAME.split("/")[-1]
# Bump manually to force the query embeddings to be recomputed.
Q_VER = "v6"
# The row count is part of the key (as it is for i_path) so that a cache built for a
# different set of queries is not picked up. Files written under the previous
# naming scheme are simply not found and the embeddings are re-encoded once.
q_path = os.path.join(CACHE_DIR, f"q_emb_{MODEL_TAG}_n{len(q_all)}_{Q_VER}.f16.mmp")



In [None]:
# Bump manually to force the item embeddings to be recomputed.
I_VER = "v10"

# Cache file for item embeddings; the name encodes the model, the description
# truncation length, the number of items and the manual version tag.
i_path = os.path.join(
    CACHE_DIR,
    f"i_emb_{MODEL_TAG}_desc{DESC_MAX_CHARS}_n{len(i_all)}_{I_VER}.f16.mmp",
)

In [None]:
# Query embeddings, one row per row of q_all (loaded from q_path when cached).
q_emb = encode_to_memmap_series(
    text_series=q_all["query_text"],
    out_path=q_path,
    batch_size=256,
)




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Item embeddings, one row per row of i_all (loaded from i_path when cached).
i_emb = encode_to_memmap_series(
    text_series=i_all["item_text"],
    out_path=i_path,
    batch_size=256,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Shapes of the query and item embedding arrays.
print(q_emb.shape, i_emb.shape)

(690695, 384) (1636533, 384)


In [None]:

# q_idx / i_idx are row positions in q_all / i_all, i.e. the rows of q_emb / i_emb
# that were encoded from them.
if "q_idx" not in q_all.columns:
    q_all = q_all.copy()
    q_all["q_idx"] = np.arange(len(q_all), dtype=np.int32)

if "i_idx" not in i_all.columns:
    i_all = i_all.copy()
    i_all["i_idx"] = np.arange(len(i_all), dtype=np.int32)

# Make the cell safe to re-run: without this a second execution would merge the
# index columns again and produce q_idx_x / q_idx_y (and i_idx_x / i_idx_y).
train = train.drop(columns=["q_idx", "i_idx"], errors="ignore")
test  = test.drop(columns=["q_idx", "i_idx"], errors="ignore")

# Left joins keep every row; validate= makes pandas raise if a lookup key is
# duplicated on the right side, which would otherwise multiply rows silently.
train = train.merge(q_all[[GROUP, "q_idx"]], on=GROUP, how="left", validate="many_to_one")
test  = test.merge(q_all[[GROUP, "q_idx"]], on=GROUP, how="left", validate="many_to_one")

# Items outside i_all get a missing i_idx here.
train = train.merge(i_all[["item_id", "i_idx"]], on="item_id", how="left", validate="many_to_one")
test  = test.merge(i_all[["item_id", "i_idx"]], on="item_id", how="left", validate="many_to_one")
print([c for c in train.columns if "i_idx" in c])
print([c for c in i_all.columns if "i_idx" in c])



['i_idx']
['i_idx']


In [None]:

# has_sim marks rows whose item has an embedding row (i_idx present);
# sim_q_item starts at 0.0 and is filled in by a later cell.
for _frame in (train, test):
    _frame["has_sim"] = _frame["i_idx"].notna()
    _frame["sim_q_item"] = 0.0


In [None]:
# Rows whose query has no embedding row cannot be scored; report and drop them.
train_has_q = train["q_idx"].notna()
missing_q = (~train_has_q).sum()
if missing_q:
    print("train rows without q_emb:", missing_q, "=> dropping them")
    train = train[train_has_q].copy()

test_has_q = test["q_idx"].notna()
missing_q_test = (~test_has_q).sum()
if missing_q_test:
    print("test rows without q_emb:", missing_q_test, "=> dropping them")
    test = test[test_has_q].copy()

# With no missing values left, q_idx can be stored as a plain integer column.
train["q_idx"] = train["q_idx"].astype(np.int32)
test["q_idx"] = test["q_idx"].astype(np.int32)


In [None]:
def add_sim_feature(df: pd.DataFrame, chunk_size=200_000) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``sim_q_item`` column of query-item dot products.

    For every row, the query embedding ``q_emb[q_idx]`` and the item embedding
    ``i_emb[i_idx]`` (both module-level arrays) are loaded as float32 and
    multiplied row-wise. The embeddings are produced with
    ``normalize_embeddings=True``, so the value is approximately a cosine
    similarity. Rows are processed ``chunk_size`` at a time; min/mean/max of
    the first chunk are printed.

    ``q_idx`` and ``i_idx`` must be convertible to int32 (no missing values).
    """
    out = df.copy()
    total = len(out)
    scores = np.empty(total, dtype=np.float32)

    query_rows = out["q_idx"].to_numpy(np.int32)
    item_rows = out["i_idx"].to_numpy(np.int32)

    for chunk_no, lo in enumerate(range(0, total, chunk_size)):
        hi = min(lo + chunk_size, total)

        # Gather the embedding rows for this chunk and upcast before multiplying.
        q_block = np.array(q_emb[query_rows[lo:hi]], dtype=np.float32)
        i_block = np.array(i_emb[item_rows[lo:hi]], dtype=np.float32)

        # Row-wise dot product without building the full (chunk x chunk) matrix.
        scores[lo:hi] = np.einsum("ij,ij->i", q_block, i_block)

        if chunk_no == 0:
            first = scores[lo:hi]
            print("first chunk sim stats:", first.min(), first.mean(), first.max())

    out["sim_q_item"] = scores
    return out


In [None]:

# Rows that have both an item embedding and a query embedding, with the index
# columns cast to int32 so they can be used for array lookups.
_idx_dtypes = {"q_idx": np.int32, "i_idx": np.int32}
train_sim = train[train["has_sim"] & train["q_idx"].notna()].astype(_idx_dtypes)
test_sim = test[test["has_sim"] & test["q_idx"].notna()].astype(_idx_dtypes)

train_sim = add_sim_feature(train_sim, chunk_size=200_000)
test_sim = add_sim_feature(test_sim, chunk_size=200_000)

# Write the similarities back by index label, then mark rows without an item
# embedding with the sentinel -2.0.
train.loc[train_sim.index, "sim_q_item"] = train_sim["sim_q_item"].to_numpy()
test.loc[test_sim.index, "sim_q_item"] = test_sim["sim_q_item"].to_numpy()
train.loc[~train["has_sim"], "sim_q_item"] = -2.0
test.loc[~test["has_sim"], "sim_q_item"] = -2.0


first chunk sim stats: -0.124466226 0.50828475 0.982785
first chunk sim stats: -0.12712443 0.49879047 0.9796449


In [None]:
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupKFold

# Model inputs: the categorical id columns followed by the numeric/flag features.
features = [
    *cat_cols,
    "price",
    "log_price",
    "price_rank_pct",
    "price_z",
    "item_query_click_conv",
    "conv_missing",
    "cat_match",
    "mcat_match",
    "loc_match",
    "q_len",
    "t_len",
    "d_len",
    "has_sim",
    "sim_q_item",
]

# Sort so that all rows of a query are adjacent before the pools are built.
train = train.sort_values(by=[GROUP, "item_id"], ignore_index=True)

X = train[features]
y = train[TARGET].to_numpy(dtype=np.float32)
qid = train[GROUP].to_numpy()

# Hold out the first of five query-grouped folds for validation.
gkf = GroupKFold(n_splits=5)
_first_fold = next(gkf.split(X, y, groups=qid))
tr_idx, va_idx = (np.sort(part) for part in _first_fold)


def _pool_for(rows):
    """Build a CatBoost Pool from the given row positions of X / y / qid."""
    return Pool(
        X.iloc[rows],
        label=y[rows],
        group_id=qid[rows],
        cat_features=cat_cols,
    )


train_pool = _pool_for(tr_idx)
valid_pool = _pool_for(va_idx)

# CatBoostRanker settings; the random seed is passed where the model is created.
params = {
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=10",
    "iterations": 2000,
    "learning_rate": 0.04,
    "depth": 8,
    "l2_leaf_reg": 6.0,
    "random_strength": 0.8,
    "verbose": 50,
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "od_type": "Iter",
    "od_wait": 100,
    "metric_period": 50,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
}

In [None]:


# Train on the training pool, keeping the iteration that scores best on the
# validation pool.
cb_model = CatBoostRanker(random_seed=42, **params)
cb_model.fit(
    train_pool,
    eval_set=valid_pool,
    use_best_model=True,
    plot=True,
)

best_iter = cb_model.get_best_iteration()
print("best_iter:", best_iter)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8630892	best: 0.8630892 (0)	total: 1.44s	remaining: 48m 7s
50:	test: 0.8913135	best: 0.8913271 (49)	total: 1m 3s	remaining: 40m 21s
100:	test: 0.8933971	best: 0.8933971 (100)	total: 2m 8s	remaining: 40m 14s
150:	test: 0.8949935	best: 0.8950033 (149)	total: 3m 13s	remaining: 39m 24s
200:	test: 0.8956277	best: 0.8956442 (197)	total: 4m 17s	remaining: 38m 27s
250:	test: 0.8960369	best: 0.8960419 (248)	total: 5m 21s	remaining: 37m 22s
300:	test: 0.8962375	best: 0.8962375 (300)	total: 6m 26s	remaining: 36m 19s
350:	test: 0.8964549	best: 0.8964658 (348)	total: 7m 31s	remaining: 35m 20s
400:	test: 0.8966681	best: 0.8966681 (400)	total: 8m 34s	remaining: 34m 13s
450:	test: 0.8968648	best: 0.8968746 (448)	total: 9m 38s	remaining: 33m 8s
500:	test: 0.8968874	best: 0.8969159 (496)	total: 10m 43s	remaining: 32m 4s
550:	test: 0.8970184	best: 0.8970184 (550)	total: 11m 47s	remaining: 31m 1s
600:	test: 0.8971661	best: 0.8971833 (599)	total: 12m 51s	remaining: 29m 56s
650:	test: 0.8973311	b

In [None]:
from catboost import Pool

# A Pool built with group_id needs the rows of each group to be adjacent. The
# training frame is sorted by query before its pools are built; do the same for
# test instead of relying on the order of the input file. A stable sort keeps
# the original order of rows inside each query.
test = test.sort_values(GROUP, kind="mergesort").reset_index(drop=True)

test_pool = Pool(
    test[features],
    group_id=test[GROUP].values,
    cat_features=cat_cols
)
test["score"] = cb_model.predict(test_pool)

# Within each query, list items from highest to lowest predicted score.
sub = test[[GROUP, "item_id", "score"]].sort_values([GROUP, "score"], ascending=[True, False])

submission_df = sub[[GROUP, "item_id"]]
submission_df.to_csv(
    "solution3.csv",
    header=["query_id", "item_id"],
    index=False
)
print("saved solution3.csv", submission_df.shape)
submission_df.head()



saved solution.csv (335348, 2)


Unnamed: 0,query_id,item_id
11,55,7464296355
27,55,7576666895
0,55,7540855789
22,55,7562354327
34,55,7549689548
