In [1]:
import pickle
import numpy as np
import pandas as pd
from googletrans import Translator
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
from src.mcc_emb import create_mcc_embeddings_dict, clean_mcc_df_eng, translate_to_eng
from src.mcc_emb import process_mcc_df
from src.clickstream_emb import (
    process_categories_df,
    create_clck_embeddings_dict,
)
from src.utils import get_corpus, get_top_k_words, filter_mcc_descriptipn


[nltk_data] Downloading package stopwords to /home/glebk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load downloaded word embeddings:
# The vectors are read from a local, gzip-compressed file in word2vec format; the file
# has to be present at this path before the cell is run.

DATA = './word_emb/GoogleNews-vectors-negative300.bin.gz'

# binary=True: the file is in the binary (not the text) word2vec format.
wv_embeddings = KeyedVectors.load_word2vec_format(DATA, binary=True)

## Clickstream categories processing:

In [6]:
# Location of the raw clickstream category table (CSV).
CLICK_CATEGORIES_PATH = "./data/click_categories.csv"

In [7]:
# Read the raw clickstream categories and preview the first rows.
clickstream_categories = pd.read_csv(CLICK_CATEGORIES_PATH)
clickstream_categories.head()

Unnamed: 0,cat_id,level_0,level_1,level_2
0,1,accessories,,
1,2,accessories,handbags,
2,3,accessories,jewellery,
3,8,accessories,watches,
4,11,age,age_0-5_yo,


In [8]:
# Combine and normalize descriptions:
# NOTE: this rebinds `clickstream_categories`, so re-running the cell on its own feeds the
# already-processed frame back into `process_categories_df`.
clickstream_categories = process_categories_df(clickstream_categories)
clickstream_categories.head()

Unnamed: 0,cat_id,Description
0,1,accessories
1,2,accessories handbags
2,3,accessories jewellery
3,8,accessories watches
4,11,age child


In [9]:
# Persist the normalized clickstream descriptions (the DataFrame index is not written).
clickstream_categories.to_csv("./data/clck_cat_norm_eng.csv", index=False)

## MCC codes processing:

In [12]:
# Location of the raw MCC code table (CSV).
MCC_CODES_PATH = "./data/mcc_codes.csv"

In [13]:
# Read the raw MCC code table and preview the first rows.
mcc_codes = pd.read_csv(MCC_CODES_PATH)
mcc_codes.head()

Unnamed: 0,MCC,Название,Описание
0,742,Ветеринарные услуги,Лицензированные специалисты в основном занимаю...
1,763,Сельскохозяйственные кооперативы,"Ассоциации и кооперативы, которые предоставляю..."
2,780,Услуги садоводства и ландшафтного дизайна,Ландшафтные архитекторы и другие поставщики ус...
3,1520,Генеральные подрядчики – жилое и коммерческое ...,"Генеральные подрядчики, в основном занимающиес..."
4,1711,"Генеральные подрядчики по вентиляции, теплосна...","Специальные торговые подрядчики, которые работ..."


In [14]:
# RU to ENG translation:
# Pipeline: preprocess the raw table -> translate with a googletrans `Translator` ->
# clean the translated text.
# NOTE(review): presumably `translate_to_eng` calls the online translation service, which
# would make this cell network-dependent — confirm in src/mcc_emb.
# NOTE: `mcc_codes` is rebound twice here, so the cell is not idempotent on re-run.

mcc_codes = process_mcc_df(mcc_codes)

translator = Translator()
mcc_codes_trns = translate_to_eng(mcc_codes, translator)
mcc_codes = clean_mcc_df_eng(mcc_codes_trns)
mcc_codes.head()

Unnamed: 0,MCC,Description
0,742,veterinary services licensed professionals pri...
1,763,agricultural cooperatives associations provide...
2,780,horticulture landscaping services landscape ar...
3,1520,general contractors residential commercial con...
4,1711,general contractors ventilation heating plumbi...


In [15]:
# Persist the translated and cleaned MCC descriptions (the DataFrame index is not written).
mcc_codes.to_csv("./data/mcc_cat_norm_eng.csv", index=False)

## Selection of the MCC description subset closest to the clickstream description corpus:

In [3]:
# Reload the normalized tables from the CSV files written by the cells above.
# NOTE: the doubled slash in the first path is redundant; the path still points at
# ./data/mcc_cat_norm_eng.csv.
mcc_codes = pd.read_csv("./data//mcc_cat_norm_eng.csv")
clickstream_categories = pd.read_csv("./data/clck_cat_norm_eng.csv")

In [4]:
# Words to avoid in description:
# The list is handed to the corpus-building and top-K word helpers below.

stopwords = [
    "services", "sale", "goods", "may", "used",
    "eg", "include", "includes", "service", "sales",
    "selling", "example", "outlets", "retail", "else",
    "new", "including", "also", "etc",
]

In [5]:
# Clickstream categories description corpus:
# `get_corpus` receives the normalized category frame and the stopword list; the cell
# then displays the size of the resulting corpus.

clck_corpus = get_corpus(clickstream_categories, stopwords)
len(clck_corpus)

513

In [6]:
# Selection of embeddings mode: word embeddings or sentence embeddings:

MODE = "WE"  # "ST" - SentenceTransformer or "WE" - word embedding

# Fail fast on a mistyped mode: otherwise `model` would silently become None and the
# cells below would run without producing any embeddings.
if MODE not in ("ST", "WE"):
    raise ValueError(f"Unknown MODE {MODE!r}; expected 'ST' or 'WE'")

# The sentence-transformer model is only instantiated in "ST" mode.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2') if MODE == "ST" else None

In [7]:
# Clickstream description corpus embeddings:
# Builds two parallel sequences: the corpus words that could be embedded
# (`clck_corpus_list`) and their vectors (`clck_corpus_embs`, stacked into an array).

clck_corpus_list = []
clck_corpus_embs = []
for w in clck_corpus:
    if MODE == "WE":
        try:
            emb = wv_embeddings[w]
        except KeyError:
            # Word is missing from the word-embedding vocabulary: skip it. Only this
            # case is ignored, so unrelated errors are no longer silently swallowed.
            continue
    elif MODE == "ST":
        emb = model.encode(w)
    else:
        raise ValueError(f"Unknown MODE {MODE!r}; expected 'ST' or 'WE'")
    clck_corpus_embs.append(emb)
    clck_corpus_list.append(w)

clck_corpus_embs = np.array(clck_corpus_embs)

In [8]:
# Example: initial description.
# The row with label 123 serves as the worked example for the next cells.

mcc_descriptipn = mcc_codes.loc[123, "Description"]
mcc_descriptipn

'travel agencies excursion organizers mainly provide tourist information booking services outlets act agents behalf travelers booking air tickets purchase tickets land sea transport including air flights bus tours sea cruises car rental rail transportation accommodation also includes tour operators organize collect sales travel agent directly buyer traveler book tour packages hotel concierge checkout'

In [9]:
# Example: the 3 words of the description that are closest to the clickstream corpus.
# NOTE(review): `wv_embeddings` is passed regardless of MODE — confirm this is intended
# when the corpus embeddings come from the sentence-transformer ("ST") branch.

top_k = get_top_k_words(mcc_descriptipn, clck_corpus_embs, wv_embeddings, stopwords, 3)

" ".join(top_k)

'agencies purchase transport'

In [10]:
# Reduce MCC description to top-K words closest to clickstream corpus:
# NOTE(review): `mcc_descriptipn` (the single example description selected above) is
# passed as the second argument — confirm against the signature of
# `filter_mcc_descriptipn` that this is intended and not a leftover.

K = 3  # top-K words 
new_mcc = filter_mcc_descriptipn(mcc_codes, mcc_descriptipn, clck_corpus_embs, wv_embeddings, stopwords, K)

new_mcc.head()

100%|██████████| 385/385 [00:01<00:00, 370.13it/s]


Unnamed: 0,MCC,Description
0,742,surgery pets dogs
1,763,maintenance planting photography
2,780,garden care planting
3,1520,construction renovation commercial
4,1711,heating work systems


In [11]:
# Save the reduced MCC descriptions.
# NOTE(review): unlike the other CSV exports in this notebook, `index=False` is not
# passed here, so the DataFrame index is written as an extra unnamed first column —
# confirm whether consumers of this file rely on it before changing.
new_mcc.to_csv("./data/filtered_mcc_description.csv")

In [12]:
# Create MCC embeddings dict:
# NOTE(review): `mcc_embs` is not written to disk in the visible cells, while a later
# cell loads "./embeddings/mcc_emb_en.pickle" — confirm how that file is produced.

mcc_embs = create_mcc_embeddings_dict(new_mcc, wv_embeddings, MODE, model)

100%|██████████| 385/385 [00:00<00:00, 28898.80it/s]


In [13]:
# Create clickstream embeddings dict:
# NOTE(review): `clc_embs` is not written to disk in the visible cells, while a later
# cell loads "./embeddings/new_clck_cat_emb_en_filtered.pickle" — confirm how that file
# is produced.

clc_embs = create_clck_embeddings_dict(clickstream_categories, wv_embeddings, MODE, model)


100%|██████████| 402/402 [00:00<00:00, 30592.58it/s]


## Mapping from category IDs to indices:

In [18]:
# Raw input tables; these are the same files as MCC_CODES_PATH and CLICK_CATEGORIES_PATH
# defined earlier in the notebook.
MCC_PATH = "./data/mcc_codes.csv"
CLCK_PATH = "./data/click_categories.csv"

In [19]:
# Reload the raw tables; the next cells only use their code columns ('MCC', 'cat_id').
# NOTE: this rebinds `mcc_codes`, replacing the normalized English frame used earlier.
mcc_codes = pd.read_csv(MCC_PATH)
clck_codes = pd.read_csv(CLCK_PATH)

In [20]:
# Map every distinct MCC code (in ascending order) to a consecutive index 0..N-1
# and store the mapping for the submission.
mcc_codes = np.sort(mcc_codes['MCC'].unique())
idxs = list(range(len(mcc_codes)))

mcc_code_to_idx = dict(zip(mcc_codes, idxs))

with open("./submission/mcc_code_to_idx.pickle", "wb") as f:
    pickle.dump(mcc_code_to_idx, f, protocol=pickle.DEFAULT_PROTOCOL)


In [21]:
# Map every distinct clickstream category id (in ascending order) to a consecutive
# index 0..N-1 and store the mapping for the submission.
clck_codes = np.sort(clck_codes['cat_id'].unique())
idxs = list(range(len(clck_codes)))

clck_code_to_idx = dict(zip(clck_codes, idxs))

with open("./submission/cat_code_to_idx.pickle", "wb") as f:
    pickle.dump(clck_code_to_idx, f, protocol=pickle.DEFAULT_PROTOCOL)


## Description embedding matrices of shape [num_cat_idxs, emb_dim]:

In [23]:
# MCC embeddings:
# Stack the per-code embeddings into one array whose row i holds the embedding of the
# MCC code that `mcc_code_to_idx` maps to index i.

# SECURITY: pickle.load can execute arbitrary code; only load files produced by this project.
with open("./embeddings/mcc_emb_en.pickle", "rb") as f:
    mcc_emb = pickle.load(f)

# Place each embedding at the row named by its sequential index instead of relying on
# the dict's iteration order happening to coincide with that index.
mcc_seq_emb = [None] * len(mcc_code_to_idx)
for orig_idx, seq_idx in mcc_code_to_idx.items():
    mcc_seq_emb[seq_idx] = mcc_emb[orig_idx]

mcc_seq_emb = np.array(mcc_seq_emb)
mcc_seq_emb.shape

(385, 300)

In [24]:
# Store the index-ordered MCC embedding array for the submission.
with open("./submission/mcc_code_emb_seq.pickle", "wb") as f:
    pickle.dump(mcc_seq_emb, f, protocol=pickle.DEFAULT_PROTOCOL)

In [25]:
# Clickstream embeddings:
# Stack the per-category embeddings into one array whose row i holds the embedding of
# the category id that `clck_code_to_idx` maps to index i.

# SECURITY: pickle.load can execute arbitrary code; only load files produced by this project.
with open("./embeddings/new_clck_cat_emb_en_filtered.pickle", "rb") as f:
    clc_emb = pickle.load(f)

# Place each embedding at the row named by its sequential index instead of relying on
# the dict's iteration order happening to coincide with that index.
clc_emb_seq = [None] * len(clck_code_to_idx)
for orig_idx, seq_idx in clck_code_to_idx.items():
    clc_emb_seq[seq_idx] = clc_emb[orig_idx]

clc_emb_seq = np.array(clc_emb_seq)
clc_emb_seq.shape

(402, 300)

In [26]:
# Store the index-ordered clickstream embedding array for the submission.
with open("./submission/clc_code_emb_seq.pickle", "wb") as f:
    pickle.dump(clc_emb_seq, f, protocol=pickle.DEFAULT_PROTOCOL)