In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from navec import Navec
from googletrans import Translator
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
from src.mcc_emb import create_mcc_embeddings_dict, clean_mcc_df_eng, translate_to_eng
from src.transactions_emb import get_embed_dicts, create_transactions_embeddings
from src.mcc_emb import process_mcc_df
from src.clickstream_emb import (
    process_categories_df,
    create_clck_embeddings_dict,
    load_clck_emb_dict,
    create_clickstream_embeddings,
)
from src.utils import get_corpus, calc_distances, get_words_and_embs, get_top_k_words, filter_mcc_descriptipn

from typing import Dict


[nltk_data] Downloading package stopwords to /home/glebk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ImportError: cannot import name 'get_corpus' from 'src.utils' (/home/glebk/VSProjects/projects/data_fusion_matching/src/utils.py)

In [3]:
# Load downloaded word embeddings:
# The file is read in word2vec binary format (binary=True). Later cells index
# `wv_embeddings` by word to obtain that word's vector.

DATA = './word_emb/GoogleNews-vectors-negative300.bin.gz'

wv_embeddings = KeyedVectors.load_word2vec_format(DATA, binary=True) 

## Clickstream categories processing:

In [2]:
# Location of the raw clickstream category table (relative to the working directory).
CLICK_CATEGORIES_PATH = "./data/click_categories.csv"

In [3]:
# Read the raw clickstream category table and preview its first rows.
clickstream_categories = pd.read_csv(CLICK_CATEGORIES_PATH)
clickstream_categories.head()

Unnamed: 0,cat_id,level_0,level_1,level_2
0,1,accessories,,
1,2,accessories,handbags,
2,3,accessories,jewellery,
3,8,accessories,watches,
4,11,age,age_0-5_yo,
5,12,age,age_17-23_yo,
6,13,age,age_6-16_yo,
7,14,audio,,
8,15,avid_music_listeners,,
9,19,avid_music_listeners,radio,


In [4]:
# Combine and normalize descriptions:
# NOTE(review): `clickstream_categories` is rebound from the raw table to the
# processed one, so re-running this cell presumably requires re-reading the raw
# CSV first — confirm.
clickstream_categories = process_categories_df(clickstream_categories)
clickstream_categories.head()

Unnamed: 0,cat_id,Description
0,1,accessories
1,2,accessories handbags
2,3,accessories jewellery
3,8,accessories watches
4,11,age child


In [5]:
# Save the processed clickstream descriptions; index=False keeps the row index out of the file.
clickstream_categories.to_csv("./data/clck_cat_norm_eng.csv", index=False)

## MCC codes processing:

In [18]:
# Location of the raw MCC code table (relative to the working directory).
MCC_CODES_PATH = "./data/mcc_codes_ru.csv"

In [None]:
# Read the raw MCC code table and preview its first rows.
mcc_codes = pd.read_csv(MCC_CODES_PATH)
mcc_codes.head()

In [24]:
# RU to ENG translation:
# Steps: pre-process the raw table, translate it with a googletrans Translator,
# then clean the translated table. All three helpers come from src.mcc_emb.
# NOTE(review): googletrans presumably calls an online translation service, so
# this cell likely needs network access — confirm.

mcc_codes = process_mcc_df(mcc_codes)

translator = Translator()
mcc_codes_trns = translate_to_eng(mcc_codes, translator)
mcc_codes = clean_mcc_df_eng(mcc_codes_trns)
mcc_codes.head()

Unnamed: 0,MCC,Description
0,742,veterinary services licensed professionals pri...
1,763,agricultural cooperatives associations provide...
2,780,horticulture landscaping services landscape ar...
3,1520,general contractors residential commercial con...
4,1711,general contractors ventilation heating plumbi...


In [26]:
# Save the translated and cleaned MCC descriptions; index=False keeps the row index out of the file.
mcc_codes.to_csv("./data/mcc_cat_norm_eng.csv", index=False)

In [6]:
# Reload the normalized MCC table from the file this notebook saved to ./data/,
# instead of a machine-specific absolute path, so the notebook runs on any checkout.
mcc_codes = pd.read_csv("./data/mcc_cat_norm_eng.csv")

## Selection of MCC description subset closest to clickstream description corpus:

In [51]:
# Reload both normalized tables from the files this notebook saved to ./data/
# (previously read from machine-specific absolute paths under /home/...).
mcc_codes = pd.read_csv("./data/mcc_cat_norm_eng.csv")
clickstream_categories = pd.read_csv("./data/clck_cat_norm_eng.csv")

In [52]:
# Words to leave out of the descriptions (passed to the corpus / top-k helpers below):

stopwords = [
    "services", "sale", "goods", "may", "used",
    "eg", "include", "includes", "service", "sales",
    "selling", "example", "outlets", "retail", "else",
    "new", "including", "also", "etc",
]

In [54]:
# Clickstream categories description corpus:
# Built by get_corpus from the processed clickstream table and the stopword list;
# the cell displays the size of the resulting corpus.

clck_corpus = get_corpus(clickstream_categories, stopwords)
len(clck_corpus)

240

In [None]:
# Selection of the embedding mode: word embeddings or sentence embeddings.
# "WE" - word embeddings (looked up in `wv_embeddings`),
# "ST" - SentenceTransformer (encoded with `model`).

MODE = "WE"

# The sentence-transformer model is only instantiated when it is actually needed.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2') if MODE == "ST" else None

In [55]:
# Clickstream description corpus embeddings:
# Builds two parallel containers: `clck_corpus_list` (the words that received an
# embedding) and `clck_corpus_embs` (their vectors, stacked into one array).

if MODE not in ("WE", "ST"):
    # Fail fast: with an unknown mode no embedding would ever be assigned, and the
    # previous bare `except` hid that by silently producing empty/stale results.
    raise ValueError(f'Unknown MODE {MODE!r}; expected "WE" or "ST"')

clck_corpus_list = []
clck_corpus_embs = []
for w in clck_corpus:
    if MODE == "WE":
        try:
            emb = wv_embeddings[w]
        except KeyError:
            # The word is not in the word-embedding vocabulary: skip it.
            continue
    else:
        emb = model.encode(w)
    clck_corpus_embs.append(emb)
    clck_corpus_list.append(w)

clck_corpus_embs = np.array(clck_corpus_embs)

In [62]:
# Example: initial description.
# Takes the description of the row labelled 123 as a sample for the cells below.

mcc_descriptipn = mcc_codes.loc[123, "Description"]
mcc_descriptipn

'travel agencies excursion organizers mainly provide tourist information booking services outlets act agents behalf travelers booking air tickets purchase tickets land sea transport including air flights bus tours sea cruises car rental rail transportation accommodation also includes tour operators organize collect sales travel agent directly buyer traveler book tour packages hotel concierge checkout'

In [63]:
# Example: the single word of the sample description that is closest to the
# clickstream corpus (last argument 1 = number of words requested).

top_k = get_top_k_words(
    mcc_descriptipn,
    clck_corpus_embs,
    wv_embeddings,
    stopwords,
    1,
)

" ".join(top_k)

'travelers'

In [65]:
# Reduce MCC description to top-K words closest to clickstream corpus:

K = 3  # top-K words 
# NOTE(review): the second argument is the single example description string
# selected earlier (row 123), not the whole table or a column name — confirm this
# is what filter_mcc_descriptipn expects.
new_mcc = filter_mcc_descriptipn(mcc_codes, mcc_descriptipn, clck_corpus_embs, wv_embeddings, stopwords, K)

new_mcc.head()

100%|██████████| 385/385 [00:00<00:00, 899.84it/s]


Unnamed: 0,MCC,Description
0,742,dogs
1,763,cultivation
2,780,sowing
3,1520,reconstruction
4,1711,sprinklers


In [66]:
# Save the reduced MCC descriptions. index=False matches the other CSV exports in
# this notebook and avoids a spurious "Unnamed: 0" column when the file is re-read.
new_mcc.to_csv("./data/filtered_mcc_description.csv", index=False)

In [68]:
# Create MCC embeddings dict:
# Built from the reduced descriptions in `new_mcc`; MODE and `model` select
# between word embeddings and the sentence-transformer model.

mcc_embs = create_mcc_embeddings_dict(new_mcc, wv_embeddings, MODE, model)

100%|██████████| 385/385 [00:00<00:00, 28616.11it/s]


In [70]:
# Create clickstream embeddings dict:
# Built from the processed clickstream descriptions; MODE and `model` select
# between word embeddings and the sentence-transformer model.

clc_embs = create_clck_embeddings_dict(clickstream_categories, wv_embeddings, MODE, model)


100%|██████████| 402/402 [00:00<00:00, 28073.30it/s]


## Mapping from category IDs to indices:

In [72]:
# Source tables used to build the code -> index mappings.
# NOTE(review): these are absolute, machine-specific paths, unlike the relative
# ./data/ paths used earlier in this notebook — consider making them relative.
MCC_PATH = "/home/glebk/VSProjects/projects/Matching/data/mcc_codes.csv"
CLCK_PATH = "/home/glebk/VSProjects/projects/Matching/data/click_categories.csv"

In [73]:
# Read both code tables. Note that `mcc_codes` is rebound here and no longer
# refers to the normalized description table loaded earlier.
mcc_codes = pd.read_csv(MCC_PATH)
clck_codes = pd.read_csv(CLCK_PATH)

In [74]:
# Sorted unique MCC codes; each code is mapped to its position in this array.
# (`mcc_codes` is rebound from a DataFrame to an array here, so the read cell
# has to be run again before this cell can be re-executed.)
mcc_codes = np.sort(mcc_codes['MCC'].unique())
idxs = list(range(len(mcc_codes)))

mcc_code_to_idx = dict(zip(mcc_codes, idxs))

# Persist the mapping for the submission.
with open("./submission/mcc_code_to_idx.pickle", "wb") as out_file:
    pickle.dump(mcc_code_to_idx, out_file, protocol=pickle.DEFAULT_PROTOCOL)


In [75]:
# Sorted unique clickstream category ids; each id is mapped to its position in this array.
# (`clck_codes` is rebound from a DataFrame to an array here, so the read cell
# has to be run again before this cell can be re-executed.)
clck_codes = np.sort(clck_codes['cat_id'].unique())
idxs = list(range(len(clck_codes)))

clck_code_to_idx = dict(zip(clck_codes, idxs))

# Persist the mapping for the submission.
with open("./submission/cat_code_to_idx.pickle", "wb") as out_file:
    pickle.dump(clck_code_to_idx, out_file, protocol=pickle.DEFAULT_PROTOCOL)


## Description embeddings [num_cat_idxs, emb_dim]

In [76]:
# MCC embeddings:
# Load the {MCC code: embedding} dict and stack the embeddings into one array,
# one row per code, following the iteration order of `mcc_code_to_idx`.

with open("/home/glebk/VSProjects/projects/Matching/embeddings/new_mcc_emb_en_filtered.pickle", "rb") as emb_file:
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    mcc_emb = pickle.load(emb_file)

mcc_seq_emb = np.array([mcc_emb[mcc_code] for mcc_code in mcc_code_to_idx])
mcc_seq_emb.shape

(385, 300)

In [77]:
# Persist the stacked MCC embedding array for the submission.
with open("./submission/mcc_code_emb_seq.pickle", "wb") as out_file:
    pickle.dump(mcc_seq_emb, out_file, protocol=pickle.DEFAULT_PROTOCOL)

In [78]:
# Clickstream embeddings:
# Load the {category id: embedding} dict and stack the embeddings into one array,
# one row per category, following the iteration order of `clck_code_to_idx`.

with open("/home/glebk/VSProjects/projects/Matching/embeddings/new_clck_cat_emb_en_filtered.pickle", "rb") as emb_file:
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    clc_emb = pickle.load(emb_file)

clc_emb_seq = np.array([clc_emb[cat_id] for cat_id in clck_code_to_idx])
clc_emb_seq.shape

(402, 300)

In [79]:
# Persist the stacked clickstream embedding array for the submission.
with open("./submission/clc_code_emb_seq.pickle", "wb") as out_file:
    pickle.dump(clc_emb_seq, out_file, protocol=pickle.DEFAULT_PROTOCOL)