## Mercado Libre Tech Challenge Part 3: Products Similarity

### Objective: find similarity between product titles.

### Imports & Utils

In [59]:
import pandas as pd
import numpy as np
from IPython.display import display
import umap
import umap.plot
from sentence_transformers import SentenceTransformer
import faiss



def clean_dataset(
    data: pd.DataFrame,
    check_only: bool = True,
    dup_cols: "list | None" = None,
    nan_cols: "list | None" = None,
) -> pd.DataFrame:
    """Report on a dataframe and, optionally, drop its NaN and duplicated rows.

    The function always prints ``data.info()`` and, for every column in
    ``dup_cols``, the number of duplicated values found in that column.
    When ``check_only`` is False it then removes rows in place and prints
    the dataframe info again.

    Args:
        data: dataframe to inspect. It is modified IN PLACE when
            ``check_only`` is False.
        check_only: if True (default) only the report is printed and the
            dataframe is left untouched.
        dup_cols: columns used both for the per-column duplicate report and
            as the ``subset`` for ``drop_duplicates``. ``None`` or an empty
            list means "no duplicate check / no duplicate removal".
        nan_cols: columns used as the ``subset`` for ``dropna``. ``None`` or
            an empty list means "no NaN removal".

    Returns:
        The same dataframe object that was passed in.
    """
    # ``None`` stands in for the former mutable ``[]`` defaults, which would
    # have been shared between calls.
    dup_cols = [] if dup_cols is None else dup_cols
    nan_cols = [] if nan_cols is None else nan_cols

    # Check raw dataset info
    print("Raw dataset info:\n")
    # ``DataFrame.info()`` writes its report itself and returns None, so it is
    # called directly instead of being interpolated (which printed "None").
    data.info()
    print()

    # Check duplicates, one column at a time
    for col in dup_cols:
        print(
            f"Number of duplicates in column {col}: {data.duplicated(subset=[col]).sum()}\n"
        )

    if not check_only:
        # Drop NaNs (skipped when no column was requested, rather than
        # handing pandas an empty ``subset``)
        if len(nan_cols) > 0:
            data.dropna(subset=nan_cols, inplace=True)
        # Drop duplicates (same guard as above)
        if len(dup_cols) > 0:
            data.drop_duplicates(subset=dup_cols, inplace=True)

        # Check dataset info after cleaning
        print("Cleaned dataset info:\n")
        data.info()
        print()

    return data


def transform_data_lowecase(
    data: pd.DataFrame, lower_cols: "list | None" = None
) -> pd.DataFrame:
    """Lower-case every value of the requested columns.

    Args:
        data: dataframe to transform. The listed columns are overwritten
            IN PLACE.
        lower_cols: labels of the columns to lower-case. Every value in
            those columns must provide a ``.lower()`` method (i.e. be a
            string). ``None`` or an empty list leaves the dataframe
            untouched and prints a notice.

    Returns:
        The same dataframe object that was passed in.
    """
    # ``None`` stands in for the former mutable ``[]`` default.
    if lower_cols is None or len(lower_cols) == 0:
        print("No column to lower case was specifed. The dataframe won't be modified")
        return data

    for col in lower_cols:
        # Write back under the original label. Using ``f"{col}"`` here would
        # create a new, stringified column for non-string labels (e.g. ``0``).
        data[col] = data[col].apply(lambda x: x.lower())
    return data


def keep_alpha(text: str) -> str:
    """Keep only the purely alphabetic tokens of ``text``.

    The text is split on single spaces. A token survives when it is made of
    alphabetic characters only, or when it contains a colon (``:``); every
    other token (numbers, mixed alphanumerics, punctuation, empty tokens
    produced by repeated spaces) is discarded.

    Args:
        text: space-separated text.

    Returns:
        The surviving tokens joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split(" "):
        if ":" in token or token.isalpha():
            kept_tokens.append(token)
    return " ".join(kept_tokens)


def clean_text(text: str, rm_words: list, min_word_len: int = 0) -> str:
    """Blank out unwanted substrings and drop short tokens.

    Every entry of ``rm_words`` is replaced by a single space wherever it
    occurs in ``text`` (plain substring replacement, so it also matches
    inside longer words). The result is split on single spaces and only
    tokens strictly longer than ``min_word_len`` are kept.

    Args:
        text: text to clean.
        rm_words: substrings to blank out; may be empty.
        min_word_len: tokens must be longer than this to be kept. With the
            default of 0 only empty tokens are removed.

    Returns:
        The kept tokens joined with single spaces.
    """
    stripped = text
    if len(rm_words) != 0:
        for unwanted in rm_words:
            stripped = stripped.replace(unwanted, " ")

    survivors = []
    for token in stripped.split(" "):
        if len(token) > min_word_len:
            survivors.append(token)

    return " ".join(survivors)


def dim_reduction(
    embeddings_matrix,
    umap_config: dict,
    random_init: int = 124,
):
    """Fit a UMAP model on the embeddings to reduce their dimensionality.

    Args:
        embeddings_matrix (numpy array): matrix with embeddings as rows
        umap_config (dict): UMAP options; must contain the keys
            "n_neighbors", "n_components", "metric" and "min_dist"
        random_init (int, optional): value passed to UMAP as
            ``random_state``. Defaults to 124.

    Returns:
        The object returned by ``umap.UMAP(...).fit(embeddings_matrix)``
        (the UMAP object).
    """
    # Options forwarded verbatim from the config to the UMAP constructor.
    config_keys = ("n_neighbors", "n_components", "metric", "min_dist")
    reducer = umap.UMAP(
        **{key: umap_config[key] for key in config_keys},
        random_state=random_init,
    )
    return reducer.fit(embeddings_matrix)


### Load language model
Different language models could be used here; in particular, multilingual models might be desirable, considering that the item titles are in Portuguese. Herein I tested two: `paraphrase-MiniLM-L6-v2` and `paraphrase-multilingual-mpnet-base-v2`. The latter is indeed a multilingual model, but it is more expensive in terms of embedding length and of the computational effort needed to compute the embeddings. Alternatively, `paraphrase-MiniLM-L6-v2` is smaller and faster to compute and, based on a preliminary evaluation (rapid visual inspection) of the results, it gave better results than `paraphrase-multilingual-mpnet-base-v2` for the purpose of the present task. For that reason, herein we'll use `paraphrase-MiniLM-L6-v2`.

In [60]:
# Sentence-embedding model used below to encode the item titles.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
# Multilingual alternative that was also evaluated; uncomment to switch models.
# model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

### Load dataset

In [61]:
# Folder (relative path) and file names of the two input CSV files.
data_path = "../data"
data_file_name = "items_titles.csv"
query_file_name = "items_titles_test.csv"
# `data` holds the titles from items_titles.csv, `data_q` the query titles
# from items_titles_test.csv.
data = pd.read_csv(f"{data_path}/{data_file_name}")
data_q = pd.read_csv(f"{data_path}/{query_file_name}")

# Name of the column that holds the item titles, i.e. the text that is
# cleaned and embedded in the cells below.
TEXT_EMBED_COL = "ITE_ITEM_TITLE"

### Dataset clean up

In [62]:
# Clean the training titles: drop NaN / duplicated titles, lower-case them
# and keep only the alphabetic tokens.
data = clean_dataset(
    data=data,
    check_only=False,
    dup_cols=[TEXT_EMBED_COL],
    nan_cols=[TEXT_EMBED_COL],
)
data = transform_data_lowecase(data, lower_cols=[TEXT_EMBED_COL])
data[TEXT_EMBED_COL] = data[TEXT_EMBED_COL].transform(keep_alpha)

# Show a reproducible random sample of the cleaned titles.
print("Dataset sample:")
display(data.sample(n=10, random_state=123))

Raw dataset info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  30000 non-null  object
dtypes: object(1)
memory usage: 234.5+ KB
None

Number of duplicates in column ITE_ITEM_TITLE: 0

Cleaned dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 29999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  30000 non-null  object
dtypes: object(1)
memory usage: 468.8+ KB
None

Dataset sample:


Unnamed: 0,ITE_ITEM_TITLE
25665,kit de tênis
16464,tênis infantil feminino krisle
22386,tenis dg sorrento preto black unissex dourado
10149,bicicleta com suspensão aro full velocidades
8729,tênis drop gel equation feminino preto original
25295,tênis olympikus index masculino academia fitness
8876,sapatênis linha moderno conforto doctor flex
12348,tenis infantil feminino original super leve ca...
3858,tênis feminino academia esporte caminhada reló...
19209,kit pares sapatenis casual barato corrente e ó...


In [53]:
# Clean the query titles with the same steps used for the training titles:
# drop NaN / duplicated titles, lower-case them and keep only the
# alphabetic tokens.
data_q = clean_dataset(
    data=data_q,
    check_only=False,
    dup_cols=[TEXT_EMBED_COL],
    nan_cols=[TEXT_EMBED_COL],
)
data_q = transform_data_lowecase(data_q, lower_cols=[TEXT_EMBED_COL])
data_q[TEXT_EMBED_COL] = data_q[TEXT_EMBED_COL].transform(keep_alpha)

# Show a reproducible random sample of the cleaned query titles.
print("Query sample:")
display(data_q.sample(n=10, random_state=123))

Raw dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  10000 non-null  object
dtypes: object(1)
memory usage: 156.2+ KB
None

Number of duplicates in column ITE_ITEM_TITLE: 146

Cleaned dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9854 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  9854 non-null   object
dtypes: object(1)
memory usage: 154.0+ KB
None

Query sample:


Unnamed: 0,ITE_ITEM_TITLE
7,under armour hovr phantom conexão bluetooth tê...
7783,bicicleta bike infantil aro nathor candy envio
2157,tênis masculino olympikus amortecedor corrida ...
2362,tênis casual feminino elástico kolosh
7818,tênis de treino masculino under armour hovr apex
5453,sapatênis masculino zíper hankook macio confor...
2067,tênis usthemp chunky smooth bulldog inglês car...
6002,tênis feminino diversas cores
369,tênis ramarim feminino chunky branco
6611,sapatênis pegada casual couro


### Generate numerical vector representations from text (embeddings)

In [63]:
# Only the query titles are embedded; the encoding of the larger `data`
# dataframe is left disabled.
# embeddings = model.encode(data[TEXT_EMBED_COL].to_list())
# One embedding per row of `data_q`, in row order.
embeddings_q = model.encode(data_q[TEXT_EMBED_COL].to_list())


### Indexing

The present work requires searching for similar items (embeddings) within the dataset of 10K item titles, not within the 30K one; thus an exact brute-force indexing method (`IndexFlatL2`) is the best option. If dealing with the larger 30K dataset, or with even bigger ones, a brute-force approach would become infeasible and an alternative indexing method (e.g. HNSW) should be used instead.

In [64]:
# The embedding length is the number of columns of the embeddings matrix.
embedding_dim = embeddings_q.shape[1]
print(f"Embeddings size: {embedding_dim}")

# Build index & add embeddings
# IndexFlatL2 is the exact (brute-force) L2 index; vectors are stored in the
# order they are added, so index ids match the row positions of embeddings_q.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_q)

# Number of vectors currently stored in the index.
print(f"Index size: {index.ntotal}")


Embeddings size: 384
Index size: 10000


In [65]:
# Search top 10 most similar embeddings (neighbours) to each embedding in the dataset
# The queries are the very vectors stored in the index, so the first neighbour
# of each item is expected to be the item itself, at a distance of ~0.
num_neigbours = 10
# `distance` and `neighbors` each hold one row per query and one column per
# neighbour; `neighbors` contains index ids, `distance` the matching distances.
distance, neighbors = index.search(embeddings_q, num_neigbours)

In [66]:
# Check out some examples
# Neighbour ids and distances for the first five query items.
print(f"Neighbour ids:\n {neighbors[:5]}\n")
print(f"Neighbour distance:\n {distance[:5]}\n")
# Titles of the neighbours of the first item; ids are used as row positions
# (iloc), not as index labels.
display(data_q.iloc[neighbors[0]])


Neighbour ids:
 [[   0 1767 6155  309 7880 7922  792 1559 5382 1847]
 [   1 4385 9523 1779 2346 5962 2925 5018 2972 3778]
 [   2 8528 4539 6670 6233 9759 4331 3107 7533 3250]
 [   3  115 7037 9997 8953  214 7936 2715 2133 5653]
 [   4 4396 3796 9477 5522 2853 8168 4515 4526  761]]

Neighbour distance:
 [[0.0000000e+00 1.2596882e+01 1.3523090e+01 1.3642517e+01 1.5268101e+01
  1.5412693e+01 1.6024605e+01 1.6658463e+01 1.6685265e+01 1.7257355e+01]
 [1.5258789e-05 1.4231960e+01 1.4446102e+01 1.5067076e+01 1.5171703e+01
  1.5330662e+01 1.5891640e+01 1.6217333e+01 1.6476946e+01 1.6511831e+01]
 [1.1444092e-05 1.1374603e+01 1.1959545e+01 1.3136986e+01 1.5434505e+01
  1.6515198e+01 1.6974964e+01 2.0170223e+01 2.0693367e+01 2.0940525e+01]
 [0.0000000e+00 6.8527985e+00 1.2840492e+01 1.5471779e+01 1.5780037e+01
  1.6165329e+01 1.6622261e+01 1.6863075e+01 1.7503738e+01 1.8121620e+01]
 [7.6293945e-06 1.7027935e+01 1.8994801e+01 2.3655842e+01 2.3842819e+01
  2.4805038e+01 2.4938644e+01 2.5049374e+01 

Unnamed: 0,ITE_ITEM_TITLE
0,Tênis Olympikus Esporte Valente - Masculino Kids
1767,Tênis Infantil Masculino Olympikus 943 Supremo...
6155,Tênis Esportivo Infantil Masc Olympikus Maneir...
309,Tênis Masculino Olympikus Enjoy Kids/726 - Inf...
7880,Tênis Infantil Olympikus Maneiro Kids 942 Mari...
7922,Tênis Olympikus Masculino Valente - Kids Infan...
792,Tênis Infantil Masculino Olympikus Valente Azu...
1559,Tênis Esportivo Infantil Olympikus Azul Marinh...
5382,Tênis Masculino Esportivo Confortável Olympik...
1847,Tênis Masculino Esportivo Confortavél Olympiku...


### Generate output

In [67]:
# Build the long-format similarity table (one row per item/neighbour pair) in
# a single vectorised step instead of creating and concatenating one small
# DataFrame per item.
items = data_q[TEXT_EMBED_COL].to_list()
titles = data_q[TEXT_EMBED_COL].to_numpy()
neighbours_per_item = neighbors.shape[1]

similarity_df = pd.DataFrame(
    {
        # Each query title repeated once per neighbour.
        "ITE_ITEM_TITLE": np.repeat(titles, neighbours_per_item),
        # Neighbour ids are row positions in data_q (same lookup as .iloc).
        "SIMILAR_ITEMS": titles[neighbors.ravel()],
        # Distances returned by the index search; smaller means more similar.
        "SCORES": distance.ravel(),
    }
)
# The index is not written, so the CSV content matches the per-item version;
# similarity_df itself now carries a plain RangeIndex.
similarity_df.to_csv("output.csv", index=False)
    

### Final Remarks

- Item title embeddings computed with a language model (`paraphrase-MiniLM-L6-v2`) and indexed with faiss (`IndexFlatL2`) were used to generate the list of most similar items for each of the items in the dataset `items_titles_test.csv`.
- Although the obtained results look good from an initial visual inspection, a more thorough analysis of the results, possibly manually annotating some examples and measuring the recall, would be useful not only to confirm the quality of the obtained results, but also to compare with other language models and indexing methods.