## Mercado Libre Tech Challenge Part 3: Product Similarity

### Objective: find similarity between product titles.

### Imports & Utils

In [44]:
import pandas as pd
import numpy as np
from IPython.display import display
import umap
import umap.plot
from sentence_transformers import SentenceTransformer
import faiss



def clean_dataset(
    data: pd.DataFrame,
    check_only: bool = True,
    dup_cols: list = [],
    nan_cols: list = [],
) -> pd.DataFrame:
    """Report on a dataframe and, optionally, drop NaN and duplicated rows.

    The raw frame summary and the per-column duplicate counts are always
    printed. When ``check_only`` is False, rows with NaN in ``nan_cols`` are
    dropped first, then rows duplicated on ``dup_cols``, and a second summary
    is printed.

    Args:
        data (pd.DataFrame): frame to inspect / clean. It is never modified.
        check_only (bool, optional): only print the report. Defaults to True.
        dup_cols (list, optional): columns checked for duplicates. An empty
            list (or None) skips the duplicate report and the de-duplication.
        nan_cols (list, optional): columns checked for NaN. An empty list
            (or None) skips the NaN drop.

    Returns:
        pd.DataFrame: ``data`` itself when ``check_only`` is True, otherwise a
        new, independent frame with the offending rows removed. The original
        index labels are kept (the index is not reset).
    """
    # Work on private copies of the column lists so the shared default lists
    # can never be mutated, and so None is accepted as "no columns".
    dup_cols = list(dup_cols) if dup_cols is not None else []
    nan_cols = list(nan_cols) if nan_cols is not None else []

    # Check raw dataset info.
    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    print("Raw dataset info:\n")
    data.info()
    print()

    # Check duplicates
    for col in dup_cols:
        print(
            f"Number of duplicates in column {col}: {data.duplicated(subset=[col]).sum()}\n"
        )

    if check_only:
        return data

    cleaned = data
    # Drop NaNs (skipped when no column was requested)
    if nan_cols:
        cleaned = cleaned.dropna(subset=nan_cols)
    # Drop duplicates (skipped when no column was requested, instead of
    # handing pandas an empty subset)
    if dup_cols:
        cleaned = cleaned.drop_duplicates(subset=dup_cols)
    # Detach the result from `data` so later column assignments on it are
    # plain writes on an independent frame.
    cleaned = cleaned.copy()

    # Check dataset info after cleaning
    print("Cleaned dataset info:\n")
    cleaned.info()
    print()

    return cleaned


def transform_data_lowecase(data: pd.DataFrame, lower_cols: list = []) -> pd.DataFrame:
    """Lower-case the string values of the given columns.

    Args:
        data (pd.DataFrame): input frame. It is never modified.
        lower_cols (list, optional): columns to lower-case. When empty (or
            None) a notice is printed and ``data`` is returned as is.

    Returns:
        pd.DataFrame: ``data`` itself when no column was given, otherwise a
        copy of ``data`` whose ``lower_cols`` hold lower-cased strings.
        Non-string cells (e.g. NaN) are passed through unchanged instead of
        raising ``AttributeError``.
    """
    lower_cols = list(lower_cols) if lower_cols is not None else []
    if not lower_cols:
        print("No column to lower case was specifed. The dataframe won't be modified")
        return data

    # Write into a copy so the caller's frame keeps its original casing and
    # re-running the calling cell stays idempotent.
    lowered = data.copy()
    for col in lower_cols:
        lowered[col] = lowered[col].map(
            lambda value: value.lower() if isinstance(value, str) else value
        )
    return lowered


def keep_alpha(text: str) -> str:
    """Filter a title down to its alphabetic (or colon-bearing) tokens.

    The text is split on single spaces. A token survives when it consists
    only of alphabetic characters or when it contains a ":"; everything else
    (numbers, hyphenated words, empty tokens from repeated spaces, ...) is
    discarded. Survivors are re-joined with single spaces.

    Args:
        text (str): text to filter.

    Returns:
        str: the surviving tokens joined by a single space.
    """
    kept_tokens = []
    for token in text.split(" "):
        if ":" in token or token.isalpha():
            kept_tokens.append(token)
    return " ".join(kept_tokens)


def clean_text(text: str, rm_words: list, min_word_len: int = 0) -> str:
    """Blank out unwanted fragments and drop short tokens.

    Every occurrence of each entry of ``rm_words`` is replaced by a space.
    This is a plain substring replacement, so an entry also matches inside
    longer words. The text is then split on single spaces and only tokens
    strictly longer than ``min_word_len`` are kept.

    Args:
        text (str): text to clean.
        rm_words (list): fragments to remove. May be empty or None. Empty
            strings inside the list are ignored.
        min_word_len (int, optional): tokens must be longer than this to be
            kept. Defaults to 0, which only discards empty tokens.

    Returns:
        str: the kept tokens joined by a single space.
    """
    for fragment in rm_words or []:
        # str.replace("", " ") would insert a space between every character
        # and shred the text, so empty fragments are skipped.
        if fragment:
            text = text.replace(fragment, " ")

    return " ".join(token for token in text.split(" ") if len(token) > min_word_len)


def dim_reduction(
    embeddings_matrix,
    umap_config: dict,
    random_init: int = 124,
):
    """Fit UMAP on an embeddings matrix to reduce its dimensionality.

    Args:
        embeddings_matrix (numpy array): matrix with embeddings as rows
        umap_config (dict): flat dict of UMAP options; the keys
            "n_neighbors", "n_components", "metric" and "min_dist" are
            required (a missing key raises KeyError).
        random_init (int, optional): value passed to UMAP as random_state.
            Defaults to 124.

    Returns:
        The object returned by ``umap.UMAP(...).fit(embeddings_matrix)``
        (a UMAP object).
    """
    # Pick out exactly the options UMAP is configured with, in a fixed order.
    umap_options = {
        option: umap_config[option]
        for option in ("n_neighbors", "n_components", "metric", "min_dist")
    }
    reducer = umap.UMAP(random_state=random_init, **umap_options)
    return reducer.fit(embeddings_matrix)


In [51]:
# normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]

### Load language model & set some configs

In [14]:
# Sentence-embedding model used below to encode the product titles.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# UMAP options grouped by use case. `dim_reduction` reads "n_neighbors",
# "n_components", "metric" and "min_dist" directly from the dict it is given,
# so pass `umap_config["clustering"]` to it rather than `umap_config` itself.
umap_config = {
    "clustering": {
        "n_neighbors": 20,
        "n_components": 100,
        "min_dist": 0.05,
        "metric": "cosine",
    },
}

### Load dataset

In [40]:
# Column holding the product titles that get cleaned and embedded.
TEXT_EMBED_COL = "ITE_ITEM_TITLE"

# Location of the two CSV files: the reference titles and the query titles.
data_path = "../data"
data_file_name = "items_titles.csv"
query_file_name = "items_titles_test.csv"

# Read both files the same way: reference titles first, then query titles.
data, data_q = (
    pd.read_csv(f"{data_path}/{csv_name}")
    for csv_name in (data_file_name, query_file_name)
)

### Dataset clean up

In [46]:
# Processing training dataset:
# 1) drop rows with a missing or duplicated title, 2) lower-case the titles,
# 3) keep only alphabetic (or colon-bearing) tokens.
data = transform_data_lowecase(
    clean_dataset(
        data=data,
        check_only=False,
        dup_cols=[TEXT_EMBED_COL],
        nan_cols=[TEXT_EMBED_COL],
    ),
    lower_cols=[TEXT_EMBED_COL],
)
data[TEXT_EMBED_COL] = data[TEXT_EMBED_COL].apply(keep_alpha)

print("Dataset sample:")
display(data.sample(10, random_state=123))

Raw dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28958 entries, 0 to 29999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  28958 non-null  object
dtypes: object(1)
memory usage: 452.5+ KB
None

Number of duplicates in column ITE_ITEM_TITLE: 0

Cleaned dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28958 entries, 0 to 29999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  28958 non-null  object
dtypes: object(1)
memory usage: 452.5+ KB
None

Dataset sample:


Unnamed: 0,ITE_ITEM_TITLE
1489,tenis cano alto arpoador soft osklen
12256,sapatilha aquatica multi esportiva
15843,tênis olympikus asas feminino azul
6887,tênis original adidas originals infantil cinza
17916,tênis fila top spin rosa
16169,tênis oakley masculino flak cza
8473,nike lebron soldier xll original pouquíssimo uso
28911,tênis usthemp one vegano casual estampa nico
4317,roda de rolo traseiro do rolo da guia da vermelho
2218,tênis feminino dumond lilás estampa dmd sola alta


In [48]:
# Processing query dataset (same steps as for the training titles):
# 1) drop rows with a missing or duplicated title, 2) lower-case the titles,
# 3) keep only alphabetic (or colon-bearing) tokens.
data_q = transform_data_lowecase(
    clean_dataset(
        data=data_q,
        check_only=False,
        dup_cols=[TEXT_EMBED_COL],
        nan_cols=[TEXT_EMBED_COL],
    ),
    lower_cols=[TEXT_EMBED_COL],
)
data_q[TEXT_EMBED_COL] = data_q[TEXT_EMBED_COL].apply(keep_alpha)

print("Query sample:")
display(data_q.sample(10, random_state=123))

Raw dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9854 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  9854 non-null   object
dtypes: object(1)
memory usage: 154.0+ KB
None

Number of duplicates in column ITE_ITEM_TITLE: 0

Cleaned dataset info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9854 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  9854 non-null   object
dtypes: object(1)
memory usage: 154.0+ KB
None

Query sample:


Unnamed: 0,ITE_ITEM_TITLE
7,under armour hovr phantom conexão bluetooth tê...
7783,bicicleta bike infantil aro nathor candy envio
2157,tênis masculino olympikus amortecedor corrida ...
2362,tênis casual feminino elástico kolosh
7818,tênis de treino masculino under armour hovr apex
5453,sapatênis masculino zíper hankook macio confor...
2067,tênis usthemp chunky smooth bulldog inglês car...
6002,tênis feminino diversas cores
369,tênis ramarim feminino chunky branco
6611,sapatênis pegada casual couro


### Generate numerical vector representations from text (embeddings)

In [53]:
# Encode the cleaned titles of both frames with the same model:
# `embeddings` for the training titles, `embeddings_q` for the query titles.
embeddings, embeddings_q = (
    model.encode(frame[TEXT_EMBED_COL].tolist()) for frame in (data, data_q)
)


### Indexing

In [61]:
# Vector size produced by the model (second axis of the embeddings matrix).
embedding_dim = embeddings.shape[1]
print(f"Embeddings size: {embedding_dim}")

# Build index & add embeddings
# Note: the vectors stored in the index are the *query* embeddings
# (`embeddings_q`), so the ids returned by a search are row positions in
# `data_q`. The training `embeddings` are only used here to read the size.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_q)

# `ntotal` is the number of vectors currently stored in the index.
print(f"Index size: {index.ntotal}")


Embeddings size: 384
Index size: 9854


In [62]:
# Search the 10 nearest neighbours of every query embedding.
# The index holds these same query embeddings, so each item is expected to
# come back as its own first neighbour.
num_neigbours = 10
distance, neighbors = index.search(embeddings_q, num_neigbours)


In [63]:
# Check out some examples: neighbour ids and distances of the first 5 items.
# Column 0 is the item itself (distance ~0) because the searched vectors are
# the ones stored in the index.
# NOTE(review): IndexFlatL2 is documented to return squared L2 distances;
# confirm before interpreting the magnitudes.
print(f"Neighbour ids:\n {neighbors[:5]}\n")
print(f"Neighbour distance:\n {distance[:5]}\n")


Neighbour ids:
 [[   0 6088 1765 1557 7825  792 1845 5326 1077 3203]
 [   1 2341 9390 4346 8387 4179 5895 5334 1777 8630]
 [   2 8417 6164 1160   43 4498 2078 6596 8077 7085]
 [   3  115 6958  214 9851 8836 2311 7838 2706 5031]
 [   4 4357 3771 2841 5461 4474 8433  846  761 5023]]

Neighbour distance:
 [[0.0000000e+00 1.3178524e+01 1.5279091e+01 1.6089115e+01 1.7022388e+01
  1.7212513e+01 1.7229050e+01 1.7763664e+01 1.7772751e+01 1.8110104e+01]
 [1.1444092e-05 1.3208153e+01 1.3900391e+01 1.4242474e+01 1.5961403e+01
  1.6061161e+01 1.6113441e+01 1.6219078e+01 1.6523247e+01 1.6770725e+01]
 [0.0000000e+00 6.2273254e+00 1.4316368e+01 1.8986366e+01 2.0294846e+01
  2.1245445e+01 2.3265221e+01 2.3912811e+01 2.4019798e+01 2.5278519e+01]
 [0.0000000e+00 6.8527985e+00 1.2840492e+01 1.5288239e+01 1.5471779e+01
  1.5780037e+01 1.6059883e+01 1.6622261e+01 1.7127609e+01 1.7199669e+01]
 [0.0000000e+00 1.6936935e+01 1.9969944e+01 2.3834290e+01 2.6127907e+01
  2.7044540e+01 2.7112103e+01 2.7167721e+01 

In [74]:
# Titles of the 10 neighbours of the first query item. `neighbors` holds row
# positions in the index, hence the positional `.iloc`; the index labels shown
# in the output can differ from those positions because the frame's index was
# not reset after rows were dropped.
data_q.iloc[neighbors[0]]

Unnamed: 0,ITE_ITEM_TITLE
0,tênis olympikus esporte valente masculino kids
6155,tênis esportivo infantil masc olympikus maneir...
1767,tênis infantil masculino olympikus supremo kids
1559,tênis esportivo infantil olympikus azul marinh...
7922,tênis olympikus masculino valente kids infanti...
792,tênis infantil masculino olympikus valente azul
1847,tênis masculino esportivo confortavél olympikus
5382,tênis masculino esportivo confortável olympiku...
1078,tenis esportivo masculino olympikus academia c...
3218,tênis masculino olympikus treino corrida confo...


In [72]:
# Sanity check: list every query title mentioning the brand of the first item,
# to compare against the neighbours returned by the index.
# Uses the shared TEXT_EMBED_COL constant like the rest of the notebook, and a
# literal (non-regex) match since the brand is a plain substring.
data_q[data_q[TEXT_EMBED_COL].str.contains("olympikus", regex=False)]

Unnamed: 0,ITE_ITEM_TITLE
0,tênis olympikus esporte valente masculino kids
50,tênis olympikus sonoro vermelho
108,tênis olympikus feminino preto
129,tenis inf olympikus enjoy
155,tênis masculino esportivo olympikus oferta bla...
...,...
9804,tênis esportivo masculino olympikus academia l...
9824,tênis olympikus original feminino
9827,tenis olympikus holograma
9878,tênis esportivo training feminino olympikus ea...


In [69]:
# Display the cleaned query titles (pandas truncates the rendering of long frames).
data_q

Unnamed: 0,ITE_ITEM_TITLE
0,tênis olympikus esporte valente masculino kids
1,bicicleta barra forte samy marchas cubo rolamento
2,tênis usthemp labrador
3,tênis casual feminino moleca tecido tie dye
4,tênis star baby sapatinho conforto brinde
...,...
9995,chuteira futsal oxn velox infantil
9996,sapatenis casual masculino estiloso conforto q...
9997,tênis feminino infantil molekinha tie dye
9998,tênis feminino leve barato ganhe colchonete tr...
