In [1]:
import os
import re
import duckdb
import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

In [112]:
# Read the tab-separated fuzzy-matching results and preview the first rows.
result_fuzzy_raw = pd.read_csv(
    'datasets/processed/result_fuzzy.tsv',
    sep='\t',
)
result_fuzzy_raw.head()

Unnamed: 0,product_id,product_name,result_sku_lev,result_sku_fuzzy,result_sku_id_lev,result_sku_id_fuzzy,possible_brand,is_name_only_alphanum,is_name_only_alphabet,clean_name_non_formula,...,clean_name,result_clean_sku_lev,result_clean_sku_fuzzy,fuzzy_ratio,lev_dist_lev,lev_dist_fuzzy,is_name_token_present_in_sku_lev,is_name_token_present_in_sku_fuzzy,cnt_common_tokens_lev,cnt_common_tokens_fuzzy
0,0,Pupuk Urea N 46%,Urea Nitrea,Urea Daun Buah,2,3,,False,False,pupuk urea n,...,pupuk urea n 46 %,urea nitrea,urea daun buah,44.0,11,12,True,True,1,1
1,1,Pupuk Amonium Sulfat ZA,Ammonium Sulfate,Ammonium Sulfate,117,117,,True,True,pupuk amonium sulfat za,...,pupuk amonium sulfat za,ammonium sulfate,ammonium sulfate,72.0,10,10,False,False,0,0
2,2,Pupuk Super Fosfat SP-36,Triple Super Phospate (TSP),Triple Super Phospate (TSP),64,64,,False,False,pupuk super fosfat sp,...,pupuk super fosfat sp - 36,triple super phospate tsp,triple super phospate tsp,52.0,14,14,True,True,1,1
3,3,Pupuk NPK Phonska,Fertiphos,NPK PIM 15-15-15,78,22,pihc,True,True,pupuk npk phonska,...,pupuk npk phonska,fertiphos,npk pim 15x15x15,35.0,13,14,False,True,0,1
4,4,Pupuk NPK Formula Khusus,Pak Tani Fertila 8-15-19,NPK Kebomas 15-15-15,101,18,,True,True,pupuk npk formula khusus,...,pupuk npk formula khusus,pak tani fertila 8x15x19,npk kebomas 15x15x15,35.0,17,19,False,True,0,1


In [148]:
# Project the raw fuzzy-matching output down to the columns used by the rest
# of the notebook. `result_fuzzy_raw` in the FROM clause is the pandas
# DataFrame of that name; duckdb picks it up from the Python namespace.
# `lev_dist_lev` is exposed as `lev_dist_int` from here on.
result_fuzzy = duckdb.query(
    '''
    SELECT
        product_id,
        product_name,
        clean_name_alphanum,
        result_sku_lev,
        result_sku_fuzzy,
        result_sku_id_lev,
        result_sku_id_fuzzy,
        fuzzy_ratio,
        lev_dist_lev AS lev_dist_int
    FROM
        result_fuzzy_raw
    '''
).to_df()
result_fuzzy.head()

Unnamed: 0,product_id,product_name,clean_name_alphanum,result_sku_lev,result_sku_fuzzy,result_sku_id_lev,result_sku_id_fuzzy,fuzzy_ratio,lev_dist_int
0,0,Pupuk Urea N 46%,pupuk urea n 46,Urea Nitrea,Urea Daun Buah,2,3,44.0,11
1,1,Pupuk Amonium Sulfat ZA,pupuk amonium sulfat za,Ammonium Sulfate,Ammonium Sulfate,117,117,72.0,10
2,2,Pupuk Super Fosfat SP-36,pupuk super fosfat sp 36,Triple Super Phospate (TSP),Triple Super Phospate (TSP),64,64,52.0,14
3,3,Pupuk NPK Phonska,pupuk npk phonska,Fertiphos,NPK PIM 15-15-15,78,22,35.0,13
4,4,Pupuk NPK Formula Khusus,pupuk npk formula khusus,Pak Tani Fertila 8-15-19,NPK Kebomas 15-15-15,101,18,35.0,17


In [179]:
# Minimum similarity a candidate must reach to be kept as a ratio-based
# Levenshtein match. NOTE(review): the value is the author's cut-off; its
# derivation is not shown in this notebook.
MIN_LEV_SIMILARITY = 0.59

# Load the ratio-based similarity results and rename the columns to the names
# used in this notebook.
# NOTE: despite its name, `lev_distance` holds the source column
# `string_similarity`, so HIGHER values mean a BETTER match.
result_lev = pd.read_csv('datasets/Similarity Result - result_v2_product_id.csv', sep=',')
result_lev = result_lev.rename(columns={
    'B_product_id': 'product_id',
    'B_string_raw': 'product_name',
    'Best_A_strings_raw': 'sku_name',
    'string_similarity': 'lev_distance'
})

# Drop weak candidates and keep only the columns needed downstream.
result_lev = result_lev.loc[
    result_lev['lev_distance'] >= MIN_LEV_SIMILARITY,
    ['product_id', 'product_name', 'sku_name', 'lev_distance']
]

# Keep exactly one row per product_id: the candidate with the highest
# similarity. `sku_name` is a secondary sort key so that ties on the score are
# resolved the same way on every run; ordering by the score alone leaves the
# winner among tied candidates unspecified.
result_lev = duckdb.query(
    '''
    SELECT
        product_id,
        product_name,
        sku_name,
        lev_distance
    FROM
        result_lev
    QUALIFY
        ROW_NUMBER() OVER (
            PARTITION BY
                product_id
            ORDER BY
                lev_distance DESC,
                sku_name ASC
        ) = 1
    ORDER BY
        product_id
    '''
).to_df()

# The later LEFT JOIN on product_id relies on this to avoid duplicating rows.
assert result_lev['product_id'].is_unique, 'result_lev must hold one row per product_id'

result_lev

Unnamed: 0,product_id,product_name,sku_name,lev_distance
0,1,Pupuk Amonium Sulfat ZA,Ammonium Sulfate,0.600000
1,27,FASTAC,Mestac,0.666667
2,52,permipos,Fertiphos,0.666667
3,77,boron,Borat,0.600000
4,83,mutiara grower,Mutiara 16-16-16,0.625000
...,...,...,...,...
2414,43907,NPK Pelangi 16-16-16 @20 kg,Pelangi 16-16-16,0.650000
2415,43912,NPK PELANGI 13-6-27-4 FUSE @50KG,NPK Pelangi 13-6-27-4,0.666667
2416,43915,NPK PELANGI 12.12.17.2 FUSE,Pelangi 12-12-17-2,0.666667
2417,43926,SP 36 PETROGRESS,SP-36 Petro,0.642857


# Final Result Post-processing

In [189]:
# Attach the ratio-based Levenshtein match (if any) to every fuzzy result.
# LEFT JOIN keeps every row of `result_fuzzy`; products with no row in
# `result_lev` get NULL in `product_id_lev`, `result_sku_lev_ratio` and
# `lev_dist_ratio`. `product_id_lev` is later used as the "has a ratio match"
# flag. The `*_lev` columns coming from `result_fuzzy` are renamed with an
# `_int` suffix to tell them apart from the ratio-based ones.
final_res = duckdb.query(
    '''
    SELECT
        f.product_id,
        f.product_name,
        f.clean_name_alphanum,
        f.result_sku_lev AS result_sku_lev_int,
        f.result_sku_fuzzy,
        f.result_sku_id_lev AS result_sku_id_lev_int,
        f.result_sku_id_fuzzy,
        f.fuzzy_ratio,
        f.lev_dist_int,
        l.product_id AS product_id_lev,
        l.sku_name AS result_sku_lev_ratio,
        l.lev_distance AS lev_dist_ratio
    FROM
        result_fuzzy AS f
    LEFT JOIN
        result_lev AS l
    ON
        f.product_id = l.product_id
    ORDER BY
        f.product_id
    '''
).to_df()

final_res

Unnamed: 0,product_id,product_name,clean_name_alphanum,result_sku_lev_int,result_sku_fuzzy,result_sku_id_lev_int,result_sku_id_fuzzy,fuzzy_ratio,lev_dist_int,product_id_lev,result_sku_lev_ratio,lev_dist_ratio
0,0,Pupuk Urea N 46%,pupuk urea n 46,Urea Nitrea,Urea Daun Buah,2,3,44.0,11,,,
1,1,Pupuk Amonium Sulfat ZA,pupuk amonium sulfat za,Ammonium Sulfate,Ammonium Sulfate,117,117,72.0,10,1.0,Ammonium Sulfate,0.6
2,2,Pupuk Super Fosfat SP-36,pupuk super fosfat sp 36,Triple Super Phospate (TSP),Triple Super Phospate (TSP),64,64,52.0,14,,,
3,3,Pupuk NPK Phonska,pupuk npk phonska,Fertiphos,NPK PIM 15-15-15,78,22,35.0,13,,,
4,4,Pupuk NPK Formula Khusus,pupuk npk formula khusus,Pak Tani Fertila 8-15-19,NPK Kebomas 15-15-15,101,18,35.0,17,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
43996,43997,Extra one 680 EC @ 250 ml,extra one 680 ec 250 ml,EMCOTE 13-6-27-2+TE,Petro-CAS,169,9,44.0,17,,,
43997,43998,Extra One 680 SC @ 500 ml,extra one 680 sc 500 ml,EMCOTE 13-6-27-2+TE,FERTIGRES 16-20+13S,169,21,45.0,18,,,
43998,43999,JARING ARWANA @ 100 METER,jaring arwana 100 meter,CAP TAWON 15-15-15,Pelangi Agro 20-10-10,136,29,45.0,18,,,
43999,44000,Terong Puma F1 @ 5 gram,terong puma f 1 5 gram,MerokeCPN NK 15-15+TE,MerokeCPN NK 15-15+TE,55,55,45.0,15,,,


In [190]:
# Keyword deny-list, compiled once instead of being rebuilt on every call.
# Matching is case-insensitive and unanchored except for `^elang `, which must
# appear at the start of the name.
_NAIVE_DENY_RE = re.compile(
    "beras|ayam|ikan|tapetool|remasil|pasir|megastar|pajero|kastro|menir|"
    "rapia|top asia|great|greta|emrat|koret|obor|tomat|destan|betras|saporo|"
    "hadroh|manohara|preza|pare valera|santika|doran|bara|posatbona|apuri|"
    "toram|bolt|cream|angin|star|petir|minyak|catchoize|lenovo|bela ?rusia|"
    "^elang |borax",
    re.IGNORECASE,
)


def is_match_naive_regex(s):
    """Return True when a cleaned product name trips the naive exclusion rules.

    A name is flagged when either:
      * it contains any deny-listed keyword (case-insensitive substring
        search), or
      * it is at most two characters long and is not ``'za'``.

    The post-processing query maps flagged names to ``'Others'``.

    Args:
        s: Cleaned product name. Non-string values (``None``, NaN, ``pd.NA``,
            numbers) are treated like an empty name and flagged, instead of
            raising ``TypeError`` from ``re.search`` / ``len``.

    Returns:
        bool: True if the name should be flagged, False otherwise.
    """
    # pandas yields NaN/None for empty cells; an empty string is already
    # flagged by the length rule, so missing values are flagged the same way.
    if not isinstance(s, str):
        return True

    if _NAIVE_DENY_RE.search(s) is not None:
        return True

    # Very short names are flagged, except 'za'. The exemption is compared
    # case-insensitively to stay consistent with the IGNORECASE keyword search.
    return len(s) <= 2 and s.lower() != 'za'

In [191]:
# Evaluate the naive exclusion check on every cleaned product name and store
# the boolean result as a new column.
final_res['is_match_naive_regex'] = (
    final_res['clean_name_alphanum'].map(is_match_naive_regex)
)
final_res.head()

Unnamed: 0,product_id,product_name,clean_name_alphanum,result_sku_lev_int,result_sku_fuzzy,result_sku_id_lev_int,result_sku_id_fuzzy,fuzzy_ratio,lev_dist_int,product_id_lev,result_sku_lev_ratio,lev_dist_ratio,is_match_naive_regex
0,0,Pupuk Urea N 46%,pupuk urea n 46,Urea Nitrea,Urea Daun Buah,2,3,44.0,11,,,,False
1,1,Pupuk Amonium Sulfat ZA,pupuk amonium sulfat za,Ammonium Sulfate,Ammonium Sulfate,117,117,72.0,10,1.0,Ammonium Sulfate,0.6,False
2,2,Pupuk Super Fosfat SP-36,pupuk super fosfat sp 36,Triple Super Phospate (TSP),Triple Super Phospate (TSP),64,64,52.0,14,,,,False
3,3,Pupuk NPK Phonska,pupuk npk phonska,Fertiphos,NPK PIM 15-15-15,78,22,35.0,13,,,,False
4,4,Pupuk NPK Formula Khusus,pupuk npk formula khusus,Pak Tani Fertila 8-15-19,NPK Kebomas 15-15-15,101,18,35.0,17,,,,False


In [192]:
# Post-processing: pick the final SKU for every product and record which
# method produced it. The rules are evaluated top to bottom; the first one
# that applies wins:
#   1. is_match_naive_regex is true      -> 'Others'
#   2. fuzzy_ratio < 40                  -> 'Others'
#   3. lev_dist_int <= 2                 -> result_sku_lev_int   ('levenshtein')
#   4. product_id_lev IS NOT NULL        -> result_sku_lev_ratio ('levenshtein')
#   5. otherwise                         -> result_sku_fuzzy     ('fuzzy')
# The two CASE expressions encode the same rule order and must be kept in sync.
#
# NOTE: this cell overwrites `final_res` with a frame that no longer contains
# the columns it reads (e.g. `product_id_lev`, `result_sku_fuzzy`), so it
# cannot be re-run on its own; re-run the join and flag cells above first.

final_res = duckdb.query(
    '''
    SELECT
        product_id,
        product_name,
        clean_name_alphanum,

        CASE
            WHEN is_match_naive_regex THEN 'Others'
            WHEN fuzzy_ratio < 40 THEN 'Others'
            WHEN lev_dist_int <= 2 THEN result_sku_lev_int
            WHEN product_id_lev IS NOT NULL THEN result_sku_lev_ratio
            ELSE result_sku_fuzzy
        END AS result_sku,

        CASE
            WHEN is_match_naive_regex THEN 'Others'
            WHEN fuzzy_ratio < 40 THEN 'Others'
            WHEN lev_dist_int <= 2 THEN 'levenshtein'
            WHEN product_id_lev IS NOT NULL THEN 'levenshtein'
            ELSE 'fuzzy'
        END AS method_label,

        fuzzy_ratio,
        is_match_naive_regex,
        lev_dist_int,
        lev_dist_ratio,
    FROM
        final_res
    ORDER BY
        product_id
    '''
).to_df()

In [193]:
# Show the post-processed frame (result_sku / method_label per product).
final_res

Unnamed: 0,product_id,product_name,clean_name_alphanum,result_sku,method_label,fuzzy_ratio,is_match_naive_regex,lev_dist_int,lev_dist_ratio
0,0,Pupuk Urea N 46%,pupuk urea n 46,Urea Daun Buah,fuzzy,44.0,False,11,
1,1,Pupuk Amonium Sulfat ZA,pupuk amonium sulfat za,Ammonium Sulfate,levenshtein,72.0,False,10,0.6
2,2,Pupuk Super Fosfat SP-36,pupuk super fosfat sp 36,Triple Super Phospate (TSP),fuzzy,52.0,False,14,
3,3,Pupuk NPK Phonska,pupuk npk phonska,Others,Others,35.0,False,13,
4,4,Pupuk NPK Formula Khusus,pupuk npk formula khusus,Others,Others,35.0,False,17,
...,...,...,...,...,...,...,...,...,...
43996,43997,Extra one 680 EC @ 250 ml,extra one 680 ec 250 ml,Petro-CAS,fuzzy,44.0,False,17,
43997,43998,Extra One 680 SC @ 500 ml,extra one 680 sc 500 ml,FERTIGRES 16-20+13S,fuzzy,45.0,False,18,
43998,43999,JARING ARWANA @ 100 METER,jaring arwana 100 meter,Pelangi Agro 20-10-10,fuzzy,45.0,False,18,
43999,44000,Terong Puma F1 @ 5 gram,terong puma f 1 5 gram,MerokeCPN NK 15-15+TE,fuzzy,45.0,False,15,


In [194]:
# Load the product catalog and keep only the first row seen for each
# product_sku, so a later join on product_sku matches at most one catalog row.
catalog = (
    pd.read_csv('datasets/processed/product_catalog.tsv', sep='\t')
    .drop_duplicates(subset=['product_sku'])
)
print(catalog.shape)

catalog.head()

(186, 11)


Unnamed: 0,sku_id,product_sku,brand,type,formula,is_sku_only_alphanum,is_sku_only_alphabet,clean_sku,clean_sku_alphanum,clean_sku_non_formula,clean_sku_formula
0,0,Urea Petro,pihc,urea,,True,True,urea petro,urea petro,urea petro,
1,1,Urea PIM,pihc,urea,,True,True,urea pim,urea pim,urea pim,
2,2,Urea Nitrea,pihc,urea,,True,True,urea nitrea,urea nitrea,urea nitrea,
3,3,Urea Daun Buah,pihc,urea,,True,True,urea daun buah,urea daun buah,urea daun buah,
4,4,Urea Pusri,pihc,urea,,True,True,urea pusri,urea pusri,urea pusri,


In [195]:
# Enrich each result with the brand / type / formula of the chosen SKU.
# LEFT JOIN on result_sku = product_sku keeps every product; rows whose
# result_sku has no catalog entry (e.g. 'Others') get NULL in the three
# `result_*` catalog columns. `clean_name_alphanum` is renamed to
# `product_clean_name` in the output.
#
# NOTE: this cell overwrites `final_res` and renames a column it reads, so it
# cannot be re-run on its own without re-running the cells above.
final_res = duckdb.query(
    '''
    SELECT
        product_id,
        product_name,
        clean_name_alphanum AS product_clean_name,
        result_sku,
        c.brand AS result_brand,
        c.type AS result_type,
        c.formula AS result_formula,
        method_label,
        fuzzy_ratio,
        is_match_naive_regex,
        lev_dist_int,
        lev_dist_ratio
    FROM
        final_res AS f
    LEFT JOIN
        catalog AS c
    ON
        f.result_sku = c.product_sku
    ORDER BY
        f.product_id
    '''
).to_df()
final_res.shape

(44001, 12)

In [196]:
# Write the enriched result as a tab-separated file without the row index,
# then preview the first ten rows.
final_res.to_csv(
    'datasets/final_result.tsv',
    index=False,
    sep='\t',
)
final_res.head(n=10)

Unnamed: 0,product_id,product_name,product_clean_name,result_sku,result_brand,result_type,result_formula,method_label,fuzzy_ratio,is_match_naive_regex,lev_dist_int,lev_dist_ratio
0,0,Pupuk Urea N 46%,pupuk urea n 46,Urea Daun Buah,pihc,urea,,fuzzy,44.0,False,11,
1,1,Pupuk Amonium Sulfat ZA,pupuk amonium sulfat za,Ammonium Sulfate,yara,za,,levenshtein,72.0,False,10,0.6
2,2,Pupuk Super Fosfat SP-36,pupuk super fosfat sp 36,Triple Super Phospate (TSP),mahkota,fosfat,,fuzzy,52.0,False,14,
3,3,Pupuk NPK Phonska,pupuk npk phonska,Others,,,,Others,35.0,False,13,
4,4,Pupuk NPK Formula Khusus,pupuk npk formula khusus,Others,,,,Others,35.0,False,17,
5,5,Pupuk Organik Granul,pupuk organik granul,Petro BioFertil,pihc,organik,,fuzzy,42.0,False,13,
6,6,Pupuk Organik Cair,pupuk organik cair,Petro BioFertil,pihc,organik,,fuzzy,43.0,False,13,
7,7,Produk Lain,produk lain,Mesti-Rock,laoying,fosfat,,fuzzy,43.0,False,8,
8,8,Rondap,rondap,Others,,,,Others,38.0,False,4,
9,9,Sekor,sekor,Others,,,,Others,30.0,False,4,


In [197]:
# How many products were resolved by each method.
final_res['method_label'].value_counts()

method_label
fuzzy          22810
Others         18951
levenshtein     2240
Name: count, dtype: int64

In [198]:
# Frequency of every (method, chosen SKU) combination, most common first.
final_res.value_counts(subset=['method_label', 'result_sku'])

method_label  result_sku                                            
Others        Others                                                    18951
fuzzy         Brucite                                                    1647
              Vrea                                                       1142
              Petro-CAS                                                   747
              Borat                                                       689
                                                                        ...  
levenshtein   MESTI-PATENHIJO 15-10-20+TE                                   1
              Magnum 15-10-22-2Mg-3.8S                                      1
              MerokeMAP 12-61-0                                             1
fuzzy         YaraVita TRI-PHOLATE 0-0-0-70Mn+50Zn+25Fe+20B+20Cu+1Mo        1
levenshtein   Mahkota 12-6-22-3+TE                                          1
Name: count, Length: 291, dtype: int64