# Notebook for DSW Telkomsel 2023 - Party Parrot

In [1]:
import os
import re
import duckdb
import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

# Exploration

In [71]:
# Load the two tab-separated inputs of the exploration: the SKU catalog and
# the product-name table (both read from the processed dataset directory).
catalog = pd.read_csv('datasets/processed/product_catalog.tsv', sep='\t')
df_name = pd.read_csv('datasets/processed/product_name.tsv', sep='\t')

In [72]:
# Frequency of each distinct cleaned product name, most frequent first.
name_counts = df_name['clean_name'].value_counts()
# Displayed as a one-column frame for the notebook's rich table rendering.
name_counts.to_frame()

Unnamed: 0_level_0,count
clean_name,Unnamed: 1_level_1
sp - 26 @ 25 kg,9
phonska plus 15x15x15 @ 25 kg,9
nitrea @ 50 kg,8
naga @ 1 liter,7
furadan @ 2 kg,7
...,...
simetal bisi321,1
urea curah,1
xycrot 400 ml,1
bhosep 250 ml,1


In [73]:
# Pair every distinct product name (with its count) with every catalog SKU.
# This is a full cross product, so the row count is
# len(name_counts) * len(catalog).
name_count_rows = name_counts.to_frame().reset_index()
sku_lookup = catalog[['sku_id', 'clean_sku']]
joined = name_count_rows.merge(sku_lookup, how='cross')
joined

Unnamed: 0,clean_name,count,sku_id,clean_sku
0,sp - 26 @ 25 kg,9,0,urea petro
1,sp - 26 @ 25 kg,9,1,urea pim
2,sp - 26 @ 25 kg,9,2,urea nitrea
3,sp - 26 @ 25 kg,9,3,urea daun buah
4,sp - 26 @ 25 kg,9,4,urea pusri
...,...,...,...,...
7607155,terong liberto hijau @ 5 gram,1,182,mestical
7607156,terong liberto hijau @ 5 gram,1,183,mesti - gas
7607157,terong liberto hijau @ 5 gram,1,184,mestac
7607158,terong liberto hijau @ 5 gram,1,185,mestikisrit


In [86]:
def is_name_token_present_in_sku(name, sku) -> bool:
    """Tell whether `name` and `sku` share at least one meaningful token.

    Both strings are split on whitespace. A token from `name` is considered
    only if it is longer than two characters or is exactly 'za', and it is
    not the word 'plus'. The comparison is an exact, whole-token match
    against the tokens of `sku`.

    Returns True as soon as one considered token of `name` appears among the
    tokens of `sku`, otherwise False.
    """
    sku_words = sku.split()
    ignored_words = ['plus']
    return any(
        word in sku_words
        for word in name.split()
        if word not in ignored_words and (len(word) > 2 or word == 'za')
    )

# (Re-)register the Python function as a DuckDB UDF so SQL queries can call it.
# The previous registration is dropped first so that re-running this cell picks
# up edits to the function. On a fresh kernel nothing is registered yet and
# remove_function raises; that case is expected and ignored so the notebook
# still works under Restart & Run All.
try:
    duckdb.remove_function("is_name_token_present_in_sku")
except duckdb.Error:
    # Nothing to remove: the UDF has not been registered in this session.
    pass
duckdb.create_function("is_name_token_present_in_sku", is_name_token_present_in_sku)

<duckdb.duckdb.DuckDBPyConnection at 0x2f5e6368eb0>

In [87]:
# Keep only the (name, SKU) pairs from the cross join `joined` that share at
# least one meaningful token, as decided by the registered
# is_name_token_present_in_sku UDF. DuckDB resolves `joined` from the
# notebook's Python variables.
hard_match_sql = """
    WITH cte AS (
        SELECT
            clean_name,
            clean_sku,
            sku_id,
            is_name_token_present_in_sku(clean_name, clean_sku) as is_token_in_name_in_sku
        FROM
            joined
    )

    SELECT
        *
    FROM
        cte
    WHERE is_token_in_name_in_sku
    """
hard_match = duckdb.query(hard_match_sql).to_df()

In [88]:
# Reproducible spot-check of 20 token-matched pairs. The seed is passed
# straight to `sample` instead of reseeding NumPy's global RNG, so this cell
# no longer alters random state seen by other cells; seed 7 selects the same
# rows as the former `np.random.seed(7)` followed by an unseeded sample.
hard_match.sample(n=20, random_state=7)

Unnamed: 0,clean_name,clean_sku,sku_id,is_token_in_name_in_sku
52665,npk mutiara tani,pak tani 16x16x16 merah,86,True
2538,urea non 50 kg,urea daun buah,3,True
43997,npk 16x16x16 mutiara 1 kg,petro nitrat 16x16x16,37,True
23178,npk 15x15x15 fus ns @ 50,npk pusri 15x15x15,33,True
37315,sp 36 petro 50 kg,rock phosphate petro,11,True
51639,kangkung pak tani,pak tani sawit 13x6x27 - 4x0x65 b,90,True
27933,npk pak tani 16x16x16 kemasan 50 kg,cockhead 16x16x16,152,True
23486,"npk 13x6x27 + 4 mg + 0 , 65 b ns @ 50 kg",emcote 13x6x27 - 2 + te,169,True
11876,npk mutiara 16x16x16 @ 50,pelangi 16x16x16,31,True
33768,urea petronon sub,urea nitrea,2,True


In [89]:
# Product names that did not get any token-level match against the catalog.
matched_names = hard_match['clean_name'].unique()
df_name.loc[~df_name['clean_name'].isin(matched_names)]

Unnamed: 0,product_id,product_name,is_name_only_alphanum,is_name_only_alphabet,clean_name,clean_name_non_formula,clean_name_formula,possible_brand
5,5,Pupuk Organik Granul,True,False,pupuk organik granul,pupuk organik granul,,
6,6,Pupuk Organik Cair,True,False,pupuk organik cair,pupuk organik cair,,
7,7,Produk Lain,True,False,produk lain,produk lain,,
8,8,Rondap,True,True,rondap,rondap,,
9,9,Sekor,True,True,sekor,sekor,,
...,...,...,...,...,...,...,...,...
43996,43997,Extra one 680 EC @ 250 ml,False,False,extra one 680 ec @ 250 ml,extra one,,
43997,43998,Extra One 680 SC @ 500 ml,False,False,extra one 680 sc @ 500 ml,extra one,,
43998,43999,JARING ARWANA @ 100 METER,False,False,jaring arwana @ 100 meter,jaring arwana @,,
43999,44000,Terong Puma F1 @ 5 gram,False,False,terong puma f1 @ 5 gram,terong puma f,,


In [8]:
# Token frequency across all cleaned product names. Names are split on single
# spaces; only purely alphabetic tokens longer than two characters are kept.
all_tokens = pd.Series([
    token
    for clean_name in df_name.clean_name
    for token in clean_name.split(' ')
    if token.isalpha() and len(token) > 2
])
token_counts = all_tokens.value_counts()
token_counts.head(20)

npk        2048
ltr        1060
liter       923
urea        704
gram        703
plus        543
benih       539
tani        386
mutiara     384
daun        367
jagung      356
pupuk       354
merah       346
kecil       339
padi        327
bibit       327
petro       317
kcl         300
plastik     295
pelangi     285
Name: count, dtype: int64

In [11]:
# Cleaned names that occur at most once and are at most two characters long,
# listed alphabetically.
occurs_once = name_counts <= 1
is_very_short = name_counts.index.str.len() <= 2
name_counts[occurs_once & is_very_short].sort_index()

clean_name
32    1
5     1
64    1
76    1
99    1
a     1
ap    1
as    1
b1    1
bm    1
es    1
ff    1
gl    1
gp    1
hk    1
hp    1
ki    1
lp    1
n5    1
nk    1
or    1
rp    1
sp    1
ss    1
ts    1
ur    1
w     1
za    1
zk    1
Name: count, dtype: int64

# Post-analysis

In [178]:
# Load the matching result and report which product ids in 0..44001 are
# absent from it.
final_res = pd.read_csv('datasets/processed/result_fuzzy.tsv', sep='\t')
expected_ids = set(range(44002))
present_ids = set(final_res.product_id.tolist())
print(expected_ids.difference(present_ids))
# Export is intentionally left disabled:
# final_res.to_csv('datasets/processed/final_result.tsv', sep='\t', index=False)
final_res

{5398}


Unnamed: 0,product_id,product_name,result_sku_lev,result_sku_fuzzy,result_sku_id_lev,result_sku_id_fuzzy,possible_brand,is_name_only_alphanum,is_name_only_alphabet,clean_name_non_formula,...,result_clean_sku_fuzzy,fuzzy_ratio,lev_dist_lev,lev_dist_fuzzy,lev_dist_lev_wo_form,lev_dist_fuzzy_wo_form,is_name_token_present_in_sku_lev,is_name_token_present_in_sku_fuzzy,cnt_common_tokens_lev,cnt_common_tokens_fuzzy
0,0,Pupuk Urea N 46%,Urea Nitrea,Urea Daun Buah,2,3,,False,False,pupuk urea n,...,urea daun buah,44.0,12,13,9.0,12.0,True,True,1,1
1,1,Pupuk Amonium Sulfat ZA,Ammonium Sulfate,Nitroplus (ZA),117,131,,True,False,pupuk amonium sulfat za,...,nitroplus za,40.0,10,17,10.0,17.0,False,True,0,1
2,2,Pupuk Super Fosfat SP-36,Triple Super Phospate (TSP),Triple Super Phospate (TSP),64,64,,False,False,pupuk super fosfat sp -,...,triple super phospate tsp,59.0,15,15,12.0,12.0,True,True,1,1
3,3,Pupuk NPK Phonska,Fertiphos,NPK Petro Ningrat 12-11-20,78,41,pihc,True,False,pupuk npk phonska,...,npk petro ningrat 12x11x20,37.0,13,21,13.0,21.0,False,True,0,1
4,4,Pupuk NPK Formula Khusus,Pak Tani Fertila 8-15-19,NPK Kebomas 15-15-15,101,18,,True,False,pupuk npk formula khusus,...,npk kebomas 15x15x15,36.0,17,19,17.0,19.0,False,True,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43996,43997,Extra one 680 EC @ 250 ml,EMCOTE 13-6-27-2+TE,PETROFERT 16-16-8+13S,169,19,,False,False,extra one,...,petrofert 16x16x8 + 13 s,45.0,18,19,19.0,21.0,False,False,0,0
43997,43998,Extra One 680 SC @ 500 ml,Petro-CAS,PETROFERT 16-16-8+13S,9,19,,False,False,extra one,...,petrofert 16x16x8 + 13 s,45.0,19,19,8.0,21.0,False,False,0,0
43998,43999,JARING ARWANA @ 100 METER,Mutiara 16-16-16,Solution N 28-10-10 + TE,52,28,,False,False,jaring arwana @,...,solution n 28x10x10 + te,45.0,19,20,14.0,20.0,False,False,0,0
43999,44000,Terong Puma F1 @ 5 gram,MerokeCPN NK 15-15+TE,MerokeCPN NK 15-15+TE,55,55,,False,False,terong puma f,...,merokecpn nk 15 - 15 + te,45.0,16,16,19.0,19.0,False,False,0,0


In [179]:
# Weakest fuzzy matches (ratio >= 20) among rows whose name shares a token
# with the fuzzy-selected SKU: the 20 lowest ratios.
shares_token = final_res['is_name_token_present_in_sku_fuzzy']
ratio_at_least_20 = final_res['fuzzy_ratio'] >= 20
preview_columns = [
    'possible_brand',
    'clean_name',
    'result_clean_sku_lev',
    'result_clean_sku_fuzzy',
    'fuzzy_ratio',
    'cnt_common_tokens_fuzzy',
]
(
    final_res
    .loc[shares_token & ratio_at_least_20, preview_columns]
    .sort_values('fuzzy_ratio', ascending=True)
    .head(20)
)

Unnamed: 0,possible_brand,clean_name,result_clean_sku_lev,result_clean_sku_fuzzy,fuzzy_ratio,cnt_common_tokens_fuzzy
1378,,boom padi,borat,pak tani padi 21x14x7 + 2 mgo + 2 s + te,20.0,1
18952,,padi 05,phosgro,pak tani padi 21x14x7 + 2 mgo + 2 s + te,21.0,1
12047,mutiara,hidro karate,nitralite,karate plus boroni 15.5x0x0 + 0.3 s + 26 cao,21.0,1
5944,,bom padi,borat,pak tani padi 21x14x7 + 2 mgo + 2 s + te,21.0,1
23193,,padi all,fertikali,pak tani padi 21x14x7 + 2 mgo + 2 s + te,21.0,1
12380,,phoenix perkutut gold,urea petro,gold dgw 16x10x18,21.0,1
40602,,kno pn 2 kg merah / putih s .u,cpn pak tani 15x0x15,dgw kno 3,21.0,1
17384,,kcl jsb @ 50 kg,mesti - za,mahkota mop kcl canada,22.0,1
15657,,kcl 88 1 liter,nitralite,mahkota mop kcl canada,22.0,1
32420,yara,yaravita,yaravera,yaravita tri - pholate 0x0x0 - 70 mn + 50 zn +...,22.0,1


In [197]:
# Acceptance rule for a match. A row is accepted when ANY of these holds:
#   * the name shares a token with the fuzzy SKU and fuzzy_ratio >= 28
#   * at least two common tokens and fuzzy_ratio >= 28
#   * a possible brand is known and fuzzy_ratio >= 55
#   * no possible brand and fuzzy_ratio >= 61
#   * any of the four Levenshtein distances is <= 1
ratio = final_res.fuzzy_ratio

token_overlap = final_res.is_name_token_present_in_sku_fuzzy & (ratio >= 28)
two_common_tokens = (final_res.cnt_common_tokens_fuzzy >= 2) & (ratio >= 28)
branded_match = final_res.possible_brand.notnull() & (ratio >= 55)
unbranded_match = final_res.possible_brand.isnull() & (ratio >= 61)
near_identical = (
    (final_res['lev_dist_fuzzy_wo_form'] <= 1)
    | (final_res['lev_dist_fuzzy'] <= 1)
    | (final_res['lev_dist_lev'] <= 1)
    | (final_res['lev_dist_lev_wo_form'] <= 1)
)

cond1 = (
    token_overlap
    | two_common_tokens
    | branded_match
    | unbranded_match
    | near_identical
)
print(final_res.loc[cond1].shape)

# Accepted rows with the lowest fuzzy ratios, to eyeball the weakest accepts.
(
    final_res
    .loc[cond1, ['clean_name', 'result_clean_sku_fuzzy', 'fuzzy_ratio', 'cnt_common_tokens_fuzzy']]
    .sort_values('fuzzy_ratio')
    .head(20)
)

(8118, 23)


Unnamed: 0,clean_name,result_clean_sku_fuzzy,fuzzy_ratio,cnt_common_tokens_fuzzy
40504,kno 3 putih kanitrat @ 1 kg,dgw kno 3,28.0,2
3003,kcl mkt,mahkota mop kcl canada,28.0,1
15312,barrier gold,gold dgw 16x10x18,28.0,1
41938,do tsp meroke,meroke flex - g 8x9x39 + 3 mgo,28.0,1
41937,do kcl meroke,meroke flex - g 8x9x39 + 3 mgo,28.0,1
18137,santi - gold,gold dgw 16x10x18,28.0,1
6640,kcl flake 5 kg,mahkota mop kcl canada,28.0,1
17946,crumble merah burung,pak tani 16x16x16 merah,28.0,1
13277,bibit padi ir nutri zinc,pak tani padi 21x14x7 + 2 mgo + 2 s + te,28.0,1
13273,bibit padi ir cakrabuana,pak tani padi 21x14x7 + 2 mgo + 2 s + te,28.0,1


In [38]:
# Rows whose fuzzy ratio lies in [0, 40], ordered by ratio, then by number of
# common tokens, then by Levenshtein distance of the fuzzy pick.
in_low_band = final_res['fuzzy_ratio'].between(0, 40)
low_fuzzy = final_res.loc[in_low_band]
low_fuzzy.sort_values(by=['fuzzy_ratio', 'cnt_common_tokens_fuzzy', 'lev_dist_fuzzy'])

Unnamed: 0,product_id,product_name,result_sku_lev,result_sku_fuzzy,result_sku_id_lev,result_sku_id_fuzzy,possible_brand,is_name_only_alphanum,is_name_only_alphabet,clean_name_non_formula,...,clean_name,result_clean_sku_lev,result_clean_sku_fuzzy,fuzzy_ratio,lev_dist_lev,lev_dist_fuzzy,lev_dist_lev_wo_form,lev_dist_fuzzy_wo_form,cnt_common_tokens_lev,cnt_common_tokens_fuzzy
37233,37234,5,KSP,MK FOS 0-52-34,134,132,,True,False,,...,5,ksp,mk fos 0x52x34,13.0,3,13,,,0,0
3552,3552,zzzzzz,Borat,MerokeZA,66,43,,True,True,zzzzzz,...,zzzzzz,borat,merokeza,14.0,6,7,6.0,7.0,0,0
25048,25049,??????,Borat,DGW TSP,66,145,,False,False,??????,...,??????,borat,dgw tsp,14.0,7,7,6.0,7.0,0,0
15398,15399,Ff,KSP,Fertikali,134,82,,True,True,ff,...,ff,ksp,fertikali,18.0,3,8,3.0,8.0,0,0
15763,15764,dhddbbf,DGW TSP,Urea Daun Buah,145,3,,True,True,dhddbbf,...,dhddbbf,dgw tsp,urea daun buah,19.0,6,12,6.0,12.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26277,26278,ABACEL 18EC (1LITER),Nitroplus (ZA),Triple Super Phospate (TSP),131,64,,False,False,abacel,...,abacel 18 ec ( 1 liter ),nitroplus ( za ),triple super phospate ( tsp ),40.0,19,22,16.0,28.0,3,3
38329,38330,demolis. 100ml. (abamectin),Nitroplus (ZA),Nitroplus (ZA),131,131,,False,False,demolis .,...,demolis . 100 ml . ( abamectin ),nitroplus ( za ),nitroplus ( za ),40.0,24,24,13.0,13.0,3,3
8808,8809,Bentan -45WP (100x100 gram),Mahkota MOP (KCL Canada),Nitroplus (ZA),67,131,,False,False,bentan -,...,bentan - 45 wp ( 100 x100 gram ),mahkota mop ( kcl canada ),nitroplus ( za ),40.0,23,25,23.0,16.0,3,3
8169,8170,"Filia 50mL (trisiklazol,propikonazol)",Mahkota MOP (KCL Canada),Mahkota MOP (KCL Canada),67,67,,False,False,filia,...,"filia 50 ml ( trisiklazol , propikonazol )",mahkota mop ( kcl canada ),mahkota mop ( kcl canada ),40.0,31,31,25.0,25.0,3,3


In [None]:
# Inspect matches whose fuzzy ratio lies in [70, 80], sorted by ratio.
band_70_80 = (
    final_res
    .loc[
        final_res.fuzzy_ratio.between(70, 80),
        ['product_id', 'clean_name', 'result_clean_sku_lev', 'result_clean_sku_fuzzy', 'fuzzy_ratio'],
    ]
    .sort_values('fuzzy_ratio')
)
# Show the 10 lowest and 10 highest rows of the band. When the band holds 20
# rows or fewer, show it whole: fixed positional indexing would raise an
# IndexError below 10 rows and repeat rows below 20.
band_70_80 if len(band_70_80) <= 20 else pd.concat([band_70_80.head(10), band_70_80.tail(10)])

Unnamed: 0,product_id,clean_name,result_clean_sku_lev,result_clean_sku_fuzzy,fuzzy_ratio
41629,41630,urea prl daun buah @ 50 kg,urea daun buah,urea daun buah,70.0
22233,22234,za petro 50 kg,za petro,za petro,70.0
27425,27426,npk nitrate,nitralite,nitralite,70.0
19351,19352,npk pelangi 12.12x17x2 bld,npk pelangi 13x6x27 - 4,pelangi 12x12x17 - 2,70.0
41009,41010,pupuk kaptan kebomas,kapur pertanian kebomas,kapur pertanian kebomas,70.0
21036,21037,npk kujang 25 kg,npk kujang 30x6x8,npk kujang 15x15x15,70.0
27419,27420,lao ying 16x16x16 @ 1,pak tani 16x16x16 biru,pelangi 16x16x16,70.0
40920,40921,meroke saset,meroketsp,merokeza,70.0
35209,35210,urea pet 5 kg,urea petro,urea petro,70.0
35415,35416,urea prl @ 50 kg daun buah,urea daun buah,urea daun buah,70.0


In [50]:
# Inspect rows NOT accepted by cond1 whose fuzzy ratio lies in [61, 62],
# sorted by ratio.
rejected_61_62 = (
    final_res
    .loc[
        ~cond1 & (final_res.fuzzy_ratio.between(61, 62)),
        ['product_id', 'clean_name', 'result_clean_sku_lev', 'result_clean_sku_fuzzy', 'fuzzy_ratio'],
    ]
    .sort_values('fuzzy_ratio')
)
# NOTE: cond1 accepts rows with fuzzy_ratio >= 55 when possible_brand is set
# and >= 61 when it is not, so with that definition this selection is empty.
# Show the 10 lowest and 10 highest rows, or the whole selection when it has
# 20 rows or fewer: fixed positional indexing would raise an IndexError below
# 10 rows and repeat rows below 20.
rejected_61_62 if len(rejected_61_62) <= 20 else pd.concat([rejected_61_62.head(10), rejected_61_62.tail(10)])

Unnamed: 0,product_id,clean_name,result_clean_sku_lev,result_clean_sku_fuzzy,fuzzy_ratio
2,2,pupuk super fosfat sp - 36,triple super phospate ( tsp ),triple super phospate ( tsp ),61.0
33324,33325,urea granul daun buah @50 kg ns,urea daun buah,urea daun buah,61.0
18378,18379,kasur panjang + kelambu,kapur pertanian kebomas,kapur pertanian kebomas,61.0
33332,33333,urea ganul ns,urea daun buah,urea pusri,61.0
18005,18006,"polibag 7 , 5/15 x15",pelangi 15x15x15,pelangi 15x15x15,61.0
33381,33382,urea granul daun buah @50 kg ns,urea daun buah,urea daun buah,61.0
16827,16828,methox 42 sp,meroketsp,mesti - tsp,61.0
34360,34361,envirophos 36,fertiphos,nitrophoska 13x10x20,61.0
15428,15429,ponska grentop,phosgreen,phosgreen,61.0
34787,34788,calsium multimex,magnesium sulfate,magnesium sulfate,61.0
