# Notebook for DSW Telkomsel 2023 - Party Parrot

In [4]:
import os
import re
import duckdb
import numpy as np
import pandas as pd

from fuzzywuzzy import fuzz
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Data

In [2]:
# Load the raw product-name list, switch to a snake_case column name and drop
# rows that contain missing values.
df_name = (
    pd.read_excel('datasets/raw/product_name.xlsx')
    .rename(columns={'Product Name': 'product_name'})
    .dropna()
)

# Load the reference catalog and switch its headers to snake_case as well.
catalog = pd.read_excel('datasets/raw/product_catalog.xlsx').rename(
    columns={
        'Product SKU': 'product_sku',
        'Brand': 'brand',
        'Type': 'type',
        'Formula': 'formula',
    }
)

df_name

Unnamed: 0,product_name
0,Pupuk Urea N 46%
1,Pupuk Amonium Sulfat ZA
2,Pupuk Super Fosfat SP-36
3,Pupuk NPK Phonska
4,Pupuk NPK Formula Khusus
...,...
43997,Extra one 680 EC @ 250 ml
43998,Extra One 680 SC @ 500 ml
43999,JARING ARWANA @ 100 METER
44000,Terong Puma F1 @ 5 gram


In [4]:
# Display the catalog frame (columns: product_sku, brand, type, formula).
catalog

Unnamed: 0,product_sku,brand,type,formula
0,Urea Petro,PIHC,Urea,
1,Urea PIM,PIHC,Urea,
2,Urea Nitrea,PIHC,Urea,
3,Urea Daun Buah,PIHC,Urea,
4,Urea Pusri,PIHC,Urea,
...,...,...,...,...
182,Mestical,LaoYing,Nitrogen,
183,Mesti-GAS,LaoYing,ZA,
184,Mestac,LaoYing,Nitrogen,
185,MestiKisrit,LaoYing,Mg,


In [182]:
# Tokens associated with each brand, keyed by the lowercase brand name.
# NOTE(review): 'calcinit' is listed under both 'basf' and 'yara', and
# 'sulfate' under both 'paktani' and 'yara' — those tokens alone cannot
# identify a single brand.
d = {
    'basf': {'nitrophoska', 'entec', 'brucite', 'cantik', 'calcinit', 'perfect'},
    'dgw/hextar': {'mroph', 'morph', 'hx', 'hxas', 'gold', 'dgw', 'cockhead', 'booster', 'compaction'},
    'laoying': {'mestikali', 'mestifos', 'patenbiru', 'mestikisrit', 'bomber', 'mestical',
                'mestifoska', 'emcote', 'patenhijo', 'mestac', 'mesti', 'evermax', 'borate', 'lao ying', 'loying'},
    'mahkota': {'borat', 'peru', 'mahkota', 'kieserite', 'triple', 'mesir'},
    'mutiara': {'merokemkp', 'sprinter', 'partner', 'professional', 'meroketsp', 'merokesop',
                'kali', 'flex', 'merokeza', 'butir', 'merokemap', 'grower', 'merokemop',
                'meroke', 'merokekkb', 'karate', 'merokecpn', 'mutiara', 'merokerock',
                'boroni', 'suburkali'},
    'paktani': {'saprodap', 'holland', 'phosgro', 'urecote', 'magnesium', 'padi', 'fertikali',
                'magnum', 'sawit', 'kristalon', 'fertiphos', 'merah', 'tani', 'fertila',
                'sulfate', 'fertilla', 'singkong', 'qrop', 'ultradap'},
    'pihc': {'jeranti', 'kujang', 'solution', 'petro', 'petrofert', 'nitrat', 'pusri', 'npk',
             'mikroba', 'phosphate', 'phonska', 'polivit', 'rock', 'fertigres', 'nitrea',
             'coating', 'nitroku', 'pelangi', 'biofertil', 'agro', 'kebomas', 'nitralite',
             'niphos', 'ningrat', 'kapur', 'phosgreen', 'pertanian', 'nitroska'},
    'tawon': {'tawon', 'nitroplus', 'vrea', 'astiva'},
    'yara': {'yaraliva', 'yaravita', 'krista', 'winner', 'yaravera', 'unik', 'palmae',
             'yaratera', 'calcinit', 'faster', 'yaramila', 'tropicote', 'complex',
             'pholate', 'nitrabor', 'sulfate', 'ammonium'}
}

# Add each brand's own key to its token set, then sort so the output file has
# a stable, alphabetical order within every brand.
d = {brand: sorted(tokens | {brand}) for brand, tokens in d.items()}

# Flatten to one (brand, common_token) row per token and persist as TSV.
d = (
    pd.Series(d)
    .explode()
    .reset_index()
    .rename(columns={'index': 'brand', 0: 'common_token'})
)
d.to_csv('datasets/processed/common_tokens_per_brand.tsv', sep='\t', index=False)

In [193]:
# Reload the brand/token table from the TSV on disk; index_col=[0] turns the
# first column (brand) into the index, leaving common_token as the only column.
common_tokens = pd.read_csv('datasets/processed/common_tokens_per_brand.tsv', sep='\t', index_col=[0])
common_tokens

Unnamed: 0_level_0,common_token
brand,Unnamed: 1_level_1
basf,basf
basf,brucite
basf,calcinit
basf,cantik
basf,entec
...,...
yara,yaraliva
yara,yaramila
yara,yaratera
yara,yaravera


# Preprocessing

In [24]:
def is_containing_non_alphanumeric(string):
    """Return True when `string` holds only ASCII letters, digits and spaces.

    NOTE: the function name reads as the opposite of what is returned. The
    result is True when NO character outside ``[a-zA-Z0-9 ]`` is present
    (including for the empty string) and False as soon as one such character
    (punctuation, underscore, non-ASCII letter, tab, ...) is found.
    """
    first_offender = re.search(r'[^a-zA-Z0-9 ]', string)
    return first_offender is None

# Flag rows whose text consists solely of ASCII letters, digits and spaces.
# Despite its name, is_containing_non_alphanumeric returns True in exactly
# that case, so the column name (not the helper name) describes the values.
df_name['is_only_alphanum'] = df_name['product_name'].apply(is_containing_non_alphanumeric)
catalog['is_only_alphanum'] = catalog['product_sku'].apply(is_containing_non_alphanumeric)

In [37]:
# Scratch check: appending a space after every two-digit group turns '250'
# into '25 0' (the trailing single digit is left untouched).
s = re.sub(r'(\d{2})', r'\1 ', '250')
s

'25 0'

In [49]:
def extract_formula(s):
    """Return the first 'AxBxC'-shaped number triple found in `s`, else NaN.

    Each number is one or more digits optionally followed by '.' and/or ','
    and further digits. The first separator must be a single 'x'; the second
    may be one or more 'x' characters.

    Parameters
    ----------
    s : str
        Text to search.

    Returns
    -------
    str or float
        The matched substring, or ``np.nan`` when nothing matches.
    """
    found = re.search(r'\d+\.?,?\d*x\d+\.?,?\d*x+\d+\.?,?\d*', s)
    return found.group() if found is not None else np.nan


def give_spaces_before_and_after_punct(s):
    """Surround every '.', ',', '-', '(' and ')' in `s` with single spaces.

    Each punctuation character is padded independently, so adjacent
    punctuation ends up separated by two spaces, and decimal numbers are
    split too ('12.9' becomes '12 . 9').
    """
    punct = re.compile(r'([.,\-()])')
    padded = r' \1 '
    return punct.sub(padded, s)

# Quick check of the helper on a string containing every padded character.
# Each punctuation mark is padded on its own, so neighbours end up two spaces apart.
s = 'abc,.-()'
give_spaces_before_and_after_punct(s)

'abc ,  .  -  (  ) '

In [31]:
def clean(s):
    """Normalise a product name / SKU string into a lowercase, tokenised form.

    Steps, in order:

    1. Lowercase.
    2. Pad every run of characters other than digits, '.', ',' and '-' with
       spaces, which separates words and symbols from numeric tokens
       ('100ml' -> '100 ml', 'abc+c' -> 'abc+c' stays one run, '12.9' is kept).
    3. Rewrite three numbers joined by any mix of '×', 'x', '+', ',', '.',
       ' ', '|' or '-' as 'AxBxC' ('15-15-15' -> '15x15x15').
    4. Collapse repeated whitespace and trim both ends.

    Known limitation: step 3 fires on ANY three adjacent numbers, not only on
    formulas, e.g. 'npk kujang 30.6.8 25kg' -> 'npk kujang 30.6x8x25 kg'.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        Cleaned text without leading or trailing whitespace.
    """
    s = s.lower()

    # Pad each run of non-numeric characters (anything except digits, '.',
    # ',' and '-') with a space on both sides, so that numbers such as
    # '12.9' stay intact while words and symbols are split away from them.
    s = re.sub(r'([^\d\.,-]+)', r' \1 ', s)
    s = re.sub(r'\s+', ' ', s)  # collapse the doubled spaces created above

    # Separate value and unit, e.g. "100ml" -> "100 ml" or "50g" -> "50 g".
    # (After the padding step above digits and letters are already apart, so
    # this is kept only as a safety net.)
    s = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', s)

    # Make the formula separator uniform: '4.5 - 3.6 - 2.1' -> '4.5x3.6x2.1'.
    s = re.sub(r'(\d+\.?,?\d*)[×x\+,\. |-]+(\d+\.?,?\d*)[×x\+,\. |-]+(\d+\.?,?\d*)', r'\1x\2x\3', s)

    s = re.sub(r'\s+', ' ', s)  # collapse extra spaces once again

    # The padding step can leave a space at either end (e.g. 'abc' -> ' abc ');
    # trim it so equal names compare equal regardless of surrounding blanks.
    return s.strip()

# Normalised text columns used for matching names against catalog SKUs.
df_name['clean_name'] = df_name['product_name'].apply(clean)
catalog['clean_sku'] = catalog['product_sku'].apply(clean)

# Canonical brand key: lowercase with spaces removed (e.g. 'LaoYing' -> 'laoying').
catalog['brand'] = catalog['brand'].str.lower().str.replace(' ', '')
unique_brand = sorted(catalog['brand'].unique())

In [7]:
# Spot-check cleaned names that contain punctuation or other symbols
# (index labels 5200 through 5400, inclusive).
df_name.loc[~df_name['is_only_alphanum']].loc[5200:5400]

Unnamed: 0,product_name,is_only_alphanum,clean_name
5203,npk kujang 30.6.8 25kg,False,npk kujang 30.6x8x25 kg
5211,Pakan Ayam 311/511/512,False,pakan ayam 311 / 511 / 512
5213,Super-K 500 grm,False,super - k 500 grm
5218,Lanet 25 wp. 15 grm,False,lanet 25 wp . 15 grm
5220,Lannete 40 SP. 100 grm,False,lannete 40 sp . 100 grm
5225,X-TRAIL 100ml,False,x - trail 100 ml
5243,selang putih 0.20 x 15 x 20,False,selang putih 0.20x15x20
5244,selang putih 0.20x20x50,False,selang putih 0.20x20x50
5245,"0,20×15×50 putih",False,"0,20x15x50 putih"
5249,sp26 -25kg,False,sp 26 -25 kg


In [8]:
# Spot-check the first 20 cleaned catalog SKUs that contain punctuation or other symbols.
catalog.loc[~catalog['is_only_alphanum']].head(20)

Unnamed: 0,product_sku,brand,type,formula,is_only_alphanum,clean_sku
9,Petro-CAS,PIHC,Mikro,,False,petro - cas
10,SP-36 Petro,PIHC,Fosfat,,False,sp -36 petro
13,SP-26 Petro,PIHC,Fosfat,,False,sp -26 petro
15,Phonska Plus 15-15-15+9S+0.2Zn,PIHC,Majemuk,15-15-15,False,phonska plus 15x15x15 + 9 s+ 0.2 zn
16,NPK Kebomas 12-12-17+2MgO+0.1Zn+0.2B+0.2Fe,PIHC,Majemuk,12-12-17,False,npk kebomas 12x12x17 + 2 mgo+ 0.1 zn+ 0.2 b+ ...
17,NPK Kebomas 12-6-22+3Mg,PIHC,Majemuk,12-6-22,False,npk kebomas 12x6x22 + 3 mg
18,NPK Kebomas 15-15-15,PIHC,Majemuk,15-15-15,False,npk kebomas 15x15x15
19,PETROFERT 16-16-8+13S,PIHC,Majemuk,16-16-8,False,petrofert 16x16x8 + 13 s
20,Petro Niphos 20-20+13S,PIHC,Majemuk,20-20-0,False,petro niphos 20x20x13 s
21,FERTIGRES 16-20+13S,PIHC,Majemuk,16-20-0,False,fertigres 16x20x13 s


In [9]:
# Persist both frames, including the is_only_alphanum and clean_* columns added
# above, as tab-separated files; index=False keeps the row index out of the output.
df_name.to_csv('datasets/processed/product_name.tsv', sep='\t', index=False)
catalog.to_csv('datasets/processed/product_catalog.tsv', sep='\t', index=False)