In [1]:
import polars as pl
import pandas as pd
from pathlib import Path
import string
import re
from tqdm import tqdm
from time import perf_counter

In [2]:
# Directory that holds the raw `.ftr` files read by the next cell.
path = Path.home() / 'OneDrive - Seagroup/ai/nlp/cat_tag/raw'
# glob() yields entries in arbitrary, filesystem-dependent order. Sort them so the
# concatenated row order -- and therefore which duplicate survives the later
# de-duplication on `item_name_edit` -- is identical on every run and machine.
files = sorted(path.glob('*.ftr'))

In [3]:
# Load the same set of files into one polars frame and one pandas frame so the
# later cells can time both engines on identical data.
if not files:
    # Both concat calls reject an empty list with a message that does not mention the path.
    raise FileNotFoundError(f'No .ftr files found in {path}')

df_pl = pl.concat([pl.read_ipc(f) for f in tqdm(files, desc='pl reading')], rechunk=True)
# ignore_index=True: without it the per-file row labels are kept as-is, so the combined
# index repeats the same labels once per file and label-based operations become ambiguous.
df_pd = pd.concat(
    [pd.read_feather(f) for f in tqdm(files, desc='pd reading')],
    ignore_index=True,
)

pl reading: 100%|██████████| 268/268 [00:02<00:00, 111.09it/s]
pd reading: 100%|██████████| 268/268 [00:05<00:00, 52.15it/s]


In [4]:
# Polars frame overview: dimensions and number of distinct item ids, then the first rows.
n_items_pl = df_pl['item_id'].n_unique()
print(f"Shape: {df_pl.shape}, {n_items_pl:,.0f} items")
df_pl.head(5)

Shape: (10101055, 4), 9,147,752 items


item_id,item_name,level1_global_be_category,level2_global_be_category
i64,str,str,str
3449028378,"""Đôi jack bắp c…","""Audio""","""Amplifiers & M…"
4742031388,"""Hộp số quạt…","""Audio""","""Amplifiers & M…"
21470061301,"""Bo Mạch Khuếch…","""Audio""","""Amplifiers & M…"
4042683466,"""Bảng mạch khuế…","""Audio""","""Amplifiers & M…"
1492579343,"""USB Bluetooth …","""Audio""","""Amplifiers & M…"


In [5]:
# Pandas frame overview: dimensions and number of distinct item ids, then the first rows.
n_items_pd = df_pd['item_id'].nunique()
print(f"Shape: {df_pd.shape}, {n_items_pd:,.0f} items")
df_pd.head(5)

Shape: (10101055, 4), 9,147,752 items


Unnamed: 0,item_id,item_name,level1_global_be_category,level2_global_be_category
0,3449028378,Đôi jack bắp chuối cái 4mm JK-312 (đỏ+đen) chấ...,Audio,Amplifiers & Mixers
1,4742031388,Hộp số quạt ... có thê thay cho quạt 12v ...,Audio,Amplifiers & Mixers
2,21470061301,Bo Mạch Khuếch Đại Âm Thanh Kỹ Thuật Số Blueto...,Audio,Amplifiers & Mixers
3,4042683466,Bảng mạch khuếch đại âm lập thể kỹ thuật số 2 ...,Audio,Amplifiers & Mixers
4,1492579343,USB Bluetooth HJX-001/BT-163 Tạo Bluetooth Cho...,Audio,Amplifiers & Mixers


In [6]:
# Clean: Format
# Trim surrounding whitespace and lowercase the item name and both category columns,
# printing the elapsed seconds for polars first and pandas second.
col = ['item_name', *(f'level{level}_global_be_category' for level in (1, 2))]

start = perf_counter()
# One multi-column expression; each output keeps the name of its source column.
df_pl = df_pl.with_columns(pl.col(col).str.strip().str.to_lowercase())
print(perf_counter() - start)

start = perf_counter()
for name in col:
    trimmed = df_pd[name].str.strip()
    df_pd[name] = trimmed.str.lower()
print(perf_counter() - start)

4.270013399999698
8.601254799999879


In [7]:
# Build `item_name_edit` from `item_name`:
#   1. drop every [bracketed tag],
#   2. turn ASCII punctuation into spaces,
#   3. collapse whitespace runs to a single space and trim both ends,
# then keep the first row for each distinct cleaned name.
# Both engines apply the same rules so the two timings describe the same work.
start = perf_counter()
df_pl = (
    df_pl
    .with_columns(
        pl.col('item_name')
        # replace_all, not replace: str.replace only rewrites the first match per string.
        .str.replace_all(r'\[[^\]]*\]', '')
        # Replace with a space (not '') so "a-b" -> "a b", like the pandas translate table.
        .str.replace_all(r'[[:punct:]]+', ' ')
        .str.replace_all(r'\s+', ' ')
        # Removing a leading/trailing tag or punctuation leaves a stray space behind.
        .str.strip()
        .alias('item_name_edit')
    )
    # keep='first' + maintain_order=True makes the surviving row deterministic and
    # the same one pandas' drop_duplicates keeps.
    .unique(subset=['item_name_edit'], keep='first', maintain_order=True)
)
print(perf_counter() - start)

start = perf_counter()
trans = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
# Raw-string patterns compiled once; '\s' in a normal string is an invalid escape sequence.
bracket_re = re.compile(r'\[[^\]]*\]')
space_re = re.compile(r'\s+')
# NOTE(review): astype(str) turns missing names into literal text, whereas polars keeps
# them null -- confirm item_name has no missing values.
df_pd['item_name_edit'] = [
    bracket_re.sub('', i)
    for i in tqdm(df_pd['item_name'].astype(str).values, desc='Clean bracket')
]
df_pd['item_name_edit'] = [
    i.translate(trans)
    for i in tqdm(df_pd['item_name_edit'].values, desc='Clean punctuations')
]
df_pd['item_name_edit'] = [
    space_re.sub(' ', i)
    for i in tqdm(df_pd['item_name_edit'].values, desc='Clean spaces')
]
df_pd['item_name_edit'] = df_pd['item_name_edit'].str.strip()
# Reassign instead of inplace=True so the cell reads as input -> output.
df_pd = df_pd.drop_duplicates(subset='item_name_edit').reset_index(drop=True)
print(perf_counter() - start)

10.149959800000033


Clean bracket: 100%|██████████| 10101055/10101055 [00:05<00:00, 1919404.27it/s]
Clean punctuations: 100%|██████████| 10101055/10101055 [00:27<00:00, 371840.48it/s]
Clean spaces: 100%|██████████| 10101055/10101055 [00:29<00:00, 344192.63it/s]


69.60731560000022


In [8]:
# Join the two category levels into one "level1 > level2" label (`all_cat`),
# printing the elapsed seconds for polars first and pandas second.
cat_cols = ['level1_global_be_category', 'level2_global_be_category']

start = perf_counter()
all_cat_expr = pl.concat_str([pl.col(c) for c in cat_cols], separator=' > ').alias('all_cat')
df_pl = df_pl.with_columns(all_cat_expr)
print(perf_counter() - start)

start = perf_counter()
level1, level2 = (df_pd[c] for c in cat_cols)
df_pd['all_cat'] = level1 + ' > ' + level2
print(perf_counter() - start)

0.45946209999965504
0.906440400000065
