# Import Libraries

In [14]:
import os
import pandas as pd

# Data Loading

In [15]:
def data_loading(root, category):
    """Load the product list for one category.

    The file is expected at ``<root>/<category>/<category>.csv``.

    Parameters
    ----------
    root : str
        Directory that contains one sub-folder per category.
    category : str
        Category name; used both as the folder name and the CSV base name.

    Returns
    -------
    pandas.DataFrame
        The contents of the category's CSV file, as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If the expected CSV file does not exist.
    """
    # Honour the caller-supplied root instead of a hard-coded "dataset" folder.
    folder_path = os.path.join(root, category)
    product_list_path = os.path.join(folder_path, f"{category}.csv")
    return pd.read_csv(product_list_path)

In [16]:
root_path = "dataset"

# Every category is a sub-directory of root_path. Keep only visible
# directories so stray files or hidden folders (e.g. .DS_Store,
# .ipynb_checkpoints) are not mistaken for categories.
category_names = [
    entry
    for entry in os.listdir(root_path)
    if not entry.startswith(".") and os.path.isdir(os.path.join(root_path, entry))
]

print(category_names)

['Elektronik', 'Aksesoris', 'Kesehatan', 'Kecantikan', 'Alas_kaki', 'Pakaian_Pria', 'Pakaian_Wanita']


In [17]:
# One raw DataFrame per category, keyed by the category (folder) name.
all_dfs = {}
for category_name in category_names:
    all_dfs[category_name] = data_loading(root_path, category_name)

In [18]:
# Summarise the raw 'Elektronik' frame: column names, non-null counts and dtypes.
all_dfs['Elektronik'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Product Name    38 non-null     object
 1   Current Price   38 non-null     object
 2   Original Price  38 non-null     object
 3   Image URL       38 non-null     object
 4   Stock           38 non-null     object
 5   item_id         38 non-null     int64 
dtypes: int64(1), object(5)
memory usage: 1.9+ KB


In [19]:
# Preview the first rows of the 'Elektronik' frame as loaded from CSV.
all_dfs['Elektronik'].head()

Unnamed: 0,Product Name,Current Price,Original Price,Image URL,Stock,item_id
0,Infinix GT 20 Pro 5G | Infinix GT 30 Pro 5G [8...,Rp3.999.000,-,https://images.tokopedia.net/img/cache/500-squ...,Sisa3,1
1,[Hot] itel P55 5G RAM 6+128GB - 5G Network - 5...,Rp1.299.000,Rp1.999.000,https://images.tokopedia.net/img/cache/500-squ...,5.447,2
2,Xiaomi POCO X6 Pro 5G 12/512 12/256 GB Garansi...,Rp3.554.000,Rp4.099.000,https://images.tokopedia.net/img/cache/500-squ...,Habis,3
3,vivo iQOO 13 (12/256) - 1st Snapdragon 8 elite...,Rp9.999.000,-,https://images.tokopedia.net/img/cache/500-squ...,64,4
4,Tecno Spark Go 1 4/128 4/64 Garansi Resmi New ...,Rp1.029.000,-,https://images.tokopedia.net/img/cache/500-squ...,30,5


# Preprocessing

In [20]:
def parse_rupiah_to_int(value):
    """Convert a scraped price/stock string to an integer.

    Handles the formats seen in the data, e.g. ``'578 rb'``, ``'1,5 rb'``,
    ``'2k'``, ``'5.447'``, ``'Sisa 3'`` and ``'Habis'``.

    Parameters
    ----------
    value : object
        Raw cell value. Anything that is not a string is converted with ``str``.

    Returns
    -------
    int
        The parsed number. Missing values, ``'-'``, ``'habis'``, empty strings
        and anything that cannot be parsed all yield ``0``.
    """
    if pd.isna(value):
        return 0

    s = str(value).lower().strip()

    # Sold out / no value.
    if s in ('-', 'habis', ''):
        return 0

    # 'rb' (ribu) and 'k' both mean "thousand". 'rb' is checked first so the
    # precedence matches the plain-number fallback below.
    for suffix in ('rb', 'k'):
        if suffix in s:
            # With a thousand suffix the number may carry a decimal part.
            # Indonesian formatting writes it with a comma ('1,5 rb'), so the
            # comma is mapped to '.' rather than dropped, which would turn
            # 1,5 into 15.
            num_part = (
                s.replace(suffix, '')
                .replace('sisa', '')
                .replace(',', '.')
                .strip()
            )
            try:
                # round() instead of int(): float multiplication can land just
                # below the intended integer (e.g. 1.005 * 1000), and int()
                # would truncate that to one less.
                return round(float(num_part) * 1000)
            except (ValueError, OverflowError):
                return 0

    # Plain number: '.' and ',' are thousands separators here; 'sisa' is the
    # "remaining stock" label.
    num_part = s.replace('sisa', '').replace('.', '').replace(',', '').strip()
    try:
        return int(num_part)
    except ValueError:
        return 0

# Price-specific wrapper: strips the 'Rp' currency prefix, then defers to the
# generic number parser (which also copes with an 'rb' suffix, should one appear).
def clean_price(price):
    """Convert a price string such as ``'Rp3.999.000'`` to an integer.

    Missing values, ``'-'`` and blank strings give ``0``; everything else has
    its ``rp`` prefix removed and is handed to ``parse_rupiah_to_int``.
    """
    if pd.isna(price):
        return 0

    text = str(price).lower().strip()
    if text in ('-', ''):
        return 0

    without_currency = text.replace('rp', '').strip()
    return parse_rupiah_to_int(without_currency)

In [21]:
def _discount_percent(row):
    """Percentage saved versus the original price; 0 when no original price is listed."""
    if row['Original Price'] > 0:
        return round((1 - row['Current Price'] / row['Original Price']) * 100, 2)
    return 0


for name, df in all_dfs.items():
    print(f"Category: {name}")

    # Turn the scraped text columns into integers (the frames are updated in place).
    for price_col in ('Current Price', 'Original Price'):
        df[price_col] = df[price_col].apply(clean_price)
    df['Stock'] = df['Stock'].apply(parse_rupiah_to_int)

    # Row-wise discount, computed from the cleaned price columns.
    df['discount'] = df.apply(_discount_percent, axis=1)

Category: Elektronik
Category: Aksesoris
Category: Kesehatan
Category: Kecantikan
Category: Alas_kaki
Category: Pakaian_Pria
Category: Pakaian_Wanita


In [22]:
# Preview the 'Elektronik' frame after cleaning: prices and stock are integers
# and the 'discount' column has been added.
all_dfs['Elektronik'].head()

Unnamed: 0,Product Name,Current Price,Original Price,Image URL,Stock,item_id,discount
0,Infinix GT 20 Pro 5G | Infinix GT 30 Pro 5G [8...,3999000,0,https://images.tokopedia.net/img/cache/500-squ...,3,1,0.0
1,[Hot] itel P55 5G RAM 6+128GB - 5G Network - 5...,1299000,1999000,https://images.tokopedia.net/img/cache/500-squ...,5447,2,35.02
2,Xiaomi POCO X6 Pro 5G 12/512 12/256 GB Garansi...,3554000,4099000,https://images.tokopedia.net/img/cache/500-squ...,0,3,13.3
3,vivo iQOO 13 (12/256) - 1st Snapdragon 8 elite...,9999000,0,https://images.tokopedia.net/img/cache/500-squ...,64,4,0.0
4,Tecno Spark Go 1 4/128 4/64 Garansi Resmi New ...,1029000,0,https://images.tokopedia.net/img/cache/500-squ...,30,5,0.0


#  Save

In [23]:
# Write each cleaned category to dataset_preprocessed/<name>/<name>.csv.
for name, df in all_dfs.items():
    category_dir = os.path.join("dataset_preprocessed", name)
    destination = os.path.join(category_dir, f"{name}.csv")

    # exist_ok=True: do not fail when the folder is already there.
    os.makedirs(category_dir, exist_ok=True)

    df.to_csv(destination, index=False)
    print(f"Saved: {destination}")

Saved: dataset_preprocessed/Elektronik/Elektronik.csv
Saved: dataset_preprocessed/Aksesoris/Aksesoris.csv
Saved: dataset_preprocessed/Kesehatan/Kesehatan.csv
Saved: dataset_preprocessed/Kecantikan/Kecantikan.csv
Saved: dataset_preprocessed/Alas_kaki/Alas_kaki.csv
Saved: dataset_preprocessed/Pakaian_Pria/Pakaian_Pria.csv
Saved: dataset_preprocessed/Pakaian_Wanita/Pakaian_Wanita.csv
