In [2]:
!pip install pandas
!pip install Sastrawi
!pip install tqdm



In [3]:
import os
import pandas as pd
import re
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tqdm import tqdm
from multiprocessing import Pool, cpu_count, freeze_support
from tqdm import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

`Case Folding`

In [4]:
def case_folding(text):
    """Return ``text`` converted to lower case.

    Missing values (None/NaN) map to an empty string; any other value is
    passed through ``str()`` before lower-casing.
    """
    return "" if pd.isna(text) else str(text).lower()

def step1_casefolding(
    input_file="./device.csv",
    save_to="step_data",
    output_filename="step1_casefolding.csv"
):
    """Step 1: lower-case every text column of the raw device CSV.

    Args:
        input_file: Path of the raw CSV to read.
        save_to: Directory that receives the result (created if missing).
        output_filename: File name of the result inside ``save_to``.

    Returns:
        The case-folded DataFrame, or ``None`` when ``input_file`` does not exist.

    Raises:
        KeyError: If one of the required text columns is absent from the CSV.
    """
    os.makedirs(save_to, exist_ok=True)
    output_path = os.path.join(save_to, output_filename)

    if not os.path.exists(input_file):
        print(f"‚ùå File {input_file} tidak ditemukan.")
        return

    df = pd.read_csv(input_file)
    print(f"üîπ Melakukan Case Folding pada file: {input_file}")

    required_cols = [
        "title", "content", "brand", "model", "processor",
        "ram", "storage", "display", "camera", "battery",
        "os", "tags"
    ]

    for col in required_cols:
        # Deliberately no astype(str): it would turn missing values into the
        # literal text "nan" before case_folding can map them to "".
        df[col] = df[col].apply(case_folding)

    df.to_csv(output_path, index=False, encoding='utf-8')

    print(f"‚úÖ Step 1 (Case Folding) selesai ‚Äî hasil disimpan di: {output_path}")
    return df  # handed to Step 2 by the caller

# Run Step 1 with its default paths; the result is None when ./device.csv is missing.
df_step1 = step1_casefolding()

üîπ Melakukan Case Folding pada file: ./device.csv
‚úÖ Step 1 (Case Folding) selesai ‚Äî hasil disimpan di: step_data/step1_casefolding.csv


`Cleaning`

In [5]:
import pandas as pd
import os
import re
from tqdm import tqdm

def remove_noise(text):
    """Strip known boilerplate spans (iPhone 16e / Galaxy A16 / Redmi Note blurbs).

    Each span starts at a fixed lower-case marker phrase and runs up to the
    first following "tutup lengkap", newlines included. Non-string or missing
    input yields an empty string; otherwise the stripped remainder is returned.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""

    # One pattern per contaminating product description, applied in order.
    noise_patterns = (
        r'deskripsi detail komparasi spesifikasi iphone 16e.*?tutup lengkap',
        r'layar fhd super amoled galaxy a16.*?tutup lengkap',
        r'deskripsi spesifikasi flagship level camera 108mp.*?tutup lengkap',
    )

    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.DOTALL)

    return cleaned.strip()


def cleaning(text):
    """Remove boilerplate noise and symbols from one (already lower-cased) value.

    Missing values become "". Otherwise the value is stringified, passed
    through ``remove_noise``, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and whitespace runs are
    collapsed to single spaces.
    """
    if pd.isna(text):
        return ""

    # Noise removal is applied to every column, not only 'content'.
    without_noise = remove_noise(str(text))

    # Keep letters, digits and whitespace; everything else becomes a space.
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", without_noise)

    # Collapse repeated whitespace and trim the ends.
    return re.sub(r"\s+", " ", alnum_only).strip()

def step2_cleaning(
    df,  # DataFrame produced by Step 1
    save_to="step_data",
    output_filename="step2_cleaning.csv"
):
    """Step 2: apply ``cleaning`` to every text column and save the result.

    The input frame is left untouched; the work happens on a copy so the cell
    can be re-run without compounding changes on the Step 1 result.

    Args:
        df: DataFrame returned by Step 1 (``None`` aborts with a message).
        save_to: Directory that receives the result (created if missing).
        output_filename: File name of the result inside ``save_to``.

    Returns:
        The cleaned DataFrame, or ``None`` when ``df`` is ``None``.

    Raises:
        KeyError: If one of the required text columns is absent from ``df``.
    """
    os.makedirs(save_to, exist_ok=True)
    output_path = os.path.join(save_to, output_filename)

    if df is None:
        print("‚ùå DataFrame Step 1 tidak tersedia.")
        return

    print("üßπ Memulai Step 2: Cleaning Kritis (Hapus Noise & Simbol)...")

    required_cols = [
        "title", "content", "brand", "model", "processor",
        "ram", "storage", "display", "camera", "battery",
        "os", "tags"
    ]

    df_clean = df.copy()
    for col in tqdm(required_cols, desc="Applying Cleaning"):
        # cleaning() stringifies values itself and maps NaN to "", so no
        # astype(str) here (it would turn NaN into the text "nan").
        df_clean[col] = df_clean[col].apply(cleaning)

    df_clean.to_csv(output_path, index=False, encoding='utf-8')
    print(f"‚úÖ Step 2 (Cleaning Kritis) selesai ‚Äî hasil disimpan di: {output_path}")
    return df_clean

# Run Step 2 on the Step 1 DataFrame; output goes to step_data/step2_cleaning.csv by default.
step2_cleaning(df_step1)

üßπ Memulai Step 2: Cleaning Kritis (Hapus Noise & Simbol)...


Applying Cleaning: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<00:00, 52.73it/s]

‚úÖ Step 2 (Cleaning Kritis) selesai ‚Äî hasil disimpan di: step_data/step2_cleaning.csv





`Tokenizing`

In [6]:
def tokenizing(text):
    """Split a cleaned string into whitespace-delimited tokens.

    A space is inserted at every boundary between an ASCII digit and an ASCII
    letter first, so "4gb" yields ["4", "gb"] and "a16" yields ["a", "16"].
    Missing values (None/NaN) yield an empty list.
    """
    if pd.isna(text):
        return []

    # Zero-width match on digit|letter and letter|digit boundaries.
    boundary = r'(?<=[0-9])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[0-9])'
    return re.sub(boundary, ' ', text).split()

def step3_tokenizing(
    input_file="step_data/step2_cleaning.csv",
    output_file="step_data/step3_tokenizing.csv"
):
    """Step 3: tokenize every text column of the Step 2 CSV.

    Args:
        input_file: CSV written by Step 2.
        output_file: Destination CSV; each text cell holds the ``repr`` of a
            token list.

    Returns:
        None. Prints a message and returns early when ``input_file`` is missing.

    Raises:
        KeyError: If one of the required text columns is absent from the CSV.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(input_file):
        print(f"‚ùå File {input_file} tidak ditemukan.")
        return

    print(f"üîπ Membaca file: {input_file}")
    # Read everything as text so numeric-looking cells are tokenized as written.
    df = pd.read_csv(input_file, dtype=str)

    required_cols = [
        "title", "content", "brand", "model", "processor",
        "ram", "storage", "display", "camera", "battery",
        "os", "tags"
    ]

    print("‚úÇÔ∏è Melakukan tokenisasi teks...")

    for col in tqdm(required_cols, desc="Tokenizing"):
        # No astype(str): empty cells stay NaN, which tokenizing() maps to []
        # instead of producing the bogus token ['nan'].
        df[col] = df[col].apply(tokenizing)

    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"‚úÖ Step 3 selesai ‚Äî hasil disimpan di: {output_file}")


if __name__ == "__main__":
    # Entry point: tokenize the Step 2 output and write the Step 3 CSV.
    print("üöÄ Menjalankan Step 3 - Tokenizing...")
    step3_tokenizing(
        input_file="./step_data/step2_cleaning.csv",
        output_file="./step_data/step3_tokenizing.csv"
    )

üöÄ Menjalankan Step 3 - Tokenizing...
üîπ Membaca file: ./step_data/step2_cleaning.csv
‚úÇÔ∏è Melakukan tokenisasi teks...


Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<00:00, 28.08it/s]

‚úÖ Step 3 selesai ‚Äî hasil disimpan di: ./step_data/step3_tokenizing.csv





`Stopword Removal`

In [7]:
# File: step4_stopword.py
import ast
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Build the stopword set once from Sastrawi's stopword list; a set gives O(1) membership tests.
stop_factory = StopWordRemoverFactory()
stopwords_id = set(stop_factory.get_stop_words())

def parse_tokens_maybe(text):
    """Coerce a cell value into a list of non-empty, stripped token strings.

    Accepted inputs:
        * a list/tuple of tokens (items are stringified),
        * the ``repr`` of a list/tuple as stored in a CSV cell, e.g. "['a', 'b']",
        * a plain string, which is split on whitespace.

    Anything else (NaN, None, numbers) yields an empty list. A string that
    looks like a sequence literal but cannot be parsed falls back to
    whitespace splitting.
    """
    if isinstance(text, (list, tuple)):
        return [str(t).strip() for t in text if str(t).strip() != ""]
    if not isinstance(text, str):
        return []

    txt = text.strip()
    if (txt.startswith("[") and txt.endswith("]")) or txt.startswith(("('", "['")):
        try:
            parsed = ast.literal_eval(txt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal: treat as ordinary text below.
            parsed = None
        # Tuples are accepted too; the prefix check above explicitly lets "('" through.
        if isinstance(parsed, (list, tuple)):
            return [str(t).strip() for t in parsed if str(t).strip() != ""]

    return [t.strip() for t in txt.split() if t.strip() != ""]


def remove_stopwords(tokens):
    """Return the tokens of ``tokens`` that are not in ``stopwords_id``.

    ``tokens`` may be a list or its string form; it is normalized with
    ``parse_tokens_maybe`` first.
    """
    return [tok for tok in parse_tokens_maybe(tokens) if tok not in stopwords_id]


def step4_stopword(
    input_file="step_data/step3_tokenizing.csv",
    output_file="step_data/step4_stopword.csv"
):
    """Step 4: remove stopwords from the 'content' column only.

    Args:
        input_file: CSV written by Step 3 (token lists stored as strings).
        output_file: Destination CSV.

    Returns:
        None. Prints a message and returns early when ``input_file`` is missing.

    Raises:
        KeyError: If the CSV has no 'content' column.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(input_file):
        print(f"‚ùå File {input_file} tidak ditemukan.")
        return

    print(f"üîπ Membaca file: {input_file}")
    # Read as text so the stored list representations can be parsed back.
    df = pd.read_csv(input_file, dtype=str)

    content_cols = ["content"]

    print("üßπ Menghapus stopword hanya dari kolom 'content'...")

    # The other columns are left untouched.
    for col in tqdm(content_cols, desc="Removing Stopwords"):
        df[col] = df[col].apply(remove_stopwords)

    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"‚úÖ Step 4 selesai ‚Äî hasil disimpan di: {output_file}")


if __name__ == "__main__":
    # Entry point: filter stopwords from the Step 3 output and write the Step 4 CSV.
    print("üöÄ Menjalankan Step 4 - Stopword Removal...")
    step4_stopword(
        input_file="./step_data/step3_tokenizing.csv",
        output_file="./step_data/step4_stopword.csv"
    )

üöÄ Menjalankan Step 4 - Stopword Removal...
üîπ Membaca file: ./step_data/step3_tokenizing.csv
üßπ Menghapus stopword hanya dari kolom 'content'...


Removing Stopwords: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.08it/s]

‚úÖ Step 4 selesai ‚Äî hasil disimpan di: ./step_data/step4_stopword.csv





`Stemming`

In [8]:
# File: step5_stemming.py (Membutuhkan modul multiprocessing dan tqdm)
import ast
from multiprocessing import Pool, cpu_count, freeze_support

# Stemmer instance for the current process; stays None until init_stemmer() assigns it.
_stemmer = None

def init_stemmer():
    """Pool initializer: create a Sastrawi stemmer and store it in the global ``_stemmer``."""
    global _stemmer
    factory = StemmerFactory()
    _stemmer = factory.create_stemmer()


def stemming_worker(tokens):
    """Stem one 'content' cell and return the result as a single string.

    Args:
        tokens: A list of tokens, the string form of such a list
            (e.g. "['a', 'b']"), or a missing value (None/NaN).

    Returns:
        The stemmed text. Empty or missing input yields "". If stemming fails
        (including when the global ``_stemmer`` was never initialized) the
        unstemmed text is returned instead.
    """
    if isinstance(tokens, list):
        # str() guards against non-string items, which would break join().
        text = " ".join(str(tok) for tok in tokens)
    elif tokens is None or (isinstance(tokens, float) and tokens != tokens):
        # Missing cell: without this, str() would feed "None"/"nan" to the stemmer.
        text = ""
    else:
        # Strip the list punctuation left over from the CSV round trip.
        text = str(tokens).replace("[", "").replace("]", "").replace("'", "").replace(",", " ")

    stripped = text.strip()
    if not stripped:
        return ""

    try:
        return _stemmer.stem(stripped)
    except Exception:
        # Best effort: keep the row rather than abort the whole batch.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return text


def process_batch(df_batch, num_cores):
    """Stem the 'content' column of one batch in parallel.

    Args:
        df_batch: DataFrame chunk with a 'content' column; modified in place.
        num_cores: Number of worker processes for the pool.

    Returns:
        ``df_batch`` with 'content' replaced by the stemmed strings.
    """
    with Pool(num_cores, initializer=init_stemmer) as pool:
        # imap (ordered) is required here: imap_unordered yields results in
        # completion order, which would assign stemmed text to the wrong rows.
        konten_iter = pool.imap(stemming_worker, df_batch["content"])

        konten_stem = list(tqdm(
            konten_iter,
            total=len(df_batch),
            desc="üì∞ Stemming content",
            ncols=100,
            leave=False
        ))

    df_batch["content"] = konten_stem
    return df_batch


def step5_stemming_parallel_batch(
    input_file="step_data/step4_stopword.csv",
    output_file="step_data/step5_stemming.csv",
    batch_size=None,
):
    """Step 5: stem the 'content' column batch by batch using a process pool.

    Args:
        input_file: CSV written by Step 4.
        output_file: Destination CSV; overwritten on each run.
        batch_size: Rows per batch; defaults to 2000 when ``None``.

    Returns:
        None. Prints a message and returns early when ``input_file`` is missing.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(input_file):
        print(f"‚ùå File {input_file} tidak ditemukan.")
        return

    total_cores = cpu_count()
    num_cores = max(1, total_cores - 1)  # leave one core free
    if batch_size is None:
        batch_size = 2000

    print(f"üß† Menggunakan {num_cores} dari {total_cores} core CPU. Batch size: {batch_size}\n")

    # Informational only: counts physical lines minus the header, so quoted
    # multi-line cells make this larger than the real row count.
    with open(input_file, encoding="utf-8", errors="ignore") as f:
        total_rows = sum(1 for _ in f) - 1
    print(f"üìä Total baris: {total_rows:,}")

    batch_iter = pd.read_csv(input_file, chunksize=batch_size, dtype=str)
    is_first = True

    for i, df_chunk in enumerate(batch_iter, start=1):
        print(f"\nüîπ Memproses batch {i}")
        df_processed = process_batch(df_chunk, num_cores)

        df_processed.to_csv(
            output_file,
            # Truncate on the first batch so a re-run replaces the previous
            # output instead of appending a second header and duplicate rows.
            mode="w" if is_first else "a",
            index=False,
            header=is_first,
            encoding="utf-8",
        )

        is_first = False
        print(f"‚úÖ Batch {i} selesai ‚Üí disimpan")

    print("\nüéâ Semua batch selesai!")
    print(f"üìÅ Hasil akhir: {output_file}")


if __name__ == "__main__":
    # freeze_support() is called before any worker pool is created.
    freeze_support()
    print("üöÄ Menjalankan Step 5 - Stemming Parallel Batch...")
    # Entry point: stem the Step 4 output and write the Step 5 CSV.
    step5_stemming_parallel_batch(
        input_file="./step_data/step4_stopword.csv",
        output_file="./step_data/step5_stemming.csv",
    )

üöÄ Menjalankan Step 5 - Stemming Parallel Batch...
üß† Menggunakan 11 dari 12 core CPU. Batch size: 2000

üìä Total baris: 1,987

üîπ Memproses batch 1


                                                                                                    

‚úÖ Batch 1 selesai ‚Üí disimpan

üéâ Semua batch selesai!
üìÅ Hasil akhir: ./step_data/step5_stemming.csv




`Detokenisasi`

In [9]:
# File: step6_detokenized.py
import pandas as pd
import ast
import re

# ===========================
#   PARSE LIST SAFELY
# ===========================
def parse_list(text):
    """Turn a token list (or its string form) back into space-joined text.

    Args:
        text: A list, the ``repr`` of a list as stored in a CSV cell, a plain
            string, or a missing value.

    Returns:
        The items joined by single spaces for list input; "" for missing
        values (None/NaN); otherwise the value as a string, unchanged.
    """
    if isinstance(text, list):
        return " ".join(str(t) for t in text)

    if not isinstance(text, str):
        # Missing cells must not become the literal word "nan" in the text.
        try:
            is_missing = bool(pd.isna(text))
        except (TypeError, ValueError):
            is_missing = False
        return "" if is_missing else str(text)

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Ordinary text (e.g. already-stemmed content) is not a Python literal.
        return text

    if isinstance(parsed, list):
        return " ".join(str(t) for t in parsed)
    return text


# ===========================
#   NORMALIZER KHUSUS SPEK
# ===========================
def normalize_text(text):
    """Strip leftover list punctuation from a string and tidy its spacing.

    Commas and double quotes become spaces; square brackets and single quotes
    are dropped; whitespace runs collapse to one space; finally
    ``str.capitalize`` is applied (first character upper-cased, the rest
    lower-cased). Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Map each leftover punctuation character to a space or to nothing.
    punctuation_map = str.maketrans({
        ",": " ",
        '"': " ",
        "[": None,
        "]": None,
        "'": None,
    })
    single_spaced = " ".join(text.translate(punctuation_map).split())
    return single_spaced.capitalize()


# ===========================
#       MAIN FUNCTION
# ===========================
def step6_detokenize_normalize(
    input_path: str = "./step_data/step5_stemming.csv",
    output_path: str = "./step_data/step6_detokenized.csv"
):
    """Step 6: turn token lists back into plain text and normalize it.

    Args:
        input_path: CSV written by Step 5.
        output_path: Destination CSV; its directory is created if missing.

    Returns:
        None.
    """
    print(f"üìÇ Membaca file: {input_path}")
    # Read as text so pass-through columns are written back exactly as stored
    # instead of being reformatted by dtype inference.
    df = pd.read_csv(input_path, dtype=str)

    # Columns to detokenize ('content' is already stemmed text).
    text_cols = [
        "title", "content", "brand", "model", "processor",
        "ram", "storage", "display", "camera", "battery",
        "os", "tags"
    ]

    print("üßπ Detokenisasi dan Normalisasi semua kolom teks...")

    for col in tqdm(text_cols, desc="Detokenizing"):
        if col in df.columns:
            # Detokenize first, then normalize punctuation and spacing.
            df[col] = df[col].apply(parse_list).apply(normalize_text)

    # Create the output directory like the other steps do; dirname is "" for
    # a bare file name, which os.makedirs rejects.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"‚úÖ Step 6 (Detokenisasi & Normalisasi) selesai ‚Üí hasil disimpan ke {output_path}")

# Run Step 6 with its default input/output paths.
if __name__ == "__main__":
    step6_detokenize_normalize()

üìÇ Membaca file: ./step_data/step5_stemming.csv
üßπ Detokenisasi dan Normalisasi semua kolom teks...


Detokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<00:00, 29.15it/s]

‚úÖ Step 6 (Detokenisasi & Normalisasi) selesai ‚Üí hasil disimpan ke ./step_data/step6_detokenized.csv





In [13]:
# File: step7_finalization.py
import pandas as pd
import os
import re

def clean_and_normalize_price(price_str):
    """Convert a price cell into an integer amount, or ``None`` if unparseable.

    Handles:
        * missing values (None/NaN) -> ``None``;
        * numeric cells (e.g. ``1500000`` or ``1500000.0`` when pandas inferred
          a float column) -> truncated to ``int``;
        * strings such as "Rp 1.500.000" or "Rp1.500.000,00": the "rp" marker
          is removed, a trailing decimal group of one or two digits is dropped,
          and the remaining "." / "," thousands separators are stripped.

    Args:
        price_str: Raw value of the price column.

    Returns:
        The price as ``int``, or ``None`` when no digits-only value remains.
    """
    if pd.isna(price_str):
        return None

    if isinstance(price_str, bool):
        return None
    if not isinstance(price_str, str):
        # Numeric cell: converting via str() would turn 1500000.0 into
        # "15000000" once the "." is stripped as a thousands separator.
        try:
            return int(price_str)
        except (TypeError, ValueError, OverflowError):
            return None

    cleaned = price_str.lower().replace('rp', '').strip()
    # A separator followed by only 1-2 digits at the end is a decimal part
    # (",00" / ".0"), never a thousands group, so drop it.
    cleaned = re.sub(r'[.,]\d{1,2}$', '', cleaned)
    cleaned = cleaned.replace('.', '').replace(',', '').strip()
    return int(cleaned) if cleaned.isdigit() else None

def map_device_type(text):
    """Classify a device description by keyword lookup.

    The text is lower-cased and checked for substrings in priority order:
    Laptop, then Tablet, then Handphone. Missing values and texts without any
    keyword yield 'Lain'. Matching is plain substring search, so e.g. "hp"
    matches anywhere inside the text.
    """
    if pd.isna(text):
        return 'Lain'

    lowered = str(text).lower()

    # (label, keywords) pairs in priority order; the first hit wins.
    keyword_rules = (
        ('Laptop', ('laptop', 'notebook')),
        ('Tablet', ('tablet', 'matepad')),
        ('Handphone', ('hp', 'handphone', 'smartphone')),
    )
    for label, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return label

    return 'Lain'  # nothing recognized


def step7_finalization(
    input_file="./step_data/step6_detokenized.csv",
    output_file="./model/devices_df.csv"  # saved straight into the model folder
):
    """Step 7: derive the filter columns and the combined search text.

    Adds three columns to the Step 6 data:
        * ``harga_num``: numeric price parsed from 'harga';
        * ``device_type``: category derived from 'title' + 'tags';
        * ``combined_text_final``: the text columns joined with spaces.
    An existing 'combined_text' column is dropped.

    Args:
        input_file: CSV written by Step 6.
        output_file: Destination CSV.

    Returns:
        None. Prints a message and returns early when ``input_file`` is missing.

    Raises:
        KeyError: If 'harga', 'title' or 'tags' is absent from the CSV.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if not os.path.exists(input_file):
        print(f"‚ùå File {input_file} tidak ditemukan.")
        return

    print(f"üîπ Membaca file: {input_file}")
    df = pd.read_csv(input_file)

    # 1. Numeric price for filtering.
    print("üí∞ Mengekstrak harga numerik...")
    df["harga_num"] = df["harga"].apply(clean_and_normalize_price)

    # 2. Device type from title + tags (temporary helper column).
    df["device_identifier"] = df["title"].astype(str) + " " + df["tags"].astype(str)
    df["device_type"] = df["device_identifier"].apply(map_device_type)
    df = df.drop(columns=['device_identifier'])

    # 3. Combined text used for TF-IDF.
    print("üìù Membuat kolom teks gabungan (combined_text) untuk TF-IDF...")
    text_cols_for_search = [
        "title", "brand", "model", "processor", "ram", "storage",
        "display", "camera", "battery", "os", "content", "tags"
    ]
    # Resolve the available columns once instead of once per row.
    present_cols = [col for col in text_cols_for_search if col in df.columns]

    df["combined_text_final"] = df.apply(
        lambda row: " ".join(
            str(row[col]) for col in present_cols if pd.notna(row[col])
        ),
        axis=1
    )
    df["combined_text_final"] = df["combined_text_final"].apply(lambda x: re.sub(r'\s+', ' ', str(x)).strip())

    # Replace any stale combined_text column with the new one.
    if 'combined_text' in df.columns:
        df = df.drop(columns=['combined_text'])

    df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"‚úÖ Step 7 (Finalisasi Kritis) selesai ‚Äî hasil disimpan di: {output_file}")
    print("\nContoh data kolom filter kritis:")
    print(df[["title", "device_type", "harga_num", "combined_text_final"]].head(3).to_string(index=False))


if __name__ == "__main__":
    # Entry point: run Step 7 with its default input/output paths.
    step7_finalization()

üîπ Membaca file: ./step_data/step6_detokenized.csv
üí∞ Mengekstrak harga numerik...
üìù Membuat kolom teks gabungan (combined_text) untuk TF-IDF...
‚úÖ Step 7 (Finalisasi Kritis) selesai ‚Äî hasil disimpan di: ./model/devices_df.csv

Contoh data kolom filter kritis:
                                  title device_type  harga_num                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [14]:
# File: step8_modeling.py
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import pandas as pd
import os

def tokenize(text):
    """Split text into lower-cased whitespace-delimited tokens for Jaccard matching.

    Tokens are lower-cased so they agree with the TF-IDF vocabulary
    (``TfidfVectorizer`` lower-cases by default) and are not affected by the
    capitalization applied in Step 6. Missing values (None/NaN) yield an empty
    list rather than the token "nan".
    """
    if not isinstance(text, str):
        try:
            if pd.isna(text):
                return []
        except (TypeError, ValueError):
            pass  # non-scalar input: fall through and stringify it
    return str(text).lower().split()


def step8_remodel(
    input_file="./model/devices_df.csv",
    save_to="./model"
):
    """Step 8: fit TF-IDF on the combined text and save the search assets.

    Writes three pickles into ``save_to``: ``tfidf_vectorizer.pkl``,
    ``tfidf_matrix.pkl`` and ``jaccard_tokens.pkl``.

    Args:
        input_file: CSV written by Step 7 (needs 'combined_text_final').
        save_to: Directory that receives the pickles (created if missing).

    Returns:
        None. Prints a message and returns early when the input file or the
        'combined_text_final' column is missing.
    """
    print("üîπ Memuat data final untuk pemodelan ulang...")

    if not os.path.exists(input_file):
        print(f"‚ùå File input {input_file} tidak ditemukan. Pastikan Step 7 sudah dijalankan.")
        return

    df = pd.read_csv(input_file)

    if "combined_text_final" not in df.columns:
        print("‚ùå Kolom 'combined_text_final' tidak ditemukan. Jalankan Step 7 terlebih dahulu.")
        return

    os.makedirs(save_to, exist_ok=True)

    # Rows whose combined text was empty come back from the CSV as NaN, which
    # TfidfVectorizer rejects; treat them as empty documents instead.
    corpus = df["combined_text_final"].fillna("").astype(str)

    # --- 1. TF-IDF model ---
    print("\n1. Membuat Model TF-IDF...")
    tfidf_vectorizer = TfidfVectorizer(
        max_features=50000,
        ngram_range=(1, 2)
    )

    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    print(f"‚úÖ TF-IDF Matrix dibuat dengan shape: {tfidf_matrix.shape}")

    # Persist the TF-IDF assets.
    with open(os.path.join(save_to, "tfidf_vectorizer.pkl"), "wb") as f:
        pickle.dump(tfidf_vectorizer, f)
    with open(os.path.join(save_to, "tfidf_matrix.pkl"), "wb") as f:
        pickle.dump(tfidf_matrix, f)
    print("üíæ Model TF-IDF berhasil disimpan.")

    # --- 2. Jaccard preparation ---
    print("\n2. Mempersiapkan Token untuk Jaccard Similarity...")

    # One token list per document, in the same row order as the TF-IDF matrix.
    jaccard_tokens = corpus.apply(tokenize).tolist()

    # Persist the Jaccard asset.
    with open(os.path.join(save_to, "jaccard_tokens.pkl"), "wb") as f:
        pickle.dump(jaccard_tokens, f)

    print(f"‚úÖ Jaccard Tokens berhasil dibuat. Total {len(jaccard_tokens)} dokumen.")
    print("üíæ Jaccard tokens berhasil disimpan.")

    print("\nüéâ Step 8 (Modeling Ulang) selesai.")

if __name__ == "__main__":
    # Entry point: run Step 8 with its default input file and output directory.
    step8_remodel()

üîπ Memuat data final untuk pemodelan ulang...

1. Membuat Model TF-IDF...
‚úÖ TF-IDF Matrix dibuat dengan shape: (1987, 50000)
üíæ Model TF-IDF berhasil disimpan.

2. Mempersiapkan Token untuk Jaccard Similarity...
‚úÖ Jaccard Tokens berhasil dibuat. Total 1987 dokumen.
üíæ Jaccard tokens berhasil disimpan.

üéâ Step 8 (Modeling Ulang) selesai.
