# In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# === Download NLTK data ===
print("Downloading NLTK data...")
# These two resources are requested on every run.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name, quiet=True)
# 'punkt_tab' is only fetched when a local lookup fails.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab', quiet=True)
print("✓ Download selesai!\n")

# === Initialise stopwords ===
# Kept as a set so membership tests during stopword removal are O(1).
stop_words = set(stopwords.words('english'))

# === Preprocessing function (no stemming) ===
# Patterns and the punctuation table are built once, not on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+|www\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
# ASCII punctuation plus common typographic marks (curly quotes, en/em dash,
# ellipsis) that string.punctuation does not contain.
_PUNCT_TABLE = str.maketrans(
    '', '', string.punctuation + '\u2018\u2019\u201c\u201d\u2013\u2014\u2026'
)


def preprocess_text(text):
    """Clean, lowercase, tokenize and stopword-filter a single text value.

    Steps, in order: strip HTML tags, URLs, @mentions, #hashtags and digits;
    delete punctuation; collapse whitespace; lowercase; tokenize with
    ``word_tokenize``; drop tokens found in the module-level ``stop_words``.
    No stemming is applied and repeated words are kept.

    Args:
        text: The raw cell value. Anything that is not a ``str`` (``None``,
            NaN, numbers, array-likes, ...) is treated as empty.

    Returns:
        A 3-tuple ``(tokens, cleaned_text, word_count)`` where ``tokens`` is
        the list of remaining tokens, ``cleaned_text`` is those tokens joined
        by single spaces, and ``word_count`` is ``len(tokens)``. Non-string
        input yields ``([], "", 0)``.
    """
    # A plain isinstance check covers None/NaN/numbers and, unlike
    # `pd.isna(text) or ...`, cannot raise on array-like cells.
    if not isinstance(text, str):
        return [], "", 0

    # 1) Remove HTML tags and noise. Markup, URLs, mentions and hashtags are
    # replaced by a space so the words around them are not glued together
    # (e.g. "one<br>two" must not become "onetwo").
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = _HASHTAG_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)
    text = text.translate(_PUNCT_TABLE)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # 2) Case folding
    text = text.lower()

    # 3) Tokenizing
    tokens = word_tokenize(text)

    # 4) Stopword removal (empty tokens are discarded as well)
    tokens = [word for word in tokens if word and word not in stop_words]

    # 5) Join the tokens back into a single string
    cleaned_text = ' '.join(tokens)
    word_count = len(tokens)

    return tokens, cleaned_text, word_count

# === Read the Excel file ===
input_file = '/content/quora_opinion_israel.xlsx'
print(f"📂 Membaca file: {input_file}")
df = pd.read_excel(input_file)
print(f"✓ Berhasil membaca {len(df)} baris data\n")

# === Choose the text column ===
# Prefer the 'opinion' column; otherwise fall back to the first column.
if 'opinion' in df.columns:
    text_column = 'opinion'
else:
    text_column = df.columns[0]
print(f"Menggunakan kolom: '{text_column}' untuk preprocessing\n")

# === Run preprocessing ===
print("🔄 Memproses teks...")
processed = df[text_column].apply(preprocess_text)
# Each result is a (tokens, clean_text, word_count) tuple; split it into
# one DataFrame column per position.
for position, column_name in enumerate(('tokens', 'clean_text', 'word_count')):
    df[column_name] = processed.apply(lambda result, idx=position: result[idx])
print("✓ Preprocessing selesai!\n")

# === Show a few example results ===
heavy_rule = "=" * 80
light_rule = "-" * 80
print(heavy_rule)
print("CONTOH HASIL PREPROCESSING:")
print(heavy_rule)
preview = df.head(3)
preview_rows = zip(preview['tokens'], preview['clean_text'], preview['word_count'])
for number, (row_tokens, row_text, row_count) in enumerate(preview_rows, start=1):
    print(f"\n📝 Data ke-{number}:")
    print(f"TOKENS      : {row_tokens[:10]} ...")
    print(f"CLEAN TEXT  : {row_text[:100]}...")
    print(f"WORD COUNT  : {row_count}")
    print(light_rule)

# === Save to Excel ===
output_file = '/content/quora_opinion_israel_CLEANED.xlsx'
print(f"\n💾 Menyimpan hasil ke: {output_file}")
# Only the three derived columns are written out.
df_output = df.loc[:, ['tokens', 'clean_text', 'word_count']].copy()
df_output.to_excel(output_file, index=False, engine='openpyxl')
print("✓ File berhasil disimpan!")

# === Verify the saved file ===
print("\n🔍 Memverifikasi file yang tersimpan...")
df_verify = pd.read_excel(output_file)
print(f"✓ File terverifikasi dengan {len(df_verify)} baris")
print(f"✓ Kolom: {list(df_verify.columns)}")

print("\n📄 Sample dari file yang TERSIMPAN:")
print(df_verify.head(2).to_string())

print("\n" + heavy_rule)
print("✅ PROSES SELESAI — Tidak ada stemming, duplikasi kata dipertahankan per dokumen!")
print(heavy_rule)
