# **Pra-pemrosesan Data**

In [None]:
# Install the required libraries (IPython shell magics; only valid inside a notebook cell)
!pip install sastrawi
!pip install nltk

# Import library yang dibutuhkan
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from google.colab import files

# Download the NLTK tokenizer data
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Build the Sastrawi stop-word list and stemmer
stopword_factory = StopWordRemoverFactory()
stopwords = stopword_factory.get_stop_words()
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Load the tweet dataset
tweet_filename = 'crawling-data-merge.csv'
df = pd.read_csv(tweet_filename)

# Remember how many rows we started with so later steps can report what they removed
initial_data_count = df.shape[0]
print(f"Jumlah data awal: {initial_data_count:,}")

# Load the slang dictionary, previously downloaded as .csv (repo: insomniagung/kamus_kbba)
alay_df = pd.read_csv('kamus_slang.csv')
# Drop incomplete rows before casting: astype(str) would turn a missing cell into the
# literal word "nan", which would then be used as a slang key or as a replacement word.
# .copy() keeps the column assignments below from operating on a view of the raw frame.
alay_df = alay_df.dropna(subset=['slang', 'formal']).copy()
# Lower-case and trim both columns; keys with stray whitespace could never match the
# whitespace-split words they are looked up against
alay_df['slang'] = alay_df['slang'].astype(str).str.strip().str.lower()
alay_df['formal'] = alay_df['formal'].astype(str).str.strip().str.lower()
# Mapping: slang word -> formal word
alay_dict = dict(zip(alay_df['slang'], alay_df['formal']))

# Keyword-based tweet filter
# Keywords that must be present and keywords that must be absent
keyword_includes = ['coretax']     # at least one of these MUST appear in the text
keyword_excludes = []  # none of these may appear in the text

def should_keep(text):
    """Decide whether a tweet passes the keyword filter.

    The match is case-insensitive and whole-word: a keyword only counts when
    it is delimited by word boundaries.

    Args:
        text: Tweet text; a missing value (None/NaN) is rejected.

    Returns:
        True when the text contains at least one word from
        ``keyword_includes`` and no word from ``keyword_excludes``.
    """
    if pd.isnull(text):
        return False

    lowered = text.lower()

    def mentions_any(words):
        # True as soon as one keyword occurs in the text as a whole word
        for word in words:
            if re.search(rf'\b{re.escape(word)}\b', lowered):
                return True
        return False

    has_required = mentions_any(keyword_includes)
    has_forbidden = mentions_any(keyword_excludes)
    return has_required and not has_forbidden

# Keep only the rows that satisfy both keyword conditions
keep_mask = df['full_text'].apply(should_keep)
df = df[keep_mask]

# Row count after filtering
after_filter_count = df.shape[0]
print(f"Jumlah data setelah filtering: {after_filter_count:,}")
print(f"Data yang dibuang karena filtering: {initial_data_count - after_filter_count:,}")

# 2. Remove duplicate tweets (same full_text), then renumber the index
deduplicated = df.drop_duplicates(subset='full_text')
df = deduplicated.reset_index(drop=True)

# Row count after removing duplicates
after_dedup_count = df.shape[0]
print(f"Jumlah data setelah menghapus duplikat: {after_dedup_count:,}")
print(f"Data duplikat yang dihapus: {after_filter_count - after_dedup_count:,}")

# 3. Slang normalisation
def normalize_text(text):
    """Replace each whitespace-separated word found in ``alay_dict`` by its mapped value.

    Words without an entry are kept unchanged; the result is re-joined with
    single spaces.
    """
    normalized_words = []
    for word in text.split():
        normalized_words.append(alay_dict.get(word, word))
    return ' '.join(normalized_words)

# Collapse elongated characters
def remove_elongation(text):
    """Shrink every run of three or more identical word characters to a single one.

    Runs of exactly two identical characters are left untouched.
    """
    elongated_run = re.compile(r'(\w)\1{2,}')
    return elongated_run.sub(r'\1', text)

# Token counter
def count_tokens(text):
    """Return the number of tokens ``word_tokenize`` finds in ``text``.

    A missing value (None/NaN) or an empty string counts as 0 tokens.
    """
    if not pd.isnull(text) and text != '':
        return len(word_tokenize(text))
    return 0

# 4-10. Full preprocessing pipeline
def preprocessing(text):
    """Clean one raw tweet and return it as a space-joined string of stemmed tokens.

    Steps, in order: case folding, noise removal (mentions, '#' signs, URLs,
    HTML entities, every character that is not a letter or whitespace),
    elongation collapsing, slang normalisation, tokenisation, stop-word
    removal and stemming.

    Args:
        text: Raw tweet text; a missing value (None/NaN) is accepted.

    Returns:
        The cleaned text, or '' when ``text`` is missing.
    """
    if pd.isnull(text):
        return ''

    # 4. Case folding
    text = text.casefold()

    # 5. Noise removal
    text = re.sub(r'@\w+', '', text)  # drop @mentions
    text = re.sub(r'#(\w+)', r'\1', text)  # drop the '#' sign but keep the hashtag word
    text = re.sub(r'http\S+', '', text)  # drop URLs
    # Remove HTML named entities (e.g. "&amp;") as a whole; otherwise the non-letter
    # filter below strips only '&' and ';' and leaves "amp" behind as a bogus token
    text = re.sub(r'&(?:amp|lt|gt|quot|apos|nbsp);', ' ', text)
    # Replace everything that is not a letter or whitespace by a space; this also
    # removes every digit, so no separate digit pass is needed
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()  # squeeze repeated whitespace

    # 6. Collapse elongated characters
    text = remove_elongation(text)

    # 7. Slang normalisation
    text = normalize_text(text)

    # 8. Tokenisation
    tokens = word_tokenize(text)

    # 9. Stop-word removal
    tokens = [token for token in tokens if token not in stopwords]

    # 10. Stemming
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

# Count tokens in the raw tweets
print("\nMenghitung jumlah token sebelum preprocessing...")
raw_token_counts = df['full_text'].apply(count_tokens)
df['token_count_before'] = raw_token_counts
total_tokens_before = raw_token_counts.sum()

# Run the preprocessing pipeline on every tweet
print("Melakukan preprocessing...")
df['cleaned_text'] = df['full_text'].apply(preprocessing)

# Count tokens in the cleaned tweets
print("Menghitung jumlah token setelah preprocessing...")
clean_token_counts = df['cleaned_text'].apply(count_tokens)
df['token_count_after'] = clean_token_counts
total_tokens_after = clean_token_counts.sum()

# Drop rows whose text became empty after preprocessing, then renumber the index
has_content = df['cleaned_text'].str.strip() != ''
df = df[has_content].reset_index(drop=True)
final_data_count = len(df)

# Show the full statistics
print("\n" + "="*60)
print("STATISTIK DATA PREPROCESSING")
print("="*60)
print(f"Jumlah data awal: {initial_data_count:,}")
print(f"Jumlah data setelah filtering: {after_filter_count:,}")
print(f"Jumlah data setelah menghapus duplikat: {after_dedup_count:,}")
print(f"Jumlah data final (setelah preprocessing): {final_data_count:,}")
print(f"Total data yang dibuang: {initial_data_count - final_data_count:,}")
# Guard the ratio: an empty input file would otherwise raise ZeroDivisionError
remaining_data_pct = (final_data_count / initial_data_count) * 100 if initial_data_count else 0.0
print(f"Persentase data yang tersisa: {remaining_data_pct:.2f}%")

print("\n" + "="*60)
print("STATISTIK TOKEN/KATA")
print("="*60)
print(f"Total token sebelum preprocessing: {total_tokens_before:,}")
print(f"Total token setelah preprocessing: {total_tokens_after:,}")
print(f"Token yang berkurang: {total_tokens_before - total_tokens_after:,}")
# Same guard for the token ratio when no token was counted at all
remaining_token_pct = (total_tokens_after / total_tokens_before) * 100 if total_tokens_before else 0.0
print(f"Persentase token yang tersisa: {remaining_token_pct:.2f}%")
# The totals above were summed before the empty rows were dropped, so dividing them by
# final_data_count would mix two different populations. Average over the rows that
# were actually kept, so "before" and "after" describe the same tweets.
avg_tokens_before = df['token_count_before'].mean() if final_data_count else 0.0
avg_tokens_after = df['token_count_after'].mean() if final_data_count else 0.0
print(f"Rata-rata token per tweet sebelum: {avg_tokens_before:.2f}")
print(f"Rata-rata token per tweet setelah: {avg_tokens_after:.2f}")

# Save the preprocessing result and hand it to the browser for download
output_filename = 'data-hasil-preprocessing-final.csv'
df.to_csv(output_filename, index=False)
files.download(output_filename)

# Show up to three sample results
print("\n" + "="*60)
print("CONTOH HASIL PREPROCESSING")
print("="*60)
preview = df.iloc[:3]
preview_rows = zip(
    preview['token_count_before'],
    preview['full_text'],
    preview['token_count_after'],
    preview['cleaned_text'],
)
for sample_number, (tokens_before, raw_text, tokens_after, clean_text) in enumerate(preview_rows, start=1):
    print(f"\nContoh {sample_number}:")
    print(f"Teks Asli ({tokens_before} token):")
    print(f"'{raw_text}'")
    print(f"Teks Bersih ({tokens_after} token):")
    print(f"'{clean_text}'")
    print("-" * 40)

# Show the token-length distribution of the remaining rows
print("\n" + "="*60)
print("DISTRIBUSI PANJANG TOKEN")
print("="*60)
distribution_sections = (
    ("Sebelum Preprocessing:", 'token_count_before'),
    ("\nSetelah Preprocessing:", 'token_count_after'),
)
for section_title, count_column in distribution_sections:
    token_counts = df[count_column]
    print(section_title)
    print(f"  Min token: {token_counts.min()}")
    print(f"  Max token: {token_counts.max()}")
    print(f"  Median token: {token_counts.median():.2f}")

print("\nPreprocessing selesai! File telah disimpan dan diunduh.")