In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Contoh: pastikan sudah download resource
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))  # Atau 'english' kalau teksnya Inggris
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to C:\Users\MyBook SAGA
[nltk_data]     12\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv("../data/raw/gabungan.csv", encoding='latin1')
df.head()


Unnamed: 0,id,source,text,created_at
0,1,x,gusy mending sunscreen omg apa yang amaterasun...,Mon Jun 16 03:20:32 +0000 2025
1,2,x,@yoenjaehyuk fw: hadalabo gokujyun (putih) / h...,Mon Jun 16 01:24:07 +0000 2025
2,3,x,Aku nemu promo menarik di Lazada nih. Yuk coba...,Sun Jun 15 17:27:30 +0000 2025
3,4,x,Haloooo aku maw preloved skincare pemakaian pr...,Sun Jun 15 12:16:36 +0000 2025
4,5,x,Gusyyyy mungkin ada yg pake sunscreen amateras...,Sun Jun 15 11:31:34 +0000 2025


In [4]:
# Daftar kata yang menandakan teks promosi/penjual
kata_penjual = [
    "jual", "diskon", "beli", "tokopedia", "shopee", "harga", "murah",
    "preloved", "produk", "produkku", "ready", "available", "order", "link", "promo"
]

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Hilangkan URL
    text = re.sub(r'http\S+|www.\S+', '', text)  # hapus https atau www tautan
    # Hilangkan karakter non-alfabet
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenizing
    tokens = text.split()
    # Stopword removal
    tokens = [word for word in tokens if word not in stop_words]
    # Hapus kata penjual
    tokens = [word for word in tokens if word not in kata_penjual]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

In [5]:
df.head()


Unnamed: 0,id,source,text,created_at
0,1,x,gusy mending sunscreen omg apa yang amaterasun...,Mon Jun 16 03:20:32 +0000 2025
1,2,x,@yoenjaehyuk fw: hadalabo gokujyun (putih) / h...,Mon Jun 16 01:24:07 +0000 2025
2,3,x,Aku nemu promo menarik di Lazada nih. Yuk coba...,Sun Jun 15 17:27:30 +0000 2025
3,4,x,Haloooo aku maw preloved skincare pemakaian pr...,Sun Jun 15 12:16:36 +0000 2025
4,5,x,Gusyyyy mungkin ada yg pake sunscreen amateras...,Sun Jun 15 11:31:34 +0000 2025


In [6]:
# Pastikan kolom tanggal sudah benar, misalnya bernama 'created_at'
# Ganti 'created_at' jika nama kolom berbeda
def convert_date(date_str):
    try:
        # Format dari Twitter atau X
        dt = datetime.strptime(date_str, "%a %b %d %H:%M:%S %z %Y")
        return dt.strftime("%Y-%m-%d")
    except:
        return date_str  # Jika gagal, biarkan tetap

df['created_at'] = df['created_at'].apply(convert_date)

In [7]:
# Fungsi konversi berbagai format ke YYYY-MM-DD
def fix_date(date_str):
    date_str = str(date_str)
    known_formats = [
        "%a %b %d %H:%M:%S %z %Y",  # Twitter/X full format
        "%d/%m/%Y",                # Format seperti 31/05/2025
        "%Y-%m-%d"                 # Sudah benar
    ]
    for fmt in known_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except:
            continue
    return date_str  # Jika tak cocok format, dikembalikan apa adanya

In [8]:
df.head()


Unnamed: 0,id,source,text,created_at
0,1,x,gusy mending sunscreen omg apa yang amaterasun...,Mon Jun 16 03:20:32 +0000 2025
1,2,x,@yoenjaehyuk fw: hadalabo gokujyun (putih) / h...,Mon Jun 16 01:24:07 +0000 2025
2,3,x,Aku nemu promo menarik di Lazada nih. Yuk coba...,Sun Jun 15 17:27:30 +0000 2025
3,4,x,Haloooo aku maw preloved skincare pemakaian pr...,Sun Jun 15 12:16:36 +0000 2025
4,5,x,Gusyyyy mungkin ada yg pake sunscreen amateras...,Sun Jun 15 11:31:34 +0000 2025


In [9]:
pip install pandas tqdm

Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

# 1. MUAT MODEL
# ----------------------------------------------------
print("Memuat model dan tokenizer...")
pretrained = "mdhugol/indonesia-bert-sentiment-classification"
model = AutoModelForSequenceClassification.from_pretrained(pretrained)
tokenizer = AutoTokenizer.from_pretrained(pretrained)

sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
label_index = {'LABEL_0': 'positive', 'LABEL_1': 'neutral', 'LABEL_2': 'negative'}
print("Model berhasil dimuat.")

# 2. ANALISIS SENTIMEN
# ----------------------------------------------------
text_column_name = 'text'  # pastikan kolom ini ada di df

def analyze_sentiment(text):
    if not isinstance(text, str):
        return 'neutral', 0.0
    try:
        result = sentiment_analysis(text)[0]
        label = label_index[result['label']]
        score = result['score']
        return label, score
    except Exception as e:
        print(f"Error saat menganalisis teks: '{text}'. Error: {e}")
        return 'error', 0.0

# Tambahkan tqdm progress bar
tqdm.pandas(desc="Menganalisis sentimen")

# Terapkan ke kolom text
results = df[text_column_name].progress_apply(analyze_sentiment)
df[['sentimen', 'skor_sentimen']] = pd.DataFrame(results.tolist(), index=df.index)

# 3. LABEL PENJUAL
# ----------------------------------------------------
kata_penjual = [
    'jual', 'beli', 'harga', 'diskon', 'promo', 'tokopedia',
    'shopee', 'link', 'ready', 'order', 'preloved', 'produk', 'gratis', 'tersedia'
]

def deteksi_penjual(text):
    if not isinstance(text, str):
        return False
    text = text.lower()
    if 'http' in text or 'www.' in text:
        return True
    return any(kata in text for kata in kata_penjual)

df['is_penjual'] = df[text_column_name].apply(deteksi_penjual)

# 4. SIMPAN HASIL
# ----------------------------------------------------
output_csv_path = 'done.csv'
print(f"Menyimpan hasil ke: {output_csv_path}")
df.to_csv(output_csv_path, index=False)
print("Proses selesai!")


Memuat model dan tokenizer...
