In [53]:
import pandas as pd
from datetime import datetime

In [54]:
df = pd.read_csv('../data/processed/hasil_sentimen.csv')
print("Data sebelum diproses:")
print(df.head())

Data sebelum diproses:
   id source                                               text  \
0   1      x  gusy mending sunscreen omg apa yang amaterasun...   
1   2      x  @yoenjaehyuk fw: hadalabo gokujyun (putih) / h...   
2   3      x  Aku nemu promo menarik di Lazada nih. Yuk coba...   
3   4      x  Haloooo aku maw preloved skincare pemakaian pr...   
4   5      x  Gusyyyy mungkin ada yg pake sunscreen amateras...   

                       created_at sentimen  skor_sentimen  
0  Mon Jun 16 03:20:32 +0000 2025  neutral       0.987218  
1  Mon Jun 16 01:24:07 +0000 2025  neutral       0.776992  
2  Sun Jun 15 17:27:30 +0000 2025  neutral       0.960741  
3  Sun Jun 15 12:16:36 +0000 2025  neutral       0.819093  
4  Sun Jun 15 11:31:34 +0000 2025  neutral       0.993792  


In [55]:
def convert_date(date_str):
    try:
        # Format dari Twitter atau X
        dt = datetime.strptime(date_str, "%a %b %d %H:%M:%S %z %Y")
        return dt.strftime("%Y-%m-%d")
    except:
        return date_str  # Jika gagal, biarkan tetap

df['created_at'] = df['created_at'].apply(convert_date)


In [56]:
def fix_date(date_str):
    date_str = str(date_str)
    known_formats = [
        "%a %b %d %H:%M:%S %z %Y",  # Twitter/X full format
        "%d/%m/%Y",                # Format seperti 31/05/2025
        "%Y-%m-%d"                 # Sudah benar
    ]
    for fmt in known_formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except:
            continue
    return date_str  # Jika tak cocok format, dikembalikan apa adanya

# Terapkan ke semua baris kolom created_at
df['created_at'] = df['created_at'].apply(fix_date)


In [57]:
print(df.head())

   id source                                               text  created_at  \
0   1      x  gusy mending sunscreen omg apa yang amaterasun...  2025-06-16   
1   2      x  @yoenjaehyuk fw: hadalabo gokujyun (putih) / h...  2025-06-16   
2   3      x  Aku nemu promo menarik di Lazada nih. Yuk coba...  2025-06-15   
3   4      x  Haloooo aku maw preloved skincare pemakaian pr...  2025-06-15   
4   5      x  Gusyyyy mungkin ada yg pake sunscreen amateras...  2025-06-15   

  sentimen  skor_sentimen  
0  neutral       0.987218  
1  neutral       0.776992  
2  neutral       0.960741  
3  neutral       0.819093  
4  neutral       0.993792  


In [58]:
import pandas as pd
import re
import string
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [59]:
stemmer = StemmerFactory().create_stemmer()
stopword_remover = StopWordRemoverFactory().create_stop_word_remover()

In [60]:
# Fungsi preprocessing lengkap
def preprocess(text):
    # Lowercase
    text = text.lower()
    # Hapus username
    text = re.sub(r'@\w+', '', text)
    # Hapus URL
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Hapus angka dan tanda baca
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Hapus whitespace berlebih
    text = text.strip()
    # Hapus stopword
    text = stopword_remover.remove(text)
    # Stemming
    text = stemmer.stem(text)
    return text


In [61]:
df["preprocessed_text"] = df["text"].apply(preprocess)

In [66]:
keywords_penjual = [
    # Kata jualan/promosi
    "jual", "harga", "diskon", "promo", "gratis", "murah", "hemat",
    "beli", "pesan", "checkout", "ready", "stock", "open po", "preorder",
    "limited", "termurah", "cepat", "promo gila", "borong", "paket", "cuma", "hanya",

    # Call to action
    "order", "dm", "chat", "klik", "kunjungi", "swipe", "cek link", "lihat link",
    "yuk cek", "yuk beli", "buruan", "dapetin", "dapatkan", "langsung aja", "langsung dm",

    # Transaksi dan kontak
    "transfer", "cod", "bayar", "rekening", "no rek", "ongkir", "wa", "whatsapp", "linktree",
    "tokopedia", "shopee", "lazada", "blibli", "checkout sekarang",

    # Istilah jualan preloved atau dropship
    "preloved", "second", "bekas", "bisa nego", "nego", "reseller", "dropship", "stok terbatas",

    # Penawaran dan tawaran
    "tawar", "tawaran", "grab", "ambil", "ambil sekarang", "open slot", "promo spesial"
]

def is_penjual(text):
    text = str(text).lower()
    return any(keyword in text for keyword in keywords_penjual)

In [67]:
df['is_penjual'] = df['preprocessed_text'].apply(is_penjual)
df_final = df[~df['is_penjual']].drop(columns=['is_penjual'])

In [68]:
df_final = df_final[['id', 'source', 'created_at', 'preprocessed_text', 'sentimen', 'skor_sentimen']]


In [69]:
os.makedirs('../data/processed', exist_ok=True)
df_final.to_csv('../data/processed/done.csv', index=False)

In [70]:
!pip install scikit-learn pandas




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\MyBook SAGA 12\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [78]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

In [80]:
df = pd.read_csv('../data/processed/done.csv')
df.head()

Unnamed: 0,id,source,created_at,preprocessed_text,sentimen,skor_sentimen
0,1,x,2025-06-16,gusy mending sunscreen omg apa amaterasun buat...,neutral,0.987218
1,10,x,2025-06-15,wts physical sunscreen amaterasun pakai sekali...,neutral,0.987875
2,15,x,2025-06-14,amaterasun uv sunscreen serum spf pa melindung...,neutral,0.801552
3,18,x,2025-06-14,amaterasun cb kak yg physical sunscreen,neutral,0.964037
4,20,x,2025-06-13,saranin sunscreen bisa nahan minyak ga longsor...,negative,0.684278


In [81]:
texts = df['preprocessed_text'].astype(str)
labels = df['sentimen']

In [82]:
le = LabelEncoder()
y = le.fit_transform(labels)

In [83]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(texts)
print("TF-IDF - Ukuran vektor fitur:", X_tfidf.shape)

TF-IDF - Ukuran vektor fitur: (548, 1179)


In [84]:
bow = CountVectorizer()
X_bow = bow.fit_transform(texts)
print("BoW - Ukuran vektor fitur:", X_bow.shape)

BoW - Ukuran vektor fitur: (548, 1179)


In [85]:
selector = SelectKBest(score_func=chi2, k=100)
X_selected = selector.fit_transform(X_tfidf, y)
print("Setelah Feature Selection (TF-IDF):", X_selected.shape)

Setelah Feature Selection (TF-IDF): (548, 100)


In [88]:
selector = SelectKBest(score_func=chi2, k=100)
X_selected = selector.fit_transform(X_bow, y)
print("Setelah Feature Selection (BOW):", X_selected.shape)

Setelah Feature Selection (BOW): (548, 100)
