In [7]:
import pandas as pd
import re
import random
from google_play_scraper import Sort, reviews


# ============================================
# 1. SCRAPING 1000 ULASAN CHATGPT
# ============================================
APP_ID = 'com.openai.chatgpt'
JUMLAH_ULASAN = 1000

result, _ = reviews(
    APP_ID,
    lang='id',
    country='id',
    sort=Sort.NEWEST,
    count=JUMLAH_ULASAN
)

df = pd.DataFrame(result)
df = df[['content']]   # kolom ulasan saja


# ============================================
# 2. CLEANING TEKS
# ============================================
def bersihkan(teks):
    if pd.isna(teks):
        return ""
    teks = teks.lower()
    teks = re.sub(r'[^a-zA-Z0-9\s]', ' ', teks)
    teks = re.sub(r'\s+', ' ', teks).strip()
    return teks

df['clean'] = df['content'].apply(bersihkan)


# ============================================
# 3. LABELING SENTIMEN (Rule-based)
# ============================================
kata_positif = [
    "bagus", "puas", "cepat", "mantap", "keren", "baik", "berguna",
    "akurat", "recommended", "suka", "lancar", "bantu", "nyaman"
]

kata_negatif = [
    "buruk", "jelek", "lambat", "error", "crash", "mengecewakan",
    "tidak bisa", "bug", "lemot", "parah", "susah", "tidak berfungsi"
]

def label(teks):
    words = teks.split()
    pos = sum(1 for w in words if w in kata_positif)
    neg = sum(1 for w in words if w in kata_negatif)

    if pos > neg and pos > 0:
        return "Positif"
    elif neg > pos and neg > 0:
        return "Negatif"
    else:
        return "Netral"

df['sentimen'] = df['clean'].apply(label)

TARGET_POS = 100
TARGET_NEG = 100
TARGET_NET = 50

df_pos = df[df['sentimen'] == "Positif"]
df_neg = df[df['sentimen'] == "Negatif"]
df_net = df[df['sentimen'] == "Netral"]

# fungsi sesuaikan jumlah (pakai oversampling bila kurang)
def ambil_data(df_kat, target):
    if len(df_kat) >= target:
        return df_kat.sample(n=target, random_state=42)
    else:
        kekurangan = target - len(df_kat)
        tambahan = df_kat.sample(n=kekurangan, replace=True, random_state=42)
        return pd.concat([df_kat, tambahan])

final_pos = ambil_data(df_pos, TARGET_POS)
final_neg = ambil_data(df_neg, TARGET_NEG)
final_net = ambil_data(df_net, TARGET_NET)

# gabung final
final_df = pd.concat([final_pos, final_neg, final_net]).sample(frac=1, random_state=42)

print("\nJumlah Akhir:")
print(final_df['sentimen'].value_counts())

# simpan ke CSV
final_df.to_csv("dataset_chatgpt_100_pos_100_neg_50_net.csv", index=False, encoding='utf-8')

print("\nDataset selesai dibuat: dataset_chatgpt_100_pos_100_neg_50_net.csv")
print("Total data:", len(final_df))



Jumlah Akhir:
sentimen
Negatif    100
Positif    100
Netral      50
Name: count, dtype: int64

Dataset selesai dibuat: dataset_chatgpt_100_pos_100_neg_50_net.csv
Total data: 250


Labeling Self Training

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.linear_model import LogisticRegression

# ----------------------------------------------------
# 1. Load Data Ulasan (1000 ulasan hasil scraping)
# ----------------------------------------------------
df_raw = df.copy()

print("Total ulasan hasil scraping:", len(df_raw))

# ----------------------------------------------------
# 2. Siapkan 250 Data Berlabel Manual
#    (100 positif, 100 negatif, 50 netral)
# ----------------------------------------------------
df_labeled = pd.read_csv("dataset_chatgpt_100_pos_100_neg_50_net.csv")
sentiment_mapping = {"Positif": 2, "Netral": 1, "Negatif": 0}
df_labeled["label"] = df_labeled["sentimen"].map(sentiment_mapping)

print("Total data berlabel:", len(df_labeled))

# ----------------------------------------------------
# 3. Siapkan 750 Data Tidak Berlabel
# ----------------------------------------------------

df_unlabeled = df_raw.iloc[len(df_labeled):].copy()
df_unlabeled["label"] = -1  # wajib label -1 untuk self-training

print("Total data tidak berlabel (untuk self-training):", len(df_unlabeled))

# ----------------------------------------------------
# 4. Gabungkan Data Berlabel + Tidak Berlabel
# ----------------------------------------------------
df_all = pd.concat([df_labeled[['content', 'label']], df_unlabeled[['content', 'label']]], ignore_index=True)

X_text = df_all["content"].astype(str)   # kolom ulasan
y_label = df_all["label"].values

# ----------------------------------------------------
# 5. TF-IDF Vectorization
# ----------------------------------------------------
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X_text)

# ----------------------------------------------------
# 6. Model Dasar (Logistic Regression)
# ----------------------------------------------------
base_model = LogisticRegression(max_iter=1500)

# ----------------------------------------------------
# 7. Self-Training Classifier
#    threshold=0.80 → hanya mengambil prediksi yang yakin
# ----------------------------------------------------
self_training_model = SelfTrainingClassifier(
    base_model,
    threshold=0.80,
    verbose=True
)

# ----------------------------------------------------
# 8. Train Model Self-Training
# ----------------------------------------------------
print("\n⚡ Training Self-Training Classifier...")
self_training_model.fit(X_vectorized.toarray(), y_label)

print("\nSelf-training selesai!")

# ----------------------------------------------------
# 9. Hasil: Ambil Label Baru dari 750 Ulasan
# ----------------------------------------------------
df_all["predicted_label"] = self_training_model.predict(X_vectorized.toarray())

# Hanya ambil data yang awalnya tidak berlabel (-1)
df_new_labels = df_all[df_all["label"] == -1][["content", "predicted_label"]]

# Convert numerical predicted_label back to sentiment strings for output if desired
reverse_sentiment_mapping = {2: "Positif", 1: "Netral", 0: "Negatif"}
df_new_labels["predicted_sentimen"] = df_new_labels["predicted_label"].map(reverse_sentiment_mapping)

print("\nJumlah ulasan yang berhasil dilabel otomatis:", len(df_new_labels))

# ----------------------------------------------------
# 10. Simpan hasil Self-Training
# ----------------------------------------------------
df_new_labels.to_csv("hasil_label_self_training_750.csv", index=False, encoding="utf-8")
df_all.to_csv("gabungan_1000_dengan_label.csv", index=False, encoding="utf-8")

print("\n✔ File berhasil disimpan:")
print(" - hasil_label_self_training_750.csv")
print(" - gabungan_1000_dengan_label.csv")


Total ulasan hasil scraping: 1000
Total data berlabel: 250
Total data tidak berlabel (untuk self-training): 750

⚡ Training Self-Training Classifier...
End of iteration 1, added 127 new labels.
End of iteration 2, added 30 new labels.
End of iteration 3, added 9 new labels.
End of iteration 4, added 4 new labels.

Self-training selesai!

Jumlah ulasan yang berhasil dilabel otomatis: 750

✔ File berhasil disimpan:
 - hasil_label_self_training_750.csv
 - gabungan_1000_dengan_label.csv
