In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# ============================
# 1. Load data
# ============================
# `full` is the complete dataset; `manual` is the manually labeled subset
# (250 rows in the recorded run of this cell).
full = pd.read_csv("/content/drive/MyDrive/SMT 7/Pemrosesan Teks Teori/Hasil_Preprocessing (1).csv")
manual = pd.read_csv("/content/drive/MyDrive/SMT 7/Pemrosesan Teks Teori/label_manual.csv")

# Name of the text column in the generated self_training_labels DataFrame.
text_col = "final_text"

# ============================
# 2. Split labeled and unlabeled data
# ============================
# Rows with a missing text or label are skipped: TfidfVectorizer raises on
# NaN documents and a missing label cannot serve as a training target.
# `manual` itself is left untouched so len(manual) still drives the split.
labeled = manual.dropna(subset=["DATA", "LABEL"])
labeled_texts = labeled["DATA"].astype(str)
labeled_labels = labeled["LABEL"]

# NOTE(review): this positional slice assumes the first len(manual) rows of
# `full` are exactly the manually labeled rows -- confirm against the data.
# NOTE(review): training text comes from manual["DATA"] while prediction text
# comes from full["full_text"]; verify both columns hold text preprocessed the
# same way, otherwise the TF-IDF vocabulary will not transfer well.
unlabeled_texts = full["full_text"].iloc[len(manual):].dropna().astype(str)

# ============================
# 3. TF-IDF Vectorizer
# ============================
# The vocabulary is learned from the labeled texts only; the unlabeled texts
# are projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_labeled = vectorizer.fit_transform(labeled_texts)
X_unlabeled = vectorizer.transform(unlabeled_texts)

# ============================
# 4. Train Logistic Regression model
# ============================
model = LogisticRegression(max_iter=1000)
model.fit(X_labeled, labeled_labels)

# ============================
# 5. Predict on the unlabeled data
# ============================
proba = model.predict_proba(X_unlabeled)
predictions = model.predict(X_unlabeled)

# Confidence score = highest class probability for each row.
confidence = proba.max(axis=1)

# ============================
# 6. Keep pseudo-labels with high confidence
# ============================
# With only two classes the maximum probability is always >= 0.5, so a
# threshold of 0.50 only filters rows when there are three or more classes.
threshold = 0.50
selected = confidence >= threshold

self_training_labels = pd.DataFrame({
    text_col: unlabeled_texts.to_numpy()[selected],
    "label": predictions[selected],
    "confidence": confidence[selected],
})

# ============================
# 7. Save pseudo-label results
# ============================
self_training_labels.to_csv("label_self_training.csv", index=False)

print("Jumlah data manual :", len(manual))
# The threshold shown in the message is derived from `threshold` so the
# report cannot drift from the value actually applied above.
print(f"Jumlah hasil self-training (conf>={threshold:.2f}):", len(self_training_labels))
print(self_training_labels.head())

Jumlah data manual : 250
Jumlah hasil self-training (conf>=0.50): 496
                                          final_text     label  confidence
0  @KangManto123 Moxel gini yg merusak niat awal MBG  negative    0.565045
1  Ini kek nyebut kasus keracunan mbg cuma 0 0001...  negative    0.522068
2  Yg begini ini yg harusnya jauh lebih mendesak ...  negative    0.632556
3  @kompascom Bisa kasih MBG puluhan triliun seme...  negative    0.544436
4            @hafsabadoobee mana mbg yg kau janjikan  negative    0.632596
