In [3]:
import pandas as pd


# Merge step: stack the two source CSVs into a single frame and persist it.
# NOTE(review): judging by the file names, the first file presumably holds the
# manually labelled rows and the second the remaining rows 251-1000 -- confirm.
df_manual = pd.read_csv("BCA_labeled_progress.csv")
df_251_1000 = pd.read_csv("BCA_data_251_1000.csv")

# ignore_index=True discards both original indexes and renumbers rows 0..n-1.
frames_to_stack = [df_manual, df_251_1000]
df_final = pd.concat(frames_to_stack, ignore_index=True)

# Written without the index column so the file round-trips cleanly via read_csv.
df_final.to_csv("BCA_manual250_plus_251_1000.csv", index=False)

# Reports the (rows, columns) shape tuple of the merged frame.
print("Selesai. Total baris:", df_final.shape)


Selesai. Total baris: (1000, 2)


In [6]:
import numpy as np

# Load the merged dataset produced by the earlier concat step.
df = pd.read_csv("BCA_manual250_plus_251_1000.csv")

# Make every content value a string. Missing values are replaced with ""
# first: a bare astype(str) converts NaN into the literal text "nan", which
# the TF-IDF step would then treat as a genuine word.
df["content"] = df["content"].fillna("").astype(str)

# Normalise textual "no label" placeholders to NaN so that notna()/isna()
# can be used later to tell labelled rows from unlabelled ones.
df["label"] = df["label"].replace(["None", "none", "", " "], np.nan)

In [7]:

# Create two working copies of the cleaned "label" column in one statement.
# Chained assignment binds targets left to right, so "label_manual" is added
# to the frame before "label_self" (column order is preserved).
#   label_manual -- copy of "label" taken before any pseudo-labelling
#   label_self   -- the column the self-training loop reads and fills in
df["label_manual"] = df["label_self"] = df["label"]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Classifier used by the self-training loop. It is only constructed here;
# fitting happens later, once per round.
model = LogisticRegression(n_jobs=-1, max_iter=1000)

# TF-IDF features over unigrams and bigrams. Terms appearing in fewer than
# 3 documents are dropped and the vocabulary is capped at 20000 entries.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_features=20000)

# The vocabulary is learned from every row of df["content"] (labelled and
# unlabelled alike); row i of X corresponds to row position i of df.
X = vectorizer.fit_transform(df["content"])


In [10]:
import numpy as np

# Self-training (pseudo-labelling) loop.
#
# Each round:
#   1. fit `model` on every row whose "label_self" is currently set,
#   2. predict class probabilities for the rows that are still unlabelled,
#   3. copy the predicted class into "label_self" for rows whose highest
#      class probability reaches `threshold`.
# The loop ends when nothing is left to label, when no prediction is
# confident enough, or after `max_rounds` rounds.
#
# Reads:  df["label_self"], X (row-aligned with df), model
# Writes: df["label_self"] (in place)
threshold = 0.90   # minimum top-class probability to accept a pseudo-label
max_rounds = 10    # upper bound on the number of fit/predict rounds

# Position of the "label_self" column, so rows can be written with .iloc.
label_self_col = df.columns.get_loc("label_self")

# The loop variable is deliberately not called `round`: it lives in the
# notebook namespace and would shadow the built-in round() for later cells.
for round_no in range(1, max_rounds + 1):
    print(f"\n=== ROUND {round_no} ===")

    # Row *positions* (0..n-1) of labelled / unlabelled rows. Positions are
    # what the sparse matrix X is indexed by, and they are used with .iloc
    # below so the loop stays correct even if df does not carry a default
    # RangeIndex (e.g. after filtering rows).
    has_label = df["label_self"].notna().to_numpy()
    labeled_idx   = np.where(has_label)[0]
    unlabeled_idx = np.where(~has_label)[0]

    print("  Labeled   :", len(labeled_idx))
    print("  Unlabeled :", len(unlabeled_idx))

    if len(unlabeled_idx) == 0:
        print("  Tidak ada data unlabeled tersisa.")
        break

    # Split features and targets by position.
    X_labeled   = X[labeled_idx]
    y_labeled   = df["label_self"].iloc[labeled_idx].astype(int)

    X_unlabeled = X[unlabeled_idx]

    # Refit from scratch on all labels collected so far (manual + pseudo).
    model.fit(X_labeled, y_labeled)

    # Predict the still-unlabelled rows.
    proba = model.predict_proba(X_unlabeled)
    preds = model.predict(X_unlabeled)

    # Confidence of a prediction = probability of its most likely class.
    max_proba = proba.max(axis=1)
    selected = max_proba >= threshold

    if not selected.any():
        print("  Tidak ada pseudo-label dengan confidence tinggi. Stop.")
        break

    # Positions and predicted classes of the rows that pass the threshold.
    new_idx = unlabeled_idx[selected]
    new_labels = preds[selected]

    # Store the pseudo-labels (positional write, matching how new_idx was built).
    df.iloc[new_idx, label_self_col] = new_labels

    print("  Pseudo-label baru:", len(new_idx))

print("\nSelf-training selesai.")



=== ROUND 1 ===
  Labeled   : 250
  Unlabeled : 750
  Pseudo-label baru: 170

=== ROUND 2 ===
  Labeled   : 420
  Unlabeled : 580
  Pseudo-label baru: 223

=== ROUND 3 ===
  Labeled   : 643
  Unlabeled : 357
  Pseudo-label baru: 172

=== ROUND 4 ===
  Labeled   : 815
  Unlabeled : 185
  Pseudo-label baru: 78

=== ROUND 5 ===
  Labeled   : 893
  Unlabeled : 107
  Pseudo-label baru: 19

=== ROUND 6 ===
  Labeled   : 912
  Unlabeled : 88
  Pseudo-label baru: 4

=== ROUND 7 ===
  Labeled   : 916
  Unlabeled : 84
  Tidak ada pseudo-label dengan confidence tinggi. Stop.

Self-training selesai.


In [11]:
# Persist the frame, including the "label_manual" and "label_self" columns,
# and echo the destination file name.
output_csv = "BCA_self_training_output.csv"
df.to_csv(output_csv, index=False)
print(f"\nSelf-training selesai → file: {output_csv}")



Self-training selesai → file: BCA_self_training_output.csv
