In [3]:
import pandas as pd


# Merge the two source CSVs into a single dataset on disk.
# Judging by the file names, the first holds the manually labelled rows and
# the second holds rows 251-1000 — presumably unlabeled; confirm against the data.
df_manual, df_251_1000 = (
    pd.read_csv(csv_path)
    for csv_path in ("BCA_labeled_progress.csv", "BCA_data_251_1000.csv")
)

# Stack the frames vertically; ignore_index=True renumbers the rows from 0.
df_final = pd.concat((df_manual, df_251_1000), ignore_index=True)

# Persist the merged data without writing the index as an extra column.
df_final.to_csv("BCA_manual250_plus_251_1000.csv", index=False)

# Note: this prints the full (rows, columns) shape, not only the row count.
print("Selesai. Total baris:", df_final.shape)


Selesai. Total baris: (1000, 2)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

In [13]:
# Load the merged dataset and normalise the text and label columns.
df = pd.read_csv("BCA_manual250_plus_251_1000.csv")

# Missing review text must become an empty string first: a bare astype(str)
# turns NaN into the literal word "nan", which a text vectorizer would then
# treat as a real token.
df["content"] = df["content"].fillna("").astype(str)

# Guarantee that a "label" column exists even when the file carries none.
if "label" not in df.columns:
    df["label"] = np.nan

# Map textual placeholders for "no label" to a real NaN so that
# notna()/isna() can separate labelled rows from unlabeled ones.
df["label"] = df["label"].replace(
    ["None", "none", "", " ", "nan", "NaN"],
    np.nan
)

print("Shape awal:", df.shape)
print("Jumlah label manual (tidak NaN) di kolom 'label':", df["label"].notna().sum())



Shape awal: (1000, 2)
Jumlah label manual (tidak NaN) di kolom 'label': 250


In [14]:
# Create two working columns from "label":
#   label_manual - snapshot of the labels as loaded; not written to in this cell
#   label_self   - starts as an independent copy of those labels
manual_labels = df["label"]
df["label_manual"] = manual_labels
df["label_self"] = manual_labels.copy()

# Both counts are expected to match right after the copy.
# Series.count() counts the non-missing entries.
print("Jumlah label_manual:", df["label_manual"].count())
print("Jumlah label_self  :", df["label_self"].count())



Jumlah label_manual: 250
Jumlah label_self  : 250


In [15]:
# Turn the review text into a TF-IDF matrix.
# The vectorizer is fitted on every row of df["content"], so X has one row
# per row of df, in the same order.
tfidf_options = dict(
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=3,             # drop terms that occur in fewer than 3 documents
    max_features=20000,   # upper bound on the vocabulary size
)
vectorizer = TfidfVectorizer(**tfidf_options)

reviews = df["content"]
X = vectorizer.fit_transform(reviews)

print("Shape TF-IDF:", X.shape)



Shape TF-IDF: (1000, 1312)


In [16]:

# Logistic-regression classifier; only constructed here, not fitted.
#   max_iter=1000 - raises the solver's iteration cap above the library default
#   n_jobs=-1     - asks for all CPU cores (NOTE(review): whether this has any
#                   effect depends on the solver and scikit-learn version)
logreg_options = {"n_jobs": -1, "max_iter": 1000}
model = LogisticRegression(**logreg_options)



In [17]:
# Self-training (pseudo-labelling) loop.
# Each round fits `model` on every row that currently has a value in
# "label_self", predicts the remaining rows, and accepts a prediction as a
# pseudo-label when its highest class probability reaches `threshold`.
# The loop stops after `max_rounds` rounds, when nothing is left to label,
# or when no prediction is confident enough.
threshold  = 0.90   # minimum predicted-class probability for acceptance
max_rounds = 10     # hard cap on the number of rounds

# Rows are addressed by POSITION everywhere below, because X can only be
# indexed positionally. Using .iloc (instead of .loc with positions) keeps the
# DataFrame rows and the matrix rows aligned even if df does not have a
# default 0..n-1 index.
label_self_col = df.columns.get_loc("label_self")

for round_idx in range(max_rounds):
    print(f"\n=== ROUND {round_idx+1} ===")

    # Positional indices of rows with / without a label in "label_self".
    labeled_idx   = np.where(df["label_self"].notna())[0]
    unlabeled_idx = np.where(df["label_self"].isna())[0]

    print("  Labeled   :", len(labeled_idx))
    print("  Unlabeled :", len(unlabeled_idx))

    if len(unlabeled_idx) == 0:
        print("  Tidak ada data unlabeled tersisa. Stop.")
        break

    # Training set for this round: manual labels plus pseudo-labels so far.
    X_labeled = X[labeled_idx]
    y_labeled = df["label_self"].iloc[labeled_idx].astype(int)

    model.fit(X_labeled, y_labeled)

    # Score every row that is still unlabeled.
    X_unlabeled = X[unlabeled_idx]
    proba = model.predict_proba(X_unlabeled)
    preds = model.predict(X_unlabeled)

    # Confidence of a prediction = its largest class probability.
    max_proba = proba.max(axis=1)
    selected_mask = max_proba >= threshold

    if selected_mask.sum() == 0:
        print("  Tidak ada pseudo-label dengan confidence >= threshold. Stop.")
        break

    new_idx = unlabeled_idx[selected_mask]
    new_labels = preds[selected_mask]

    # Positional write: only rows that were unlabeled at the start of this
    # round are touched, so existing labels are never overwritten.
    df.iloc[new_idx, label_self_col] = new_labels

    print("  Pseudo-label baru:", len(new_idx))




=== ROUND 1 ===
  Labeled   : 250
  Unlabeled : 750
  Pseudo-label baru: 170

=== ROUND 2 ===
  Labeled   : 420
  Unlabeled : 580
  Pseudo-label baru: 223

=== ROUND 3 ===
  Labeled   : 643
  Unlabeled : 357
  Pseudo-label baru: 172

=== ROUND 4 ===
  Labeled   : 815
  Unlabeled : 185
  Pseudo-label baru: 78

=== ROUND 5 ===
  Labeled   : 893
  Unlabeled : 107
  Pseudo-label baru: 19

=== ROUND 6 ===
  Labeled   : 912
  Unlabeled : 88
  Pseudo-label baru: 4

=== ROUND 7 ===
  Labeled   : 916
  Unlabeled : 84
  Tidak ada pseudo-label dengan confidence >= threshold. Stop.


In [18]:
# Sanity check: every row that has a manual label must carry the same value
# in "label_self". Both columns are compared through their string form.
was_manual = df["label_manual"].notna()
manual_as_text = df["label_manual"].astype(str)
self_as_text = df["label_self"].astype(str)

changed_manual = df[was_manual & manual_as_text.ne(self_as_text)]
n_changed = len(changed_manual)

print("\nJumlah baris yang label_manual != label_self (harusnya 0):", n_changed)

# For a strict check, replace this report with an assert.
if n_changed == 0:
    print("AMAN: label_manual tidak berubah.")
else:
    print("PERINGATAN: ada label_manual yang beda dengan label_self. Cek variabel 'changed_manual'.")




Jumlah baris yang label_manual != label_self (harusnya 0): 0
AMAN: label_manual tidak berubah.


In [20]:
# Write the full working frame (every column currently in df) to disk,
# without the index column, then confirm on stdout.
df.to_csv(path_or_buf="BCA_self_training_output.csv", index=False)
print(
    "\nSelf-training selesai. File disimpan sebagai 'BCA_self_training_output.csv'"
)


Self-training selesai. File disimpan sebagai 'BCA_self_training_output.csv'


In [21]:
# Show the working frame as the cell's output, for a visual check of the
# label, label_manual and label_self columns side by side.
df

Unnamed: 0,content,label,label_manual,label_self
0,sayang g bisa digunakan m bca gbisa dipake q b...,-1.0,-1.0,-1.0
1,"jelek aplikasi payah BCA,masak tau tau kluar s...",-1.0,-1.0,-1.0
2,baik,0.0,0.0,0.0
3,"suka loading kalau dibuka,, gak bisa sat set l...",-1.0,-1.0,-1.0
4,"Peringatan yang selalu mengganggu ""Transaksi d...",-1.0,-1.0,-1.0
...,...,...,...,...
995,"otp sulit , keblokir ,padahal udah bner masuki...",,,-1.0
996,ahir2 ini pakai jaringan Wifi tidak bisa trans...,,,-1.0
997,ih gelekðŸ¤®,,,-1.0
998,susah benermau cek mutasi ðŸ¤·ðŸ¤¦?!!!!,,,-1.0


In [22]:
# Build the final export: the review text and its label after self-training.
# .copy() detaches the result from df.
# NOTE(review): rows that never received a pseudo-label still have NaN in
# "label_self" and are exported as-is — confirm this is intended.
export_columns = ["content", "label_self"]
df_final = df.loc[:, export_columns].copy()

df_final.to_csv(path_or_buf="BCA_Final_Data.csv", index=False)

# Preview the first rows as the cell's output.
df_final.head()


Unnamed: 0,content,label_self
0,sayang g bisa digunakan m bca gbisa dipake q b...,-1.0
1,"jelek aplikasi payah BCA,masak tau tau kluar s...",-1.0
2,baik,0.0
3,"suka loading kalau dibuka,, gak bisa sat set l...",-1.0
4,"Peringatan yang selalu mengganggu ""Transaksi d...",-1.0
