In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.linear_model import LogisticRegression
from scipy.sparse import vstack

In [21]:
# Load the semicolon-delimited CSV into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/pelabelanmanual.csv", delimiter=";")

In [22]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,content,score,review_cleaned,label
0,"suka berguna banget, program dan internalnya l...",5,suka berguna banget program internalnya fleksi...,netral
1,"Terbaru sangat praktis,program dan internalnya...",5,terbaru praktis program internalnya memadai pa...,netral
2,"Asliii sangat praktis,program dan internalnya ...",5,asli praktis program internalnya memadai pakai...,netral
3,"Asli sangat praktis,program dan internalnya le...",5,asli praktis program internalnya memadai pakai...,netral
4,"Asli sangat praktis,program dan internalnya le...",5,asli praktis program internalnya memadai pakai...,netral


In [23]:
# Keep only the rows whose "label" is filled in, then tally rows per class.
labeled_df = df.loc[df["label"].notna()]
counts = labeled_df.loc[:, "label"].value_counts()
counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positif,100
negatif,100
netral,50


In [24]:
# Columns holding the input text and the target label.
text_col, label_col = "content", "label"

# Partition the frame by whether the label is present; .copy() gives each
# partition its own data so later assignments do not touch `df`.
labeled_df = df.dropna(subset=[label_col]).copy()
unlabeled_df = df.loc[df[label_col].isna()].copy()

# Text and targets for the labeled partition, text only for the unlabeled one.
X_labeled, y_labeled = labeled_df[text_col], labeled_df[label_col]
X_unlabeled = unlabeled_df[text_col]

# Report the size of each partition.
print("Jumlah data berlabel :", X_labeled.shape[0])
print("Jumlah data belum berlabel :", X_unlabeled.shape[0])

Jumlah data berlabel : 250
Jumlah data belum berlabel : 750


In [25]:
# Encode the string labels as integers, numbered in order of first appearance.
inv_label_map = dict(enumerate(y_labeled.unique()))  # code -> label
label_map = {name: code for code, name in inv_label_map.items()}  # label -> code

y_labeled_enc = y_labeled.map(label_map)
# -1 is the marker SelfTrainingClassifier treats as "no label yet".
y_unlabeled_enc = pd.Series([-1 for _ in range(len(X_unlabeled))])

In [26]:
# TF-IDF features: the vocabulary (capped at 3000 terms) and IDF weights are
# learned from the labeled text only, then reused for the unlabeled text.
vectorizer = TfidfVectorizer(max_features=3000)
X_labeled_vec = vectorizer.fit_transform(raw_documents=X_labeled)
X_unlabeled_vec = vectorizer.transform(raw_documents=X_unlabeled)

# Stack labeled rows first, unlabeled rows second; the targets follow the same
# order, and ignore_index discards the two differing indexes.
X_all = vstack(blocks=[X_labeled_vec, X_unlabeled_vec])
y_all = pd.concat(objs=[y_labeled_enc, y_unlabeled_enc], ignore_index=True)

In [27]:
# Self-training: a logistic-regression base learner is refit repeatedly, each
# round absorbing the unlabeled rows whose top predicted probability is above
# the 0.80 threshold.
base_clf = LogisticRegression(max_iter=500)
self_training = SelfTrainingClassifier(
    base_clf,
    threshold=0.80,
    # The default max_iter=10 stopped the recorded run while it was still
    # adding pseudo-labels at iteration 10. None lets the loop continue until
    # no new label clears the threshold or every row has been labeled.
    max_iter=None,
    verbose=True
)

self_training.fit(X_all, y_all)

End of iteration 1, added 41 new labels.
End of iteration 2, added 55 new labels.
End of iteration 3, added 63 new labels.
End of iteration 4, added 43 new labels.
End of iteration 5, added 50 new labels.
End of iteration 6, added 34 new labels.
End of iteration 7, added 21 new labels.
End of iteration 8, added 11 new labels.
End of iteration 9, added 13 new labels.
End of iteration 10, added 5 new labels.


In [28]:
# Predict an encoded class for every row of the unlabeled partition.
pred_unlabeled_enc = self_training.predict(X_unlabeled_vec)

# Decode the integer codes back to their string labels, in row order.
unlabeled_df[label_col] = list(map(inv_label_map.__getitem__, pred_unlabeled_enc))

In [29]:
# Stitch both partitions back together and sort by index so the rows return to
# the order they had in `df`.
final_df = pd.concat(objs=(labeled_df, unlabeled_df), axis=0).sort_index(axis=0)

In [30]:
# Preview the first 20 rows of the combined, fully labeled frame.
final_df.head(n=20)

Unnamed: 0,content,score,review_cleaned,label
0,"suka berguna banget, program dan internalnya l...",5,suka berguna banget program internalnya fleksi...,netral
1,"Terbaru sangat praktis,program dan internalnya...",5,terbaru praktis program internalnya memadai pa...,netral
2,"Asliii sangat praktis,program dan internalnya ...",5,asli praktis program internalnya memadai pakai...,netral
3,"Asli sangat praktis,program dan internalnya le...",5,asli praktis program internalnya memadai pakai...,netral
4,"Asli sangat praktis,program dan internalnya le...",5,asli praktis program internalnya memadai pakai...,netral
5,"Asli sangat praktis,program dan internalnya le...",5,asli praktis program internalnya memadai pakai...,netral
6,"Bagus mayan praktis,program dan internalnya le...",5,bagus mayan praktis program internalnya memada...,netral
7,"Asli sangat praktis,program dan internalnya le...",5,asli praktis program internalnya memadai pakai...,netral
8,"Asli praktis juga harga terjangkau, programnya...",5,asli praktis harga terjangkau programnya inter...,netral
9,"Asli top banget,program dan internalnya lebih ...",5,asli banget program internalnya memadai pakai ...,netral


In [31]:
# Write the fully labeled frame to disk without the DataFrame index.
final_df.to_csv(path_or_buf="Pelabelan Self-Training.csv", index=False)