In [None]:
# ================= CELL 4: WORKER LABELING =================
# JALANKAN INI DI 3 JENDELA BERBEDA
import pandas as pd
import google.generativeai as genai
import time
import json
import os
from tqdm import tqdm # Pakai tqdm biasa biar aman di terminal/notebook

# --- MANUAL CONFIGURATION (CHANGE THESE IN EACH WINDOW) ---
# NOTE(review): "APIGemini" looks like a placeholder; replace it with the real
# Gemini API key locally and avoid committing the real value.
API_KEY = "APIGemini"
INPUT_CSV = "task_positif.csv"       # Window 1: task_positif.csv
OUTPUT_CSV = "labeled_positif.csv"   # Window 1: labeled_positif.csv

MODEL_NAME = 'gemini-2.5-flash'  # Gemini model name passed to the SDK
BATCH_SIZE = 20  # number of sentence pairs sent to the model per request

# --- SETUP ---
genai.configure(api_key=API_KEY)

# Abort early when the task file is missing. `exit()` is the interactive helper
# injected by `site`; it is not meant for programs and does not reliably stop a
# notebook cell, so raise SystemExit directly (non-zero status: this is an error).
if not os.path.exists(INPUT_CSV):
    print(f"File {INPUT_CSV} tidak ditemukan!")
    raise SystemExit(1)

# 'utf-8-sig' transparently drops a leading BOM if the CSV has one.
df_task = pd.read_csv(INPUT_CSV, encoding='utf-8-sig')
print(f"🔥 Memproses {INPUT_CSV} -> {OUTPUT_CSV}")
print(f"🔥 Total Data: {len(df_task)} | Batch Size: {BATCH_SIZE}")

# --- OUTPUT FILE SETUP (RESUME CAPABILITY) ---
OUTPUT_COLUMNS = ['sent_id_1', 'sent_id_2', 'text_1', 'text_2', 'score']
MAX_ATTEMPTS = 3  # API attempts per batch before the batch is skipped


def _pair_keys(frame):
    """Return one (sent_id_1, sent_id_2) tuple of strings per row of *frame*."""
    return list(zip(frame['sent_id_1'].astype(str), frame['sent_id_2'].astype(str)))


def _load_done_keys(path):
    """Return the pair keys already stored in *path*, creating the file if needed.

    A missing or empty file is (re)created containing only the header row.
    An existing file is parsed as CSV rather than counted line by line, so
    text fields containing newlines do not distort the result, and a small
    file that already holds labeled rows is never overwritten.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        try:
            labeled = pd.read_csv(
                path,
                encoding='utf-8-sig',
                usecols=['sent_id_1', 'sent_id_2'],
                dtype=str,
            )
        except pd.errors.EmptyDataError:
            pass  # nothing parseable in the file: fall through and write the header
        else:
            return set(_pair_keys(labeled))
    pd.DataFrame(columns=OUTPUT_COLUMNS).to_csv(path, index=False, encoding='utf-8-sig')
    return set()


def _parse_scores(response_text):
    """Parse the model's JSON reply into a ``{batch position: score}`` dict.

    Items without a usable integer ``id`` or numeric ``score`` are ignored;
    ids returned as strings (e.g. ``"3"``) are accepted.

    Raises:
        json.JSONDecodeError: if *response_text* is not valid JSON.
    """
    payload = json.loads(response_text)
    if isinstance(payload, dict):
        payload = [payload]  # tolerate a single object instead of a list
    scores = {}
    for item in payload:
        if not isinstance(item, dict) or 'id' not in item or 'score' not in item:
            continue
        try:
            scores[int(item['id'])] = float(item['score'])
        except (TypeError, ValueError):
            continue
    return scores


def _request_scores(model, prompt):
    """Query the model up to MAX_ATTEMPTS times.

    Returns:
        ``(scores, None)`` on success, or ``(None, last_error)`` when every
        attempt failed.
    """
    last_error = None
    for _ in range(MAX_ATTEMPTS):
        try:
            response = model.generate_content(prompt)
            return _parse_scores(response.text), None
        except Exception as err:  # best-effort: any API/parse error is retried
            last_error = err
            # Back off longer when the error text mentions HTTP 429.
            time.sleep(10 if "429" in str(err) else 1)
    return None, last_error


# Resume by pair key instead of by row offset: rows skipped in an earlier run
# (failed batch, id missing from the reply) make the number of output rows
# smaller than the number of processed input rows, so an offset-based resume
# would re-label finished rows and never retry the skipped ones.
# NOTE(review): assumes (sent_id_1, sent_id_2) identifies a task row — confirm.
done_keys = _load_done_keys(OUTPUT_CSV)
total_rows = len(df_task)
is_pending = pd.Series(
    [key not in done_keys for key in _pair_keys(df_task)],
    index=df_task.index,
    dtype=bool,
)
pending_df = df_task[is_pending]
start_idx = total_rows - len(pending_df)  # task rows that already have a label
if start_idx:
    print(f"🔄 Melanjutkan dari data ke-{start_idx}...")

# --- LOOP PROCESSING ---
# The model object does not depend on the batch, so build it once.
model = genai.GenerativeModel(MODEL_NAME, generation_config={"response_mime_type": "application/json"})

for i in tqdm(range(0, len(pending_df), BATCH_SIZE), desc="Labeling"):
    batch_df = pending_df.iloc[i:i + BATCH_SIZE]

    # Ids are positions inside the batch: always small, unique, JSON-safe ints.
    pairs_list = [
        {"id": pos, "sent1": str(t1).replace('"', "'"), "sent2": str(t2).replace('"', "'")}
        for pos, (t1, t2) in enumerate(zip(batch_df['text_1'], batch_df['text_2']))
    ]

    prompt = f"""
    Rate Semantic Similarity 0.0 to 5.0.
    Output JSON: [{{ "id": 0, "score": 4.5 }}]
    Data: {json.dumps(pairs_list)}
    """

    scores_map, last_error = _request_scores(model, prompt)
    if scores_map is None:
        print(f"❌ Gagal di batch index {i}. Lewat.")
        print(f"   Error terakhir: {last_error!r}")
    else:
        # Keep only the rows the model actually scored; the rest stay pending
        # and are picked up again on the next run.
        scored_positions = [pos for pos in range(len(batch_df)) if pos in scores_map]
        if scored_positions:
            labeled_batch = batch_df.iloc[scored_positions][OUTPUT_COLUMNS[:-1]].copy()
            labeled_batch['score'] = [scores_map[pos] for pos in scored_positions]
            # Written outside the retry so a disk error is not mistaken for an
            # API failure (which would trigger another paid request).
            labeled_batch.to_csv(OUTPUT_CSV, mode='a', header=False, index=False, encoding='utf-8-sig')

    time.sleep(1)  # polite pause between requests

print("✅ SELESAI WINDOW INI!")

🔥 Memproses task_positif.csv -> labeled_positif.csv
🔥 Total Data: 20000 | Batch Size: 20
🔄 Melanjutkan dari data ke-18039...


Labeling: 100%|██████████| 99/99 [36:34<00:00, 22.17s/it]


✅ SELESAI WINDOW INI!
