In [2]:
import json
import subprocess
import time
from pathlib import Path

class OllamaBatchTranslator:
    def __init__(self, source_file: str, output_file: str, batch_size=10, model="lauchacarro/qwen2.5-translator"):
        self.source_file = Path(source_file)
        self.output_file = Path(output_file)
        self.batch_size = batch_size
        self.model = model
        self.checkpoint_file = self.output_file.parent / "checkpoint.json"

        self.stats = self._load_checkpoint() or {
            'processed_index': 0,
            'start_time': time.time()
        }

        self._load_source()

    def _load_checkpoint(self):
        if self.checkpoint_file.exists():
            with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None

    def _save_checkpoint(self):
        with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2, ensure_ascii=False)

    def _load_source(self):
        with open(self.source_file, 'r', encoding='utf-8') as f:
            self.source_data = json.load(f)

    def _translate_batch(self, batch):
        prompt_lines = "\n".join([f"{i+1}. {text}" for i, text in enumerate(batch)])
        prompt = (
            "Terjemahkan semua kalimat ini ke dalam Bahasa Indonesia secara kontekstual dan ilmiah sesuai bidang ornitologi:\n\n"
            f"{prompt_lines}\n\nJawaban:"
        )

        result = subprocess.run(
            ["ollama", "run", self.model],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        output = result.stdout.decode("utf-8").strip()

        lines = output.split("\n")
        translations = []
        for line in lines:
            if "." in line:
                text = line.split(".", 1)[1].strip()
                if text:
                    translations.append(text)
        return translations

    def run(self):
        print("\n=== Starting Translation with Checkpoint Support ===")
        index = self.stats['processed_index']

        all_entries = []
        for entry_idx, entry in enumerate(self.source_data["dataset"]):
            for cap_idx, cap in enumerate(entry["captions"]):
                all_entries.append((entry_idx, cap_idx, cap["english"]))

        while index < len(all_entries):
            batch_entries = all_entries[index:index + self.batch_size]
            batch_texts = [item[2] for item in batch_entries]

            print(f"\n🔁 Translating batch {index} to {index + len(batch_entries) - 1}...")
            translations = self._translate_batch(batch_texts)

            for i, (entry_idx, cap_idx, _) in enumerate(batch_entries):
                self.source_data["dataset"][entry_idx]["captions"][cap_idx]["indo"] = (
                    translations[i] if i < len(translations) else "[MISSING TRANSLATION]"
                )

            index += self.batch_size
            self.stats['processed_index'] = index
            self._save_checkpoint()
            print(f"✅ Saved checkpoint at index {index}")
            time.sleep(1)

        print("\n🎉 Translation complete!")
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(self.source_data, f, indent=2, ensure_ascii=False)
        print(f"📝 Results saved to: {self.output_file}")


In [3]:
processor = OllamaBatchTranslator(
    source_file="paralel_cub_200_2011_captions.json",
    output_file="ollama_translated.json",
    batch_size=10
)
processor.run()



=== Starting Translation with Checkpoint Support ===

🔁 Translating batch 0 to 9...
✅ Saved checkpoint at index 10

🔁 Translating batch 10 to 19...
✅ Saved checkpoint at index 20

🔁 Translating batch 20 to 29...
✅ Saved checkpoint at index 30

🔁 Translating batch 30 to 39...
✅ Saved checkpoint at index 40

🔁 Translating batch 40 to 49...
✅ Saved checkpoint at index 50

🔁 Translating batch 50 to 59...
✅ Saved checkpoint at index 60

🔁 Translating batch 60 to 69...
✅ Saved checkpoint at index 70

🔁 Translating batch 70 to 79...
✅ Saved checkpoint at index 80

🔁 Translating batch 80 to 89...
✅ Saved checkpoint at index 90

🔁 Translating batch 90 to 99...
✅ Saved checkpoint at index 100

🔁 Translating batch 100 to 109...
✅ Saved checkpoint at index 110

🔁 Translating batch 110 to 119...
✅ Saved checkpoint at index 120

🔁 Translating batch 120 to 129...
✅ Saved checkpoint at index 130

🔁 Translating batch 130 to 139...
✅ Saved checkpoint at index 140

🔁 Translating batch 140 to 149...
✅ Sa

KeyboardInterrupt: 

In [5]:
output_file = "ollama_translated.json"


with open("ollama_translated.json", "r", encoding="utf-8") as f:
    translated_data = json.load(f)

# Tampilkan 3 entri awal
from pprint import pprint
pprint(translated_data["dataset"][:3])


FileNotFoundError: [Errno 2] No such file or directory: 'ollama_translated.json'

In [6]:
from pathlib import Path
import json

source_file = "paralel_cub_200_2011_captions.json"
checkpoint_file = Path("ollama_translated.json").parent / "checkpoint.json"
output_file = "ollama_translated_partial.json"

# Load data sumber asli
with open(source_file, "r", encoding="utf-8") as f:
    data = json.load(f)


In [7]:
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"✅ Output hasil parsial disimpan ke: {output_file}")


✅ Output hasil parsial disimpan ke: ollama_translated_partial.json


In [2]:
import json
import subprocess
import time
from pathlib import Path

class OllamaBatchTranslator:
    def __init__(self, source_file: str, output_file: str, batch_size=10, model="qwen2.5:1.5b"):
        self.source_file = Path(source_file)
        self.output_file = Path(output_file)
        self.batch_size = batch_size
        self.model = model
        self.checkpoint_file = self.output_file.parent / "checkpoint1.json"

        self.stats = self._load_checkpoint() or {
            'processed_index': 0,
            'start_time': time.time()
        }

        self._load_source()

    def _load_checkpoint(self):
        if self.checkpoint_file.exists():
            with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None

    def _save_checkpoint(self):
        with open(self.checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2, ensure_ascii=False)

    def _load_source(self):
        with open(self.source_file, 'r', encoding='utf-8') as f:
            self.source_data = json.load(f)

    def _translate_batch(self, batch):
        prompt_lines = "\n".join([f"{i+1}. {text}" for i, text in enumerate(batch)])
        prompt = (
            "You are a professional translator specialized in ornithology."
            "Translate the following English sentences into accurate Indonesian using correct ornithological terminology:\n"
            f"{prompt_lines}\n\n" +
            "Respond with only the translations, numbered accordingly."
        )

        result = subprocess.run(
            ["ollama", "run", self.model],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        output = result.stdout.decode("utf-8").strip()

        lines = output.split("\n")
        translations = []
        for line in lines:
            if "." in line:
                text = line.split(".", 1)[1].strip()
                if text:
                    translations.append(text)
        return translations

    def run(self):
        print("\n=== Starting Translation with Checkpoint Support ===")
        index = self.stats['processed_index']

        all_entries = []
        for entry_idx, entry in enumerate(self.source_data["dataset"]):
            for cap_idx, cap in enumerate(entry["captions"]):
                all_entries.append((entry_idx, cap_idx, cap["english"]))

        while index < len(all_entries):
            batch_entries = all_entries[index:index + self.batch_size]
            batch_texts = [item[2] for item in batch_entries]

            print(f"\n🔁 Translating batch {index} to {index + len(batch_entries) - 1}...")
            translations = self._translate_batch(batch_texts)

            for i, (entry_idx, cap_idx, _) in enumerate(batch_entries):
                self.source_data["dataset"][entry_idx]["captions"][cap_idx]["indo"] = (
                    translations[i] if i < len(translations) else "[MISSING TRANSLATION]"
                )

            index += self.batch_size
            self.stats['processed_index'] = index
            self._save_checkpoint()
            print(f"✅ Saved checkpoint at index {index}")
            time.sleep(1)

        print("\n🎉 Translation complete!")
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(self.source_data, f, indent=2, ensure_ascii=False)
        print(f"📝 Results saved to: {self.output_file}")


In [3]:
processor = OllamaBatchTranslator(
    source_file="paralel_cub_200_2011_captions.json",
    output_file="ollama_translated1.json",
    batch_size=25
)
processor.run()



=== Starting Translation with Checkpoint Support ===

🔁 Translating batch 0 to 24...
✅ Saved checkpoint at index 25

🔁 Translating batch 25 to 49...
✅ Saved checkpoint at index 50

🔁 Translating batch 50 to 74...
✅ Saved checkpoint at index 75

🔁 Translating batch 75 to 99...
✅ Saved checkpoint at index 100

🔁 Translating batch 100 to 124...
✅ Saved checkpoint at index 125

🔁 Translating batch 125 to 149...


KeyboardInterrupt: 

In [4]:
from pathlib import Path
import json

source_file = "paralel_cub_200_2011_captions.json"
checkpoint_file = Path("ollama_translated1.json").parent / "checkpoint1.json"
output_file = "ollama_translated_partial1.json"

# Load data sumber asli
with open(source_file, "r", encoding="utf-8") as f:
    data = json.load(f)


In [5]:
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"✅ Output hasil parsial disimpan ke: {output_file}")


✅ Output hasil parsial disimpan ke: ollama_translated_partial1.json
