In [6]:
from googletrans import Translator
import os
import time
from queue import Queue
from threading import Thread

# Source and destination language codes handed to translator.translate().
SRC = "en"
DEST = "tr"
# Maximum number of entries to translate. The limit checks compare a positive
# counter against this value with ==, so -1 never matches.
TRANSLATE_LIMIT = -1 # unlimited
# Input TSV; each line is split on a tab into (url, text).
DATA_PATH = "D:/C12M/cc12m.tsv"
# Output TSV, opened in append mode; rows are url, source text, translation.
OUTPUT_PATH = "D:/C12M/cc12m_tr.tsv"
# NOTE(review): not referenced by the visible cells; presumably a destination
# path for copy_rows() - TODO confirm.
COPY_PATH = "D:/C12M/cc12m_tr_copy.tsv"
# NOTE(review): not referenced by the visible cells; presumably a pause (in
# seconds) between requests - TODO confirm.
RESPECT_DURATION = 1
# Number of texts a worker accumulates before issuing one translate call.
TRANSLATE_BATCH_SIZE = 100
# Number of translate_worker threads to start.
WORKER_COUNT = 50
# The combiner prints a progress line every INFO_FREQ written entries.
INFO_FREQ = 10

def safe_translate(batch, translator):
	"""Translate ``batch`` and report whether every entry came back changed.

	Args:
		batch: Sequence of source strings.
		translator: Translator object forwarded to ``batch_translate``.

	Returns:
		A ``(ok, translations)`` tuple. ``ok`` is False as soon as one
		translation is identical to its source string, and True otherwise
		(including when ``translations`` is empty or None).
	"""
	translations = batch_translate(batch, translator)
	# Pair each source with its translation directly; the previous
	# ``range(translations)`` raised TypeError because it was given a list.
	for source, translated in zip(batch, translations or ()):
		if translated == source:
			return False, translations
	return True, translations

def batch_translate(batch, translator):
	"""Translate ``batch`` from ``SRC`` to ``DEST`` in a single call.

	Returns a list holding the ``text`` attribute of each result yielded by
	``translator.translate``, in the order they are yielded.
	"""
	results = translator.translate(batch, src=SRC, dest=DEST)
	texts = []
	for result in results:
		texts.append(result.text)
	return texts

def to_tsv(url, en, tr):
	"""Format one output row: the three fields joined by tabs, ending in a newline."""
	return "{}\t{}\t{}\n".format(url, en, tr)

In [7]:
def translate_single_thread(data_path, out_path, batch_size, translate_limit, start_idx):
	"""Translate a TSV file batch by batch on the calling thread.

	Each input line is split on a tab into ``(url, text)``. Texts are sent to
	``batch_translate`` in groups of ``batch_size`` and the results are
	appended to ``out_path`` as ``url, text, translation`` rows.

	Args:
		data_path: Input TSV path.
		out_path: Output TSV path (opened in append mode).
		batch_size: Number of texts per translate call.
		translate_limit: Stop after this many entries have been read; a value
			that the positive counter never equals (e.g. -1) means no limit.
		start_idx: Number of leading input lines to skip.

	Raises:
		ValueError: If an input line does not split into exactly two fields.

	A batch whose translation raises is reported and dropped; processing
	continues with the next batch.
	"""
	translator = Translator()
	with open(data_path, "r", encoding="utf-8") as src, open(out_path, "a", encoding="utf-8") as dest:
		pending = []  # (url, text) pairs waiting for the next translate call
		translate_counter = 0
		batch_start = time.time()

		def flush():
			"""Translate and write everything in ``pending``; return True on success."""
			try:
				translations = batch_translate([text for _, text in pending], translator)
			except Exception as error:
				# Best effort: report the failure instead of hiding it, then move on.
				print(f"Dropping batch of {len(pending)} entries: {error!r}")
				return False
			for (url, text), translation in zip(pending, translations):
				dest.write(to_tsv(url, text, translation))
			return True

		for line_no, line in enumerate(src):
			if line_no < start_idx:
				continue
			url, text = line.strip().split("\t")
			pending.append((url, text))
			translate_counter += 1
			if translate_counter == translate_limit:
				break
			if len(pending) == batch_size:
				if flush():
					batch_stop = time.time()
					print(f"Translated {translate_counter} entries SPT:{(batch_stop-batch_start) / batch_size}")
				pending.clear()
				batch_start = time.time()
		# Whatever is still buffered (a partial batch, or the entries read
		# right before the limit was hit) is translated in full.
		if pending:
			flush()

In [8]:
def line_count(file_path):
	"""Return the number of lines in ``file_path``, or 0 if the path does not exist."""
	if not os.path.exists(file_path):
		return 0
	# Use a context manager so the handle is closed as soon as counting is done
	# instead of being left for the garbage collector.
	with open(file_path, "r", encoding="utf-8") as f:
		return sum(1 for _ in f)

def translate_scheduler(path, task_queue:Queue, start_idx, translate_limit):
	"""Read the input TSV and enqueue one ``(url, text)`` task per line.

	Args:
		path: Input TSV path; each line is split on a tab into two fields.
		task_queue: Queue that receives the tasks, followed by a ``None`` sentinel.
		start_idx: Number of leading input lines to skip.
		translate_limit: Stop after enqueuing this many tasks; a value the
			positive counter never equals (e.g. -1) means no limit.

	Raises:
		OSError: If ``path`` cannot be opened.
		ValueError: If a line does not split into exactly two fields.

	The ``None`` sentinel is enqueued even when an error is raised, so that
	consumers blocked on ``task_queue.get()`` are always released.
	"""
	try:
		with open(path, "r", encoding="utf-8") as f:
			translate_counter = 0
			for i, line in enumerate(f):
				if i < start_idx:
					continue
				url, text = line.strip().split("\t")
				task_queue.put((url, text))
				translate_counter += 1
				if translate_counter == translate_limit:
					break
	finally:
		task_queue.put(None)

def translate_worker(task_queue:Queue, result_queue:Queue, batch_size, idle_timeout=30):
	"""Consume ``(url, text)`` tasks, translate them in batches and queue the results.

	Args:
		task_queue: Source of tasks; ``None`` marks the end of the stream.
		result_queue: Receives ``(url, text, translation)`` tuples and finally
			one ``None`` to signal that this worker is done.
		batch_size: Number of texts sent per ``batch_translate`` call.
		idle_timeout: Seconds without a queued result after which the worker
			stops (checked each time a task is fetched).

	Behavior notes:
		* Entries whose translation equals the source text are not queued.
		* A batch whose translation raises is printed and dropped.
		* On exit the worker puts ``None`` back on ``task_queue`` so the next
		  worker that reads it stops as well.
		* ``None`` is always put on ``result_queue``, even if the worker fails.
	"""
	translator = Translator()
	pending = []  # tasks collected for the next translate call
	last_contribution = time.time()

	def emit_translations():
		"""Translate ``pending`` and queue changed entries; return how many were queued."""
		try:
			translations = batch_translate([task[1] for task in pending], translator)
		except Exception as e:
			print(e)
			return 0
		emitted = 0
		for task, translation in zip(pending, translations):
			if translation != task[1]:
				result_queue.put((task[0], task[1], translation))
				emitted += 1
		return emitted

	try:
		while True:
			reference_time = time.time() - idle_timeout
			task = task_queue.get()
			if task is None or last_contribution < reference_time:
				if task is not None:
					# Keep the entry that was dequeued while timing out; it is
					# handled together with the final partial batch below.
					pending.append(task)
				task_queue.put(None)
				break
			pending.append(task)
			if len(pending) == batch_size:
				if emit_translations():
					last_contribution = time.time()
				pending.clear()
		# Final partial batch: each result is queued exactly once.
		if pending:
			emit_translations()
	finally:
		# The combiner counts one None per worker; never skip it.
		result_queue.put(None)

def translate_combiner(path, result_queue:Queue, worker_count, info_freq):
	"""Append queued translation results to ``path`` until all workers retire.

	Args:
		path: Output TSV path (opened in append mode).
		result_queue: Queue of ``(url, en, tr)`` tuples; each ``None`` read
			from it counts as one retired worker.
		worker_count: Number of ``None`` markers to wait for before returning.
		info_freq: Print a progress line every ``info_freq`` written entries;
			0 disables progress output.
	"""
	retired_count = 0
	complete_ctr = 0
	info_start = time.time()
	# The context manager flushes and closes the file even if writing fails.
	with open(path, "a", encoding="utf-8") as f:
		while retired_count < worker_count:
			task = result_queue.get()
			if task is None:
				retired_count += 1
				continue
			f.write(to_tsv(task[0], task[1], task[2]))
			complete_ctr += 1
			if info_freq and complete_ctr % info_freq == 0:
				info_end = time.time()
				diff = info_end - info_start
				# Skip the report when the interval is too short to divide by.
				if diff > 0.001:
					# Throughput is based on this call's info_freq, not on a global.
					print(f"Translated {complete_ctr} entries. TPS: {info_freq / diff}")
				info_start = info_end

In [9]:
def copy_rows(src_path, dest_path, start, end):
	"""Write lines ``start`` (inclusive) to ``end`` (exclusive) of ``src_path`` into ``dest_path``.

	Line indices are zero-based. ``dest_path`` is opened in write mode, so any
	previous content is replaced.
	"""
	with open(src_path, "r", encoding="utf-8") as reader:
		with open(dest_path, "w", encoding="utf-8") as writer:
			for row, content in enumerate(reader):
				if row >= start:
					if row >= end:
						break
					writer.write(content)

In [10]:
# Resume support: the number of rows already in the output file is used as the
# number of input rows to skip.
# NOTE(review): translate_worker does not queue entries whose translation
# equals the source and drops batches that raise, so the output row count can
# be lower than the number of input rows consumed - confirm resume alignment.
start_idx = line_count(OUTPUT_PATH)
print(f"Found {start_idx} entries.")
task_queue = Queue()
result_queue = Queue()
# Pipeline: one scheduler feeds task_queue, WORKER_COUNT workers translate,
# one combiner drains result_queue into OUTPUT_PATH.
scheduler = Thread(target=translate_scheduler, args=(DATA_PATH, task_queue, start_idx, TRANSLATE_LIMIT))
workers = [
	Thread(target=translate_worker, args=(task_queue, result_queue, TRANSLATE_BATCH_SIZE))
	for _ in range(WORKER_COUNT)
]
combiner = Thread(target=translate_combiner, args=(OUTPUT_PATH, result_queue, WORKER_COUNT, INFO_FREQ))
scheduler.start()
for worker in workers:
	worker.start()
combiner.start()
scheduler.join()
print("JOB FINISHED: Scheduler")
for i, worker in enumerate(workers):
	worker.join()
	print(f"JOB FINISHED: Worker-{i}")
combiner.join()
print("JOB FINISHED: Combiner")

Found 249994 entries.
JOB FINISHED: Scheduler
JOB FINISHED: Worker-0
JON FINISHED: Combiner
