# 一个升级后的批量翻译代码

这个代码最初由黄泓森进行开发，由李鲁鲁转到colab并进行了更改

[骆驼项目主页](https://github.com/LC1332/Luotuo-Chinese-LLM)

如果你使用我们的代码获取了有用的数据，也欢迎分享给我们，或者告诉我们你公开后的github/huggingface链接

如果你使用我们的代码获取数据并发表了论文或者tech report，欢迎cite我们的github repo

## 安装环境

In [1]:
!pip install openai
!pip install aiofiles
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->openai)
  Downloadin

In [2]:
import os
import json
import time
import openai
import asyncio
import aiohttp
import aiofiles
from functools import partial
from tqdm.asyncio import tqdm as tqdm
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
max_zh_en_ratio = 2.3

## 输入你的openAI API

In [3]:
# 在这里输入你的openAI API token

api_key = ["sk-DfFyR"]


class KeyPool:
    def __init__(self, strings):
        self.pool = list(strings)
        self.last_used = {s: -1 for s in strings}

    def getKey(self):
        result = min(self.last_used, key=self.last_used.get)
        self.last_used[result] = int(time.time() * 1000)
        return result

pool = KeyPool(api_key)

## 指定工作目录



In [4]:
os.chdir("/content/")

## 获取需要翻译的样本

这里我们使用WizardLM的样本

In [5]:
!wget https://raw.githubusercontent.com/LC1332/WizardLM/main/data/WizardLM_testset.jsonl -O WizardLM_testset.jsonl

--2023-05-25 03:31:32--  https://raw.githubusercontent.com/LC1332/WizardLM/main/data/WizardLM_testset.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81139 (79K) [text/plain]
Saving to: ‘WizardLM_testset.jsonl’


2023-05-25 03:31:32 (5.81 MB/s) - ‘WizardLM_testset.jsonl’ saved [81139/81139]



In [6]:
delay = 0.05

concurrency_limit = 32

input_file = "WizardLM_testset.jsonl"

# 数据缓存目录
temp_path = "/content/temp"

# 数据输出目录
output_path = "/content/translate"

output_prefix = "WizardLM_tr"

max_file_size = 1024**3

# 需要翻译的字段
entries = ["Instruction"]

os.system(f"mkdir -p {temp_path} {output_path}")

0

In [7]:
import re

async def getTranslation(item, entries: list = []):
    async def get(text):
        # text = text.replace("\n", " ")
        openai.api_key = pool.getKey()
        try:
            en_token_len = float(len(enc.encode( text )))
            max_zh_len = int( max_zh_en_ratio * en_token_len + 10 )

            messages =  [  
            {'role':'system', 'content':'将反引号中的英文文本翻译成简体中文，并输出到一对反引号中，如`cat`->`猫`'},
            {'role':'user', 'content':'将反引号中的指令翻译成中文:`dog`'},
            {'role':'assistant', 'content':'`狗`'},   
            {'role':'user', 'content':f'将反引号中的指令翻译成中文:`{text}`'}  ]

            resp = await openai.ChatCompletion.acreate(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0,
                max_tokens=max_zh_len
            )
            if "choices" in resp:
                result = resp['choices'][0]['message']['content']

                result = result.strip()

                if len(result) > 1 and result[0] == result[-1] == '`':  # 判断首尾字符是否是反引号
                    return result[1:-1]  # 如果是，去掉反引号，并返回True
                else:
                    return result # 如果不是，返回原字符串和False
            else:
                raise Exception(f"Invalid API response: {resp}")
        except Exception as e:
            print(f"[Error] {e}")
            return None

    for entry in entries:
        trans = await get(item[entry])
        if trans is None:
            return None
        else:
            item[f"{entry}_zh"] = trans
    return item


async def process(id, item, semaphore):
    async with semaphore:
        file_name = f"{temp_path}/{output_prefix}_{id}.json"
        try:
            it = await getTranslation(item, entries)
            if it is None:
                raise Exception(file_name)
            async with aiofiles.open(file_name, "w") as f:
                await f.write(json.dumps(it, ensure_ascii=False, indent=4))
        except Exception as e:
            print(f"Error saving item: {e}")


async def main():
    try:
      with open(input_file, "r") as file:
          data = json.load(file)
    except json.JSONDecodeError:
      data = []
      with open(input_file, "r") as file:
          for line in file:
              entry = json.loads(line)
              data.append(entry)

    tasks = []

    semaphore = asyncio.Semaphore(concurrency_limit)

    for id, item in enumerate(data):
        if os.path.exists(f"{output_prefix}{id}.json"):
            continue
        tasks.append(asyncio.create_task(process(id, item, semaphore)))

    async for task in tqdm(tasks, total=len(tasks), desc="Processing items"):
        await task
        time.sleep(delay)

由于网络问题或OpenAI的限制会导致获取数据失败，此时脚本会跳过这部分数据

重新运行下面的单元格即可补充获取失败的数据

In [8]:
await main()

Processing items:   1%|          | 2/218 [00:00<00:29,  7.37it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_4.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_7.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_15.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_25.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_10.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_12.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_6.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_29.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_8.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_23.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_3.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_18.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_14.json
[Err

Processing items:   4%|▎         | 8/218 [00:00<00:14, 14.70it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_5.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_27.json


Processing items:  15%|█▍        | 32/218 [00:01<00:09, 19.19it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_28.json


Processing items:  17%|█▋        | 36/218 [00:02<00:11, 15.29it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_32.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_34.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_33.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_35.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_38.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_36.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_39.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_42.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_40.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_49.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_37.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_41.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_47.json

Processing items:  29%|██▉       | 64/218 [00:03<00:10, 15.20it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_62.json


Processing items:  30%|███       | 66/218 [00:03<00:11, 13.58it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_69.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_64.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_67.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_68.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_75.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_70.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_66.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_74.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_73.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_78.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_71.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_72.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_83.json

Processing items:  43%|████▎     | 94/218 [00:05<00:06, 18.12it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_90.json


Processing items:  45%|████▍     | 98/218 [00:05<00:09, 12.64it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_96.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_97.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_102.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_105.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_106.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_98.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_113.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_103.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_104.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_108.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_100.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_101.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_t

Processing items:  50%|█████     | 110/218 [00:06<00:07, 14.17it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_107.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_127.json


Processing items:  60%|█████▉    | 130/218 [00:07<00:07, 11.77it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_128.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_133.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_134.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_135.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_131.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_129.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_137.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_150.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_145.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_141.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_142.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_149.json
[Error] <empty message>
Error saving item: /content/temp/WizardL

Processing items:  64%|██████▍   | 140/218 [00:08<00:05, 14.58it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_144.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_136.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_152.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_153.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_154.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_155.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_156.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_151.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_158.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_157.json


Processing items:  74%|███████▍  | 162/218 [00:09<00:03, 15.66it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_159.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_160.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_162.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_161.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_164.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_165.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_163.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_169.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_172.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_171.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_170.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_166.json
[Error] <empty message>
Error saving item: /content/temp/WizardL

Processing items:  87%|████████▋ | 190/218 [00:11<00:01, 18.47it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_186.json


Processing items:  90%|████████▉ | 196/218 [00:11<00:01, 15.72it/s]

[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_192.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_193.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_194.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_198.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_199.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_201.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_204.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_195.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_202.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_203.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_205.json
[Error] <empty message>
Error saving item: /content/temp/WizardLM_tr_211.json
[Error] <empty message>
Error saving item: /content/temp/WizardL

Processing items: 100%|██████████| 218/218 [00:12<00:00, 16.98it/s]


## 合并所有翻译数据

In [9]:
data = []
for filename in tqdm(os.listdir(temp_path)):
    if filename.startswith(output_prefix) and filename.endswith(".json"):
        with open(os.path.join(temp_path, filename), 'r', encoding='utf-8') as file:
            try:
                entry = json.load(file)
                data.append(entry)
            except json.JSONDecodeError:
                pass

0it [00:00, ?it/s]


In [10]:
file_counter = 1
current_file_size = 0
output_file = f"{output_path}/{output_prefix}_{file_counter}.jsonl"

with open(output_file, 'w', encoding='utf-8') as out:
    for item in tqdm(data):
        item_json = json.dumps(item, ensure_ascii=False)
        item_size = len(item_json.encode('utf-8'))
        out.write(item_json + "\n")
        current_file_size += item_size
        if current_file_size > max_file_size:
            file_counter += 1
            output_file = f"{output_path}/{output_prefix}_{file_counter}.jsonl"
            out = open(output_file, 'w', encoding='utf-8')
            current_file_size = 0

0it [00:00, ?it/s]


In [11]:
print(output_file)

/content/translate/WizardLM_tr_1.jsonl
