#### labse预训练模型下载地址：https://huggingface.co/sentence-transformers/LaBSE

In [1]:
## 下面是alignment_extractor.py文件

import glob
import torch
import argparse
import torch.nn.functional as F
from transformers import BertModel, BertTokenizerFast
from datetime import datetime
import sys
import os
import json as js
import pdb


def read_in_chunks(file_path):
	file_object = open(file_path)
	while True:
		line = file_object.readline()
		if not line:
			break
			yield line
			
def get_sent_len(txt):
	tokenized = tokenizer(txt,return_tensors="pt",padding=True).to(device)
				
				
def get_emb_and_sent_chunk(batch_lines):
	"""
		given a chunk of sentences, return the predicted matrix of sentences in chunk
	"""
	batch_inputs = tokenizer(batch_lines,return_tensors='pt',padding=True,truncation=True).to(device)
	with torch.no_grad():
		batch_outputs = model(**batch_inputs)
	return batch_outputs[1]

if __name__ =="__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("--local_rank",type=int)
	parser.add_argument("--root-path",help='root path for workspace')
	parser.add_argument("--threshold",help='threshold for filtering, only select pairs which have higher values')
	parser.add_argument("--model-path',help='path for the pretrained model")
	parser.add_argument("--src-lang',help='source lang")
	parser.add_argument("--tgt-lang',help='target lang")
	parser.add_argument("--indentifier",help="identifier for wild-char matching. eg. article.01.sent-seg.zh, 01.sent-seg could be an identifier. file will be selected by *.identifier.lang")
	parser.add_argument("--output-path",help="path of the output lines")
	parser.add_argument("--output-filter-path",help="path of the output file")
	parser.add_argument("--output-filter-path","path of filted file")
	parser.add_argument("--embed-only",action="store_true",help="only claculate embed of source senteces")
	parser.add_argument("--f-content-prefix",help='path of file content stored in a json file')

	args = parser.parse_args()

	file_root_path  = args.root_path
	threshold = float(args.threshold)
	model_path = args.model_path
	tokenizer = BertTokenizerFast.from_pretrained(model_path,model_max_length=512)
	model = BertModel.from_pretrained(model_path)
	device = "cuda" if torch.cuda.is_available() else 'cpu'
	if torch.cuda.device_count() > 1:
		print("using one cuda device")
	model = model.to(device)
	src_lang, tgt_lang = args.src_lang, args.tgt_lang

	identifier = args.identifier
	out_file = args.output_path
	out_filter = args.output_filter_path
	model = model.eval()

	start = datetime.now()
	with open(f"{file_root_path}/{out_file}","w") as aligned_file, open("{file_root_path}/{out_filter}","w") as f_filt:
		n_total_tgt_lines = 0
		n_matched_tgt_lines = 0

		src_lang_files = sorted(glob.glob(os.path.join(file_root_path,f'*.{identifier}.{src_lang}')))
		tgt_lang_files = sorted(glob.glob(os.path.join(file_root_path,f'*.{identifier}.{tgt_lang}')))
		assert len(src_lang_files) == len(tgt_lang_files)

	for src_file, tgt_file in zip(src_lang_files,tgt_lang_files):
		filename = os.path.basename(src_file)[:-3]
		try:
			with open(f"{args.f_content_prefix}.{src_lang}") as f_s , open(f"{args.f_content_prefix}.{tgt_lang}") as f_t:
				src_lines = js.load(f_s)
				tgt_lines = js.load(f_t)
				print("load json file")
		except:
			src_lines  = [x.strip() for x in read_in_chunks(src_file)]
			tgt_lines  = [x.strip() for x in read_in_chunks(tgt_file)]
			if args.f_content_prefix:
				with open(f"{args.f_content_prefix}.{src_lang}") as f_s , open(f"{args.f_content_prefix}.{tgt_lang}") as f_t:
					js.dump(src_lines,f_s)
					js.dump(tgt_lines,f_t)
					print("stored file content to a json file")
		step = 100
		for i in range(0,len(src_lines),step):
			src_batch_embeddings, tgt_batch_embeddings = [],[]
			src_batch_sents = src_lines[i:i+step]
			tgt_batch_sents = tgt_lines[i:i+step]
			assert len(src_batch_sents) == len(tgt_batch_sents)

			src_batch_embeddings = get_emb_and_sent_chunk(src_batch_sents)
			src_batch_embeddings = F.normalize(src_batch_embeddings,p=2)
			batch_right = step if i+ step <= len(src_lines) else len(src_lines) -i

			if args.emb_only:
				src_batch_embeddings = src_batch_embeddings.tolist()
				for idx in range(batch_right):
					emb = ' '.join([str(x) for x in src_batch_embeddings[idx]])
					aligned_file.write(f"{src_batch_sents[idx]}\t{filename}\t{i+idx+1}\t{emb}\n")
				continue
			tgt_batch_embeddings = get_emb_and_sent_chunk(tgt_batch_sents)
			tgt_batch_embeddings = F.normalize(tgt_batch_embeddings,p=2)
			n_total_tgt_lines +=len(src_batch_sents)

			batch_scores = torch.matmul(tgt_batch_embeddings,src_batch_embeddings.transpose(0,1))
			for idx in range(batch_right):
				try:
					score = batch_scores[idx][idx].item()
				except:
					print("index %d of file %s : "%(idx,filename))
					print("{}\t{}".format(src_batch_sents[idx],tgt_batch_sents[idx]))
					print("shape of matrix:", batch_scores.shape)
				if score > threshold:
					aligned_file.write("{}\t{}\t{}\t{}\t{}\n".format(score,src_batch_sents[idx],tgt_batch_sents[idx],filename,i+idx+1))
				else:
					f_filt.write("{}\t{}\t{}\t{}\t{}\n".format(score,src_batch_sents[idx],tgt_batch_sents[idx],filename,i+idx+1))
		
	end = datetime.now()
	seconds = (end-start).seconds
	print("f{len(src_lang_files)} file processed, took {seconds} seconds")


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: CUDA runtime path found: /share/jinchang/miniconda3/envs/llama2/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /share/jinchang/miniconda3/envs/llama2/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...
[2023-09-20 14:05:16,590] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


ValueError: invalid option string 'path of filted file': must start with a character '-'

In [None]:
## score.sh 脚本
## 需要在$PATH/data目录下放两个文件，一个是源端句子en-zh.en ,一个是目标端句子en-zh.zh，会生成一个en-zh.out文件


PATH=~/score_Labse
cd $PATH/data

CUDA_VISIBLE_DEVICES=0 python $PATH/alignment_extractor.py\
--root-path .\
--threshold 0 \
--model-path $path/Labse\
--src-lang en \
--tgt-lang zh \
--identifier en-zh \
--output-path en-zh.out \
--output-filter-path en-zh.filter \
--f-content-prefix en-zh.json 

rm -rf $PATH/data/en-zh.json*