In [1]:
from datasets import load_dataset
# Load the 'bot-yaya/UN_PDF_SUBSET_PREPROCESSED' dataset with Hugging Face `datasets`
# (the recorded output shows it being served from the local cache).
dataset = load_dataset('bot-yaya/UN_PDF_SUBSET_PREPROCESSED')

Found cached dataset parquet (/home/jia/.cache/huggingface/datasets/bot-yaya___parquet/bot-yaya--UN_PDF_SUBSET_PREPROCESSED-5216c9199c92efc3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# Work on a small sample: keep only the first 50 rows of the train split.
ds = dataset['train'].select(range(50))
# Drop the non-English language columns; 'en' and 'record' remain.
ds = ds.remove_columns(['zh', 'fr', 'es', 'ru'])

In [3]:
# Display the dataset summary (features and number of rows).
ds

Dataset({
    features: ['en', 'record'],
    num_rows: 50
})

In [4]:
# Add a new column to ds, `is_hard_linebreak`: a list of bools meant to hold one
# flag per line break of the `en` text (True = hard break, False = soft break).
# It is only initialised to an empty list for every row here.
ds = ds.map(lambda example: {'is_hard_linebreak': []})
ds

Loading cached processed dataset at /home/jia/.cache/huggingface/datasets/bot-yaya___parquet/bot-yaya--UN_PDF_SUBSET_PREPROCESSED-5216c9199c92efc3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-d36fd9ddf80f84f4.arrow


Dataset({
    features: ['en', 'record', 'is_hard_linebreak'],
    num_rows: 50
})

In [46]:
import json

def dump_to_jsonl(data, file_path):
    """Write every item of ``data`` to ``file_path`` in JSON Lines format.

    Args:
        data: iterable of JSON-serialisable objects.
        file_path: destination path; an existing file is overwritten.

    Each item becomes one JSON document followed by a newline.
    """
    with open(file_path, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)
            
def compare_breaks(raw_text, output_text):
    """Label every line break of ``raw_text`` as hard (True) or soft (False).

    The two texts are compared position by position, so ``output_text`` must be
    ``raw_text`` with some newline characters replaced by exactly one other
    character (e.g. a space). A break is hard when the newline is still present
    at the same position in ``output_text``.

    Args:
        raw_text (str): original text containing the line breaks to label.
        output_text (str): re-flowed text of the same length as ``raw_text``.

    Returns:
        list[bool]: one flag per newline of ``raw_text``, in order of appearance.

    Raises:
        ValueError: if the two texts differ in length, because the positional
            comparison would then be out of range or silently misaligned.
    """
    print(len(raw_text), len(output_text))
    if len(raw_text) != len(output_text):
        raise ValueError(
            "raw_text and output_text must have the same length for a positional "
            f"comparison, got {len(raw_text)} and {len(output_text)}"
        )
    return [
        out_char == '\n'
        for raw_char, out_char in zip(raw_text, output_text)
        if raw_char == '\n'
    ]

# Toy example: output_text is raw_text with the 1st and 4th line breaks replaced by a space.
raw_text = "My name is\nNicola.\nHow are you doing ?\nMy name is Axel. I am\nfine, thanks"
output_text = "My name is Nicola.\nHow are you doing ?\nMy name is Axel. I am fine, thanks"

# Recorded result: [False, True, True, False]
print(compare_breaks(raw_text, output_text))


73 73
[False, True, True, False]


In [3]:
import pylcs # 建议使用https://github.com/voidf/pylcs这个分支的代码，降低内存使用
from typing import Tuple

def lcs_sequence_alignment(ibatch: list[str] | str, obatch: list[str] | str) -> Tuple[dict[int, set[int]], list[float], list[float]]:
    """将ibatch每行的单词用最长公共子序列对齐到obatch每行的单词中。
    
    Args:
        ibatch(str): 输入的一段话
        obatch(str): chatgpt给对齐好的一段话
    
    Returns:
        mapping(dict[int, set[int]]): 输出行号对应输入的行号
        irate(list[float]): 输入每行的匹配率（匹配的单词总长度/本行总单词总长度）
        orate(list[float]): 输出每行的匹配率
    """
    if isinstance(ibatch, str):
        ibatch = ibatch.splitlines()
    if isinstance(obatch, str):
        obatch = obatch.splitlines()
    offset = 19968
    dic = {}
    
    ibuf = [] # 输入token
    ilen = []

    obuf = []
    olen = []
    # 手写的token转换，优化lcs的效率，这里换成中文字形式编码这些token，只判等
    offset = 19968 # 中文unicode起点
    dic = {}
    for ilineid, iline in enumerate(ibatch):
        sp = iline.split()
        ilen.append(sum(map(len, sp)))
        for i in sp:
            ibuf.append((
                chr(offset + dic.setdefault(i, len(dic))),
                len(i),
                ilineid,
                ))
    
    for olineid, oline in enumerate(obatch):
        sp = oline.split()
        olen.append(sum(map(len, sp)))
        for i in oline.split():
            if i in dic: # 为子序列写的优化
                obuf.append((
                    chr(offset + dic[i]),
                    len(i),
                    olineid,
                    ))
    

    irate = [0 for _ in ilen]
    orate = [0 for _ in olen]

    n1 = ''.join(map(lambda x: x[0], ibuf))
    n2 = ''.join(map(lambda x: x[0], obuf))
    # print(f'n1:{len(n1)}, n2:{len(n2)}')
    idxs = pylcs.lcs_sequence_idx(n1, n2)
    mapping = {}
    for iidx, oidx in enumerate(idxs):
        if oidx != -1:
            _, iklen, ikgroup = ibuf[iidx]
            _, oklen, okgroup = obuf[oidx]
            mapping.setdefault(okgroup, set()).add(ikgroup)
            irate[ikgroup] += iklen
            orate[okgroup] += oklen
    
    for p, i in enumerate(irate):
        irate[p] = i / ilen[p]
    for p, i in enumerate(orate):
        orate[p] = i / olen[p]

    # 额外处理：匹配率低于50%的olineid不要
    print(mapping)
    print('orate', orate)
    for p, i in enumerate(orate):
        if i < 0.5:
            if p in mapping:
                mapping.pop(p)

    return mapping, irate, orate

def get_br_indexes_from_alignmap(align_map: dict[int, set[int]]) -> list[int]:
    """Return, in ascending order, the input line indexes followed by a removable break.

    Index ``i`` is reported whenever ``i`` and ``i + 1`` both belong to the same
    group of ``align_map``, i.e. two consecutive input lines were aligned to the
    same output line.

    Args:
        align_map: output line index -> set of input line indexes aligned to it.

    Returns:
        Sorted list of input line indexes ``i`` such that ``i + 1`` shares a group
        with ``i``.
    """
    return sorted(
        line_no
        for aligned_lines in align_map.values()
        for line_no in aligned_lines
        if line_no + 1 in aligned_lines
    )

def _split_on_newline(text: str) -> list[str]:
    """Split ``text`` on the newline character only, ignoring one trailing newline."""
    lines = text.split('\n')
    if lines[-1] == '':
        lines.pop()
    return lines


def compare_breaks_v2(raw_text: str, output_text: str) -> list[bool]:
    """
    Use the O(mn) LCS alignment to find the indexes of the line breaks to remove.
    Unlike a positional comparison, this tolerates some labelling mistakes in
    ``output_text``, which leaves room for manual edits.

    A conspicuous unicode character can be used to visualise the line breaks that
    were removed from a text merged by any method. Such a marker is separated from
    the original text by spaces and hardly affects the indexes computed here, so
    one can see where breaks were removed, manually fix the labelled text and
    recompute the indexes with this function.

    Example: Never gonna\\ngive you\\nup\\nNever gonna\\nlet you\\ndown
    After merging: Never gonna give you up\\nNever gonna let you down
    Visualised as: Never gonna ❤ give you ❤ up\\nNever gonna ❤ let you ❤ down

    Args:
        raw_text: original text whose line breaks are to be labelled.
        output_text: re-flowed version of ``raw_text``.

    Returns:
        One flag per newline character of ``raw_text``: True for a hard break
        (kept), False for a soft break (the lines on both sides were aligned to
        the same output line).
    """
    # Split on the newline character only: str.splitlines() also breaks on form
    # feeds, lone carriage returns, U+2028, ... which would make the line indexes
    # disagree with raw_text.count('\n') below.
    align_map, _, _ = lcs_sequence_alignment(
        _split_on_newline(raw_text),
        _split_on_newline(output_text),
    )
    br = get_br_indexes_from_alignmap(align_map)
    is_hard_line_break = [True] * raw_text.count('\n')
    for i in br:
        is_hard_line_break[i] = False
    return is_hard_line_break

# Demo: the same merge expressed as plain text and with a visible marker in place of
# each removed line break should give identical labels.
raw_text = "Never gonna\ngive you\nup\nNever gonna\nlet you\ndown"
output_text = "Never gonna give you up\nNever gonna let you down"
output_text_heart = "Never gonna ❤ give you ❤ up\nNever gonna ❤ let you ❤ down"
result_type1 = compare_breaks_v2(raw_text, output_text)
result_type2 = compare_breaks_v2(raw_text, output_text_heart)
# Recorded result: True
print(result_type1 == result_type2)
    

{0: {0, 1, 2}, 1: {3, 4, 5}}
orate [1.0, 1.0]
{0: {0, 1, 2}, 1: {3, 4, 5}}
orate [0.9047619047619048, 0.9090909090909091]
True


In [34]:
# Accumulates the labelled cases; each entry is a dict with "raw_text" and "is_hard_linebreak".
validation_data = []

In [35]:
# Case 1 input: the raw text with its original line breaks.
raw_text = """General Assembly Distr.: Limited
28 November 2001
Original: English
01-66657 (E) 291101
*0166657*
Fifty-sixth session
Third Committee
Agenda item 119 (b)
Human rights questions: human rights questions, including
alternative approaches for improving the effective enjoyment
of human rights and fundamental freedoms
South Africa:* draft resolution
The right to development
The General Assembly,
Guided by the Charter of the United Nations, expressing, in particular, the
determination to promote social progress and better standards of life in larger
freedom as well as to employ international mechanisms for the promotion of the
economic and social advancement of all peoples,
Recalling that the Declaration on the Right to Development, adopted by the
General Assembly in its resolution 41/128 of 4 December 1986, confirmed that the
right to development is an inalienable human right and that equality of opportunity
for development is a prerogative both of nations and of individuals, who make up
nations,"""

In [36]:
# Case 1 gold output: the same text with the line breaks inside the two running
# paragraphs replaced by spaces; all other line breaks are kept.
expected_output = """General Assembly Distr.: Limited
28 November 2001
Original: English
01-66657 (E) 291101
*0166657*
Fifty-sixth session
Third Committee
Agenda item 119 (b)
Human rights questions: human rights questions, including
alternative approaches for improving the effective enjoyment
of human rights and fundamental freedoms
South Africa:* draft resolution
The right to development
The General Assembly,
Guided by the Charter of the United Nations, expressing, in particular, the determination to promote social progress and better standards of life in larger freedom as well as to employ international mechanisms for the promotion of the economic and social advancement of all peoples,
Recalling that the Declaration on the Right to Development, adopted by the General Assembly in its resolution 41/128 of 4 December 1986, confirmed that the right to development is an inalienable human right and that equality of opportunity for development is a prerogative both of nations and of individuals, who make up nations,"""

In [37]:
# Record case 1: the raw text plus one hard/soft flag per line break, obtained by
# the positional comparison against the gold output.
validation_data.append({
    "raw_text": raw_text,
    "is_hard_linebreak": compare_breaks(raw_text, expected_output),
})

# Case 2

In [41]:
# Case 2 input: second row of the sampled dataset; print its English text for inspection.
sample = ds[1]
print(sample['en'])

United Nations A/CONF.192/PC/L.3
General Assembly Distr.: Limited
1 December 2000
Original: English
00-77708 (E) 141200
*0077708*
Preparatory Committee for the United Nations
Conference on the Illicit Trade in Small Arms
and Light Weapons in All Its Aspects
Second session
8-19 January 2001
Draft Objective of the Conference
Working paper by the Chairman of the Preparatory Committee
The Preparatory Committee recommends that the objective of the Conference
should be to develop and strengthen international efforts to prevent, combat and
eradicate the illicit trade in small arms and light weapons in all its aspects. To this
end, the aims of the Conference should be:
– To strengthen or develop norms at the global, regional and national levels that
would reinforce and further coordinate efforts to prevent and combat the illicit
trade in small arms and light weapons in all its aspects;
– To develop agreed international measures to prevent and combat illicit arms
trafficking in and manufacturin

In [42]:
# Case 2 gold output: header lines kept as-is, each running paragraph / bullet item
# merged onto a single line.
expected_output = """United Nations A/CONF.192/PC/L.3
General Assembly Distr.: Limited
1 December 2000
Original: English
00-77708 (E) 141200
*0077708*
Preparatory Committee for the United Nations
Conference on the Illicit Trade in Small Arms
and Light Weapons in All Its Aspects
Second session
8-19 January 2001
Draft Objective of the Conference
Working paper by the Chairman of the Preparatory Committee
The Preparatory Committee recommends that the objective of the Conference should be to develop and strengthen international efforts to prevent, combat and eradicate the illicit trade in small arms and light weapons in all its aspects. To this end, the aims of the Conference should be:
– To strengthen or develop norms at the global, regional and national levels that would reinforce and further coordinate efforts to prevent and combat the illicit trade in small arms and light weapons in all its aspects;
– To develop agreed international measures to prevent and combat illicit arms trafficking in and manufacturing of small arms and light weapons and to reduce excessive and destabilizing accumulations and transfers of such weapons throughout the world;
– To put particular emphasis on the regions of the world where conflicts come to an end and where serious problems with the proliferation of small arms and light weapons have to be dealt with urgently;
– To mobilize the political will throughout the international community to prevent and combat illicit transfers in and manufacturing of small arms and light weapons in all their aspects, and raise awareness of the character and seriousness of the interrelated problems associated with the illicit trafficking in and manufacture of small arms and light weapons and the excessive and destabilizing accumulation and spread of these weapons;
– To promote responsibility by States with a view to preventing the illicit export, import, transit and retransfer of small arms and light weapons."""

In [44]:
# Record case 2 the same way as case 1 (the recorded output shows both texts are 1929 characters long).
validation_data.append({
    "raw_text": sample['en'],
    "is_hard_linebreak": compare_breaks(sample['en'], expected_output),
})

1929 1929


In [47]:
# Persist the labelled cases collected so far as a JSON Lines file.
dump_to_jsonl(validation_data, 'validation_small.jsonl')

In [49]:
# Number of labelled cases collected so far.
len(validation_data)

2

# Case 3

In [45]:
# Case 3 input: third row of the sampled dataset; print its English text for inspection.
sample = ds[2]
print(sample['en'])

United Nations A/61/917
General Assembly Distr.: General
21 May 2007
Original: English
07-35147 (E) 230507
*0735147*
Sixty-first session
Agenda items 68 and 117
Report of the Human Rights Council
Programme budget for the biennium 2006-2007
Reports of the Secretary-General on the revised
estimates resulting from decision S-4/101 adopted
by the Human Rights Council at its fourth special
session in 2006 (A/61/530/Add.2) and on the revised
estimates resulting from resolutions adopted by the
Council at its fourth session in 2007 (A/61/530/Add.3)
Report of the Advisory Committee on Administrative and
Budgetary Questions
1. The Advisory Committee has considered the reports of the Secretary-General
on the revised estimates resulting from decision S-4/101 adopted by the Human
Rights Council at its fourth special session in 2006 (A/61/530/Add.2) and on the
revised estimates resulting from resolutions adopted by the Council at its fourth
session in 2007 (A/61/530/Add.3).
2. As noted in the report