# Toxic Spans Detection Dataset

In [2]:
import ast
import os
from collections import defaultdict
os.chdir('/home/s3/hyeryung/mucoco')

from transformers import AutoTokenizer
import pandas as pd

In [3]:
def get_token2index(tmp_text, tmp_tokens, spans):
    """
    Find which tokens cover a set of character offsets of a text.

    The text is walked one character at a time while a cursor moves through
    ``tmp_tokens``. Each character is first rewritten (' ' -> 'Ġ', '\\n' -> 'Ċ',
    '’' -> 'â') and compared with the next unconsumed character of the current
    token; on a match the character offset is attributed to that token.
    NOTE(review): the rewrites look like the byte-level BPE surface forms used
    by the tokenizer that produced ``tmp_tokens`` — confirm against the caller.

    @param tmp_text: The raw text string.
    @param tmp_tokens: List of token strings for ``tmp_text``.
    @param spans: Iterable of character offsets into ``tmp_text``.
    @return: Sorted list of unique token indices that were matched to at least
             one offset in ``spans``. Offsets that were not attributed to any
             token are ignored. An empty token list yields an empty result.
    """
    # Robustness: with no tokens there is nothing to align against. The
    # previous implementation raised IndexError on ``tmp_tokens[0]`` here.
    if len(tmp_tokens) == 0:
        return []

    j = 0                      # index of the token currently being consumed
    tmp_token = tmp_tokens[j]  # not-yet-matched remainder of that token
    char2token = {}            # character offset -> index of the matched token
    for i in range(len(tmp_text)):
        # The rewritten character does not change inside the retry loop below,
        # so compute it once per text position.
        ch = tmp_text[i].replace(' ', 'Ġ').replace('\n', 'Ċ').replace('’', 'â')
        while True:
            if ch == tmp_token[0]:
                tmp_token = tmp_token[1:]
                char2token[i] = j
                if len(tmp_token) == 0:
                    # Token fully consumed: move the cursor to the next one.
                    j += 1
                    if j >= len(tmp_tokens):
                        break
                    tmp_token = tmp_tokens[j]
                break
            else:
                # Mismatch: abandon the rest of this token and retry the same
                # character against the next token.
                j += 1
                if j >= len(tmp_tokens):
                    break
                tmp_token = tmp_tokens[j]
        if j >= len(tmp_tokens):
            break

    # Every offset belongs to at most one token, so a direct dictionary lookup
    # replaces the former scan over all tokens for every span offset.
    return sorted({char2token[sp] for sp in spans if sp in char2token})

In [4]:
def get_tok2char(row: pd.Series) -> dict:
    """
    Build a mapping from each token's position to the character offsets it occupies
    in the decoded text.

    The offsets are obtained by decoding ever-longer prefixes of row['tokens'] with the
    module-level `tokenizer` and measuring how much the decoded string grows each time.

    @param row: A row from dataframe; its 'tokens' entry is the sequence passed to tokenizer.decode.
    @return tok2char: A dictionary with token's location index as keys and tuples of corresponding character offsets as values.

    Example:
    row=pd.Series()
    row['tokens']=[86, 6648, 1830, 290, 11386, 25998, 278, 7510, 466, 314, 5465, 8223, 5762, 1830, 13]
    tok2char=get_tok2char(row)
    tok2char
    {0: (0,),
     1: (1, 2, 3, 4, 5, 6),
     2: (7, 8, 9, 10, 11, 12),
     3: (13, 14, 15, 16),
     ...
     13: (59, 60, 61, 62, 63, 64),
     14: (65,)}
    """
    tok2char = {}
    prev_end = 0  # length of the text decoded from the tokens before the current one

    for position in range(len(row['tokens'])):
        # Decode the prefix that ends with the current token.
        prefix_text = tokenizer.decode(row['tokens'][:position + 1])
        cur_end = len(prefix_text)
        tok2char[position] = tuple(range(prev_end, cur_end))
        prev_end = cur_end

    return tok2char

In [2]:
# NOTE(review): the original call passed no checkpoint name, which raises a
# TypeError (the model name/path argument is required). "roberta-base" is
# inferred from the files downloaded in this cell's recorded output
# (config.json, vocab.json, merges.txt, tokenizer.json) — confirm that this is
# the intended checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [279]:
data_path = "data/toxicity/toxic_spans/SemEval2021/data/tsd_test.csv"
data = pd.read_csv(data_path)

In [280]:
data['tokens']=data['text'].apply(lambda x: tokenizer.tokenize(x))

In [281]:
# The 'spans' column is read from CSV as text. ast.literal_eval only parses
# Python literals, so file contents can never be executed as code (unlike eval).
data['spans']=data['spans'].apply(ast.literal_eval)

In [283]:
data['spans_tokens']=data.apply(lambda x: get_token2index(x['text'], x['tokens'], x['spans']),axis=1)

In [285]:
data.to_json("data/toxicity/toxic_spans/SemEval2021/data/tsd_test_spans.jsonl",lines=True,orient="records")

In [286]:
data_path = "data/toxicity/toxic_spans/SemEval2021/data/tsd_train.csv"
data = pd.read_csv(data_path)

In [287]:
data['tokens']=data['text'].apply(lambda x: tokenizer.tokenize(x))

In [288]:
# The 'spans' column is read from CSV as text. ast.literal_eval only parses
# Python literals, so file contents can never be executed as code (unlike eval).
data['spans']=data['spans'].apply(ast.literal_eval)

In [289]:
data['spans_tokens']=data.apply(lambda x: get_token2index(x['text'], x['tokens'], x['spans']),axis=1)

In [290]:
data.to_json("data/toxicity/toxic_spans/SemEval2021/data/tsd_train_spans.jsonl",lines=True,orient="records")

In [291]:
data_path = "data/toxicity/toxic_spans/SemEval2021/data/tsd_trial.csv"
data = pd.read_csv(data_path)

In [292]:
data['tokens']=data['text'].apply(lambda x: tokenizer.tokenize(x))

In [293]:
# The 'spans' column is read from CSV as text. ast.literal_eval only parses
# Python literals, so file contents can never be executed as code (unlike eval).
data['spans']=data['spans'].apply(ast.literal_eval)

In [294]:
data['spans_tokens']=data.apply(lambda x: get_token2index(x['text'], x['tokens'], x['spans']),axis=1)

In [295]:
data.to_json("data/toxicity/toxic_spans/SemEval2021/data/tsd_trial_spans.jsonl",lines=True,orient="records")

# MQM Dataset

In [1]:
import os
from collections import defaultdict
os.chdir('/home/s3/hyeryung/mucoco')

from transformers import AutoTokenizer, AddedToken
import pandas as pd
import numpy as np
import torch

## EN-DE

In [213]:
data_path = 'data/MT/wmt-mqm-human-evaluation/newstest2020/ende/mqm_newstest2020_ende.tsv'
data = pd.read_csv(data_path, sep='\t', quoting=3)

In [214]:
tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")

In [215]:
tokenizer.add_tokens([AddedToken("<v>", lstrip=True), AddedToken("</v>", lstrip=True)])

2

In [216]:
data['target_tokens']=data['target'].apply(lambda x: tokenizer.encode(x,add_special_tokens=False))

In [217]:
data['target_clean']=data['target'].str.replace("<v>","").str.replace("</v>","")

In [218]:
data['target_clean_tokens']=data['target_clean'].apply(lambda x: tokenizer.encode(x,add_special_tokens=False))

In [219]:
tokenizer.vocab['</v>']

250003

In [220]:
def index_wo_error(x, y):
    """
    Return the position of the first occurrence of y in x, or -1 when y is absent.

    @param x: An object exposing ``.index`` (used in this notebook with lists and strings).
    @param y: The element (or substring) to look for.
    @return: ``x.index(y)``, or -1 if ``.index`` raises ValueError.
    """
    try:
        return x.index(y)
    except ValueError:
        # Only "not found" is mapped to -1. Anything else (wrong types,
        # KeyboardInterrupt, ...) propagates instead of being hidden by a bare except.
        return -1
# Look up the id of the added "<v>" marker instead of hard-coding 250002, so
# this cell stays correct if the tokenizer's vocabulary changes.
start_token_id = tokenizer.vocab['<v>']
data['label_start']=data['target_tokens'].apply(lambda x: index_wo_error(x, start_token_id))

In [221]:
# Look up the id of the added "</v>" marker instead of hard-coding 250003.
# The -1 accounts for the "<v>" token that precedes "</v>" in target_tokens
# but is absent from target_clean_tokens.
end_token_id = tokenizer.vocab['</v>']
data['label_end']=data['target_tokens'].apply(lambda x: index_wo_error(x, end_token_id)-1)

In [222]:
# def get_span(row):
#     return row['target_clean_tokens'][row['label_start']:row['label_end']]

# data['target_span']=data.apply(get_span, axis=1) ## finds the span quite well.

In [223]:
# for i, row in data.iterrows():
#     if row['severity']=='no-error':
#         continue
#     if i > 100:
#         break
#     print(f"{i}: {row['target_tokens']} => found span: {row['target_span']}")

In [224]:
def get_label(row):
    """
    Build a 0/1 label for every position of row['target_clean_tokens'].

    @param row: Mapping with 'label_start', 'label_end' and 'target_clean_tokens'.
    @return: List with 1 at positions i where label_start <= i < label_end, 0 elsewhere.
    """
    n_tokens = len(row['target_clean_tokens'])
    first, past_last = row['label_start'], row['label_end']
    flags = []
    for position in range(n_tokens):
        inside = first <= position < past_last
        flags.append(1 if inside else 0)
    return flags

data['labels']= data.apply(get_label, axis=1)

In [225]:
# for i, row in data.iterrows():
#     if row['severity']=='no-error':
#         continue
#     if i > 100:
#         break
#     print(f"{i}: {row['target_tokens']} => label: {row['labels']} => applying label: {torch.masked_select(torch.Tensor(row['target_clean_tokens']), torch.Tensor(row['labels']).bool())}")

In [226]:
data['labels']=data['labels'].apply(np.array)

In [227]:
labels = data.groupby(['system', 'doc_id','seg_id','source','target_clean'])['labels'].agg(lambda x: np.mean(x,axis=0)).reset_index()

In [228]:
labels['no_error']=labels['labels'].apply(lambda x: True if np.sum(x) == 0 else False)

In [229]:
num_labelers = data.groupby(['system', 'doc_id','seg_id','source','target_clean'])['labels'].size().reset_index().rename(columns={'labels':'num_annotations'})

In [230]:
labels = pd.merge(labels, num_labelers,how='left')

In [231]:
labels.loc[17824,'target_clean']

'Die Familien-Reality-Show Keeping Up With The Kardashians zeigte jedoch später, dass die Veranstaltung mit Khloe in Tränen endete, als sie sich mit seiner Untreue abmühte.'

In [232]:
labels.loc[17825,'target_clean']

'Die Familien-Reality-Show des Stars Keeping Up With The Kardashians zeigte jedoch später, dass die Veranstaltung mit Khloe in Tränen endete, als sie sich mit seiner Untreue abmühte.'

In [233]:
## drop rows with duplicate labels
print(f"# rows before drop dup: {len(labels)}")
labels = labels.drop_duplicates(subset=['system', 'doc_id','seg_id'],keep=False)
print(f"# rows after drop dup: {len(labels)}")

# rows before drop dup: 17829
# rows after drop dup: 11280


In [234]:
## drop rows with no error
print(f"# rows before drop no error: {len(labels)}")
labels = labels.loc[labels['no_error']==False].copy()
print(f"# rows after drop no error: {len(labels)}")

# rows before drop no error: 11280
# rows after drop no error: 9520


In [235]:
labels=labels.rename(columns={'labels':'raw_labels'})

In [236]:
labels=labels.rename(columns={'no_error':'raw_no_error'})

In [237]:
labels['labels']=labels['raw_labels'].apply(lambda x: np.select(condlist=[x>=0.5, x<0.5], choicelist=[1, 0], 
    default=np.nan))

In [238]:
# for i, row in labels.iterrows():
#     if i > 100:
#         break
#     print(f"{i}: {row['raw_labels']} => label: {row['labels']}")

In [239]:
labels['no_error']=labels['labels'].apply(lambda x: True if np.sum(x) == 0 else False)

In [240]:
labels['no_error'].value_counts()

no_error
True     6565
False    2955
Name: count, dtype: int64

In [241]:
labels.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_ende.jsonl",lines=True,orient="records")

In [242]:
labels_error = labels.loc[labels['no_error']==False,:]

In [243]:
labels_error.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_ende_with_error.jsonl",lines=True,orient="records")

### Add labels at character level

In [250]:
data_path = 'data/MT/wmt-mqm-human-evaluation/newstest2020/ende/mqm_newstest2020_ende.tsv'
data = pd.read_csv(data_path, sep='\t', quoting=3)
data['target_clean']=data['target'].str.replace("<v>","").str.replace("</v>","")

In [245]:
def index_wo_error(x, y):
    """
    Return the position of the first occurrence of y in x, or -1 when y is absent.

    @param x: An object exposing ``.index`` (used in this notebook with lists and strings).
    @param y: The element (or substring) to look for.
    @return: ``x.index(y)``, or -1 if ``.index`` raises ValueError.
    """
    try:
        return x.index(y)
    except ValueError:
        # Only "not found" is mapped to -1. Anything else (wrong types,
        # KeyboardInterrupt, ...) propagates instead of being hidden by a bare except.
        return -1

In [246]:
# def get_span_char(row):

#     start_index = index_wo_error(row['target'], '<v>')
#     end_index = index_wo_error(row['target'], '</v>')-3
#     return row['target_clean'][start_index:end_index]
    
# span_char = data.apply(get_span_char,axis=1)

In [247]:
# for i, row in data.iterrows():
#     if row['severity']=='no-error':
#         continue
#     if i > 100:
#         break
#     print(f"{i}: {row['target']} => found span: {span_char[i]}")

In [251]:
def get_labels_char(row):
    """
    Build a 0/1 label for every character of row['target_clean'].

    @param row: Mapping with 'target' (string that may contain "<v>" / "</v>" markers)
                and 'target_clean' (the same string without the markers).
    @return: List with 1 for characters between the markers, 0 elsewhere.
             When a marker is missing index_wo_error yields -1, which makes the
             labelled range empty.
    """
    first_char = index_wo_error(row['target'], '<v>')
    # The "</v>" position is measured in the marked string; the 3 subtracted is
    # presumably len("<v>"), shifting it into target_clean coordinates.
    past_last_char = index_wo_error(row['target'], '</v>') - 3

    flags = []
    for position in range(len(row['target_clean'])):
        flags.append(1 if first_char <= position < past_last_char else 0)
    return flags

In [252]:
data['labels_char']=data.apply(get_labels_char,axis=1)

In [253]:
data['labels_char']=data['labels_char'].apply(np.array)

In [254]:
labels_char = data.groupby(['system', 'doc_id','seg_id','source','target_clean'])['labels_char'].agg(lambda x: np.mean(x,axis=0)).reset_index()

In [255]:
labels_char['no_error']=labels_char['labels_char'].apply(lambda x: True if np.sum(x) == 0 else False)

In [256]:
## drop rows with duplicate labels
print(f"# rows before drop dup: {len(labels_char)}")
labels_char = labels_char.drop_duplicates(subset=['system', 'doc_id','seg_id'],keep=False)
print(f"# rows after drop dup: {len(labels_char)}")

# rows before drop dup: 17829
# rows after drop dup: 11280


In [257]:
## drop rows with no error
print(f"# rows before drop no error: {len(labels_char)}")
labels_char = labels_char.loc[labels_char['no_error']==False].copy()
print(f"# rows after drop no error: {len(labels_char)}")

# rows before drop no error: 11280
# rows after drop no error: 9573


In [258]:
labels_char=labels_char.rename(columns={'labels_char':'raw_labels_char'})

In [259]:
labels_char=labels_char.rename(columns={'no_error':'raw_no_error'})

In [260]:
labels_char['labels_char']=labels_char['raw_labels_char'].apply(lambda x: np.select(condlist=[x>=0.5, x<0.5], choicelist=[1, 0], 
    default=np.nan))

In [261]:
labels_char['no_error']=labels_char['labels_char'].apply(lambda x: True if np.sum(x) == 0 else False)

In [262]:
labels_char['no_error'].value_counts()

no_error
True     6577
False    2996
Name: count, dtype: int64

In [263]:
## Checked why the no_error statistics based on labels_char and on labels differ -> usually related to cases where a space is labeled.
## Conclusion: our method cannot label punctuation anyway, so it makes sense to evaluate only on rows that contain an error at the token level.
## Conclusion 2: however, the tokenizer sometimes adds a meta character arbitrarily, so some token-level labels contain errors.
##      Therefore, only rows whose token-level and character-level no_error flags agree will be used.

## debug_df = pd.merge(labels, labels_char, on=['system', 'doc_id','seg_id','source','target_clean'], how='left')
## debug_df = debug_df.loc[debug_df['no_error_x']!=debug_df['no_error_y'],:].copy()
## debug_df_raw = pd.merge(data, debug_df, on=['system', 'doc_id','seg_id','source','target_clean'],how='inner')
## debug_df_raw[ 'severity'].value_counts()
# severity
# Minor       178
# no-error     32
# Neutral       6
# Major         6
# Name: count, dtype: int64

## When a Major error is involved, it is because only a few of the annotators labeled that specific portion as a Major error.
## debug_df_raw.loc[debug_df_raw['severity']=='Major']
## for i, row in debug_df_raw.loc[(debug_df_raw['system']=='OPPO.1535') & (debug_df_raw['doc_id']==6) & (debug_df_raw['seg_id']==1352),:].iterrows():
##    print(row['target'], '->', row['severity'])
# Das Virus verursacht bei weniger als <v>1%</v> der Infizierten eine gefährliche Meningitis oder Enzephalitis. -> Minor
# Das Virus verursacht bei weniger als 1<v>%</v> der Infizierten eine gefährliche Meningitis oder Enzephalitis. -> Minor
# Das Virus verursacht bei weniger als 1% der Infizierten eine gefährliche Meningitis oder <v>Enzephalitis</v>. -> Major
# Das Virus verursacht bei weniger als 1% der Infizierten eine gefährliche <v>Meningitis</v> oder Enzephalitis. -> Major
# Das Virus verursacht bei weniger als 1<v>%</v> der Infizierten eine gefährliche Meningitis oder Enzephalitis. -> Minor

In [276]:
labels = pd.read_json("/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_ende.jsonl",lines=True)

In [277]:
labels = pd.merge(labels, labels_char, on=['system', 'doc_id', 'seg_id', 'source', 'target_clean'], how='left', suffixes=['', '_char'])

In [278]:
labels.loc[labels['raw_no_error']!=labels['raw_no_error_char'], :] ## good!

Unnamed: 0,system,doc_id,seg_id,source,target_clean,raw_labels,raw_no_error,num_annotations,labels,no_error,raw_labels_char,raw_no_error_char,labels_char,no_error_char


In [279]:
labels.loc[labels['no_error']!=labels['no_error_char'], :] ## -> drop these rows.

Unnamed: 0,system,doc_id,seg_id,source,target_clean,raw_labels,raw_no_error,num_annotations,labels,no_error,raw_labels_char,raw_no_error_char,labels_char,no_error_char
2,Human-A.0,1,78,Saudi Arabia To Offer Tourist Visas For First ...,Saudi-Arabien stellt erstmals Touristenvisa au...,"[0.0, 0.0, 0.0, 0.0, 0.25, 0.25, 0.25, 0.25, 0...",False,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True
5,Human-A.0,1,123,Patriot missile system to Saudi Arabia after I...,Patriot-Raketensystem an Saudi-Arabien nach Öl...,"[0.0, 0.0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25,...",False,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25,...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
351,Human-A.0,5,209,"The Duchess of York wrote on Twitter: """" I kno...",Die Herzogin von York schrieb auf Twitter: „ I...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
575,Human-A.0,8,32,"In a statement, United Healthcare told CNN it'...",In einer Erklärung teilte United Healthcare CN...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.33333333330000003,...",False,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
632,Human-A.0,8,1330,"""""It is clear [...] that the weapons used were...",„Es ist klar [...] dass die verwendeten Waffen...,"[0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0,...",False,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25,...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
760,Human-A.0,11,668,"Townsend said; """"We will publish a date for ou...",Townsend erklärte: „Wir werden ein Datum für u...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
828,Human-A.0,13,320,"However, as Sky News revealed last month, the ...","Wie jedoch Sky News letzten Monat enthüllte, e...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
1039,Human-B.0,2,820,A prison locksmith has been arrested amid fear...,Ein Gefängnisschlosser wurde nach Befürchtunge...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25,...",False,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
1062,Human-B.0,2,1185,A senior Trump administration official acknowl...,Ein hochrangiger Mitarbeiter der Trump-Regieru...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False
1177,Human-B.0,4,258,"""""My wife writes down the numbers for me durin...",„Meine Frau schreibt mir die Zahlen während de...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False


In [280]:
## drop rows with no_error mismatch
print(f"# rows before drop no_error mismatch: {len(labels)}")
labels = labels.loc[labels['no_error']==labels['no_error_char'], :].copy()
print(f"# rows after drop no_error mismatch: {len(labels)}")

# rows before drop no_error mismatch: 9520
# rows after drop no_error mismatch: 9469


In [281]:
labels.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_ende.jsonl",lines=True,orient="records")

In [282]:
labels_error = labels.loc[labels['no_error']==False,:]

In [283]:
labels_error.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_ende_with_error.jsonl",lines=True,orient="records")

## ZH-EN

In [131]:
data_path = 'data/MT/wmt-mqm-human-evaluation/newstest2020/zhen/mqm_newstest2020_zhen.tsv'
data = pd.read_csv(data_path, sep='\t', quoting=3)

In [132]:
tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")

In [133]:
tokenizer.add_tokens([AddedToken("<v>", lstrip=True), AddedToken("</v>", lstrip=True)])

2

In [134]:
data['target_tokens']=data['target'].apply(lambda x: tokenizer.encode(x,add_special_tokens=False))

In [135]:
data['target_clean']=data['target'].str.replace("<v>","").str.replace("</v>","")

In [136]:
data['target_clean_tokens']=data['target_clean'].apply(lambda x: tokenizer.encode(x,add_special_tokens=False))

In [137]:
start_token_id = tokenizer.vocab['<v>']; print(start_token_id)
end_token_id = tokenizer.vocab['</v>']; print(end_token_id)

250002
250003


In [138]:
def index_wo_error(x, y):
    """
    Return the position of the first occurrence of y in x, or -1 when y is absent.

    @param x: An object exposing ``.index`` (used in this notebook with lists and strings).
    @param y: The element (or substring) to look for.
    @return: ``x.index(y)``, or -1 if ``.index`` raises ValueError.
    """
    try:
        return x.index(y)
    except ValueError:
        # Only "not found" is mapped to -1. Anything else (wrong types,
        # KeyboardInterrupt, ...) propagates instead of being hidden by a bare except.
        return -1
data['label_start']=data['target_tokens'].apply(lambda x: index_wo_error(x, start_token_id))

In [139]:
data['label_end']=data['target_tokens'].apply(lambda x: index_wo_error(x, end_token_id)-1) ## -1 to consider that <v> is added.

In [140]:
def get_span(row):
    """
    Return the tokens of row['target_clean_tokens'] that fall inside the labelled range.

    @param row: Mapping with 'target_clean_tokens', 'label_start' and 'label_end'.
    @return: The slice target_clean_tokens[label_start:label_end].
    """
    clean_tokens = row['target_clean_tokens']
    labelled_range = slice(row['label_start'], row['label_end'])
    return clean_tokens[labelled_range]

data['target_span']=data.apply(get_span, axis=1) ## finds the span quite well.

In [145]:
## erroneous case -> solution: only use rows whose no_error status agrees between the character-level and token-level labels.
for i, row in data.loc[(data['system']=='WeChat_AI.1525') & (data['doc_id']==7) & (data['seg_id']==1189),:].iterrows():
    if row['severity']=='no-error':
        continue
    # if i > 100:
        # break
    print(f"{i}: {row['target_tokens']} => found span: {row['target_span']}")

71674: [8622, 509, 10, 117, 16498, 18770, 53, 49488, 250002, 6, 4, 54433, 450, 250003, 70, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [4, 54433, 450, 70]
71675: [8622, 509, 10, 117, 16498, 18770, 53, 49488, 4, 54433, 450, 250002, 70, 250003, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [70]
71676: [250002, 8622, 509, 250003, 10, 117, 16498, 18770, 53, 49488, 4, 54433, 450, 70, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [8622, 509]
71677: [8622, 509, 10, 117, 16498, 18770, 53, 250002, 49488, 250003, 6, 4, 54433, 450, 70, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [49488]


In [132]:
def get_label(row):
    """
    Build a 0/1 label for every position of row['target_clean_tokens'].

    @param row: Mapping with 'label_start', 'label_end' and 'target_clean_tokens'.
    @return: List with 1 at positions i where label_start <= i < label_end, 0 elsewhere.
    """
    n_tokens = len(row['target_clean_tokens'])
    first, past_last = row['label_start'], row['label_end']
    flags = []
    for position in range(n_tokens):
        inside = first <= position < past_last
        flags.append(1 if inside else 0)
    return flags

data['labels']= data.apply(get_label, axis=1)

In [None]:
# for i, row in data.iterrows():
#     if row['severity']=='no-error':
#         continue
#     if i > 100:
#         break
#     print(f"{i}: {row['target_tokens']} => label: {row['labels']} => applying label: {torch.masked_select(torch.Tensor(row['target_clean_tokens']), torch.Tensor(row['labels']).bool())}")

In [133]:
data['labels']=data['labels'].apply(np.array)

In [156]:
labels = data.groupby(['system', 'doc_id','seg_id','source','target_clean'])['labels'].agg(lambda x: np.mean(x,axis=0)).reset_index()

In [157]:
labels['no_error']=labels['labels'].apply(lambda x: True if np.sum(x) == 0 else False)

In [158]:
num_labelers = data.groupby(['system', 'doc_id','seg_id','source','target_clean'])['labels'].size().reset_index().rename(columns={'labels':'num_annotations'})

In [159]:
labels = pd.merge(labels, num_labelers,how='left')

In [160]:
## drop rows with duplicate labels
print(f"# rows before drop dup: {len(labels)}")
labels = labels.drop_duplicates(subset=['system', 'doc_id','seg_id'],keep=False)
print(f"# rows after drop dup: {len(labels)}")

# rows before drop dup: 33198
# rows after drop dup: 12334


In [161]:
## drop rows with no error
print(f"# rows before drop no error: {len(labels)}")
labels = labels.loc[labels['no_error']==False].copy()
print(f"# rows after drop no error: {len(labels)}")

# rows before drop no error: 12334
# rows after drop no error: 10710


In [162]:
labels=labels.rename(columns={'labels':'raw_labels'})

In [163]:
labels=labels.rename(columns={'no_error':'raw_no_error'})

In [164]:
labels['labels']=labels['raw_labels'].apply(lambda x: np.select(condlist=[x>=0.5, x<0.5], choicelist=[1, 0], 
    default=np.nan))

In [165]:
# for i, row in labels.iterrows():
#     if i > 100:
#         break
#     print(f"{i}: {row['raw_labels']} => label: {row['labels']}")

In [166]:
labels['no_error']=labels['labels'].apply(lambda x: True if np.sum(x) == 0 else False)

In [167]:
labels['no_error'].value_counts()

no_error
True     7244
False    3466
Name: count, dtype: int64

In [168]:
labels.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_zhen.jsonl",lines=True,orient="records")

In [169]:
labels_error = labels.loc[labels['no_error']==False,:]

In [170]:
labels_error.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_zhen_with_error.jsonl",lines=True,orient="records")

### Add labels at character level

In [183]:
data_path = 'data/MT/wmt-mqm-human-evaluation/newstest2020/zhen/mqm_newstest2020_zhen.tsv'
data = pd.read_csv(data_path, sep='\t', quoting=3)

In [184]:
data['target_clean']=data['target'].str.replace("<v>","").str.replace("</v>","")

In [185]:
def index_wo_error(x, y):
    """
    Return the position of the first occurrence of y in x, or -1 when y is absent.

    @param x: An object exposing ``.index`` (used in this notebook with lists and strings).
    @param y: The element (or substring) to look for.
    @return: ``x.index(y)``, or -1 if ``.index`` raises ValueError.
    """
    try:
        return x.index(y)
    except ValueError:
        # Only "not found" is mapped to -1. Anything else (wrong types,
        # KeyboardInterrupt, ...) propagates instead of being hidden by a bare except.
        return -1

In [186]:
# def get_span_char(row):

#     start_index = index_wo_error(row['target'], '<v>')
#     end_index = index_wo_error(row['target'], '</v>')-3
#     return row['target_clean'][start_index:end_index]
    
# data['target_clean']=data['target'].str.replace("<v>","").str.replace("</v>","")
# span_char = data.apply(get_span_char,axis=1)

In [187]:
# for i, row in data.iterrows():
#     if row['severity']=='no-error':
#         continue
#     if i > 100:
#         break
#     print(f"{i}: {row['target']} => found span: {span_char[i]}")

In [188]:
def get_labels_char(row):
    """
    Build a 0/1 label for every character of row['target_clean'].

    @param row: Mapping with 'target' (string that may contain "<v>" / "</v>" markers)
                and 'target_clean' (the same string without the markers).
    @return: List with 1 for characters between the markers, 0 elsewhere.
             When a marker is missing index_wo_error yields -1, which makes the
             labelled range empty.
    """
    first_char = index_wo_error(row['target'], '<v>')
    # The "</v>" position is measured in the marked string; the 3 subtracted is
    # presumably len("<v>"), shifting it into target_clean coordinates.
    past_last_char = index_wo_error(row['target'], '</v>') - 3

    flags = []
    for position in range(len(row['target_clean'])):
        flags.append(1 if first_char <= position < past_last_char else 0)
    return flags

In [189]:
data['labels_char']=data.apply(get_labels_char,axis=1)

In [190]:
data['labels_char']=data['labels_char'].apply(np.array)

In [191]:
labels_char = data.groupby(['system', 'doc_id','seg_id','source','target_clean'])['labels_char'].agg(lambda x: np.mean(x,axis=0)).reset_index()

In [192]:
labels_char['no_error']=labels_char['labels_char'].apply(lambda x: True if np.sum(x) == 0 else False)

In [193]:
## drop rows with duplicate labels
print(f"# rows before drop dup: {len(labels_char)}")
labels_char = labels_char.drop_duplicates(subset=['system', 'doc_id','seg_id'],keep=False)
print(f"# rows after drop dup: {len(labels_char)}")

# rows before drop dup: 33198
# rows after drop dup: 12334


In [194]:
## drop rows with no error
print(f"# rows before drop no error: {len(labels_char)}")
labels_char = labels_char.loc[labels_char['no_error']==False].copy()
print(f"# rows after drop no error: {len(labels_char)}")

# rows before drop no error: 12334
# rows after drop no error: 10712


In [195]:
labels_char=labels_char.rename(columns={'labels_char':'raw_labels_char'})

In [196]:
labels_char=labels_char.rename(columns={'no_error':'raw_no_error'})

In [197]:
labels_char['labels_char']=labels_char['raw_labels_char'].apply(lambda x: np.select(condlist=[x>=0.5, x<0.5], choicelist=[1, 0], 
    default=np.nan))

In [198]:
labels_char['no_error']=labels_char['labels_char'].apply(lambda x: True if np.sum(x) == 0 else False)

In [199]:
labels_char['no_error'].value_counts()

no_error
True     7242
False    3470
Name: count, dtype: int64

In [200]:
## Checked why the no_error statistics based on labels_char and on labels differ -> usually related to cases where a space is labeled.
## Conclusion: our method cannot label punctuation anyway, so it makes sense to evaluate only on rows that contain an error at the token level.
## Conclusion 2: however, the tokenizer sometimes adds a meta character arbitrarily, so some token-level labels contain errors.
##      Therefore, only rows whose token-level and character-level no_error flags agree will be used.

## labels = pd.read_json("/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_zhen.jsonl",lines=True)
## debug_df = pd.merge(labels, labels_char, on=['system', 'doc_id','seg_id','source','target_clean'], how='left')
## debug_df = debug_df.loc[debug_df['no_error_x']!=debug_df['no_error_y'],:].copy()
## debug_df_raw = pd.merge(data, debug_df, on=['system', 'doc_id','seg_id','source','target_clean'],how='inner')
## debug_df_raw['severity'].value_counts()
# severity
# Minor       125
# Major        35
# no-error     18
# Neutral       2
# Name: count, dtype: int64

## When a Major error is involved, it is sometimes due to tokenizer malfunctions (arbitrarily adding a meta character after <v> or </v>, and not adding it when <v> or </v> is absent).
## debug_df_raw.loc[debug_df_raw['severity']=='Major']
## for i, row in debug_df_raw.loc[(debug_df_raw['system']=='WeChat_AI.1525') & (debug_df_raw['doc_id']==7) & (debug_df_raw['seg_id']==1189),:].iterrows():
##    print(row['target'], '->', row['severity'], row['no_error_x'], row['no_error_y'], row['labels'], torch.masked_select(torch.Tensor(row['target_clean_tokens']), torch.Tensor(row['labels']).bool()))
# There was a perfunctory smile<v>, saying that</v> the child had called and hung up. -> Major False True [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0.] tensor([70.])
# There was a perfunctory smile, saying that <v>the</v> child had called and hung up. -> Minor False True [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0.] tensor([70.])
# <v>There was</v> a perfunctory smile, saying that the child had called and hung up. -> Minor False True [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0.] tensor([70.])
# There was a perfunctory <v>smile</v>, saying that the child had called and hung up. -> Minor False True [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0.] tensor([70.])

## erroneous case -> solution: only use rows whose no_error status agrees between the character-level and token-level labels.
## for i, row in data.loc[(data['system']=='WeChat_AI.1525') & (data['doc_id']==7) & (data['seg_id']==1189),:].iterrows():
##     if row['severity']=='no-error':
##         continue
##     # if i > 100:
##         # break
##     print(f"{i}: {row['target_tokens']} => found span: {row['target_span']}")

# 71674: [8622, 509, 10, 117, 16498, 18770, 53, 49488, 250002, 6, 4, 54433, 450, 250003, 70, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [4, 54433, 450, 70]
# 71675: [8622, 509, 10, 117, 16498, 18770, 53, 49488, 4, 54433, 450, 250002, 70, 250003, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [70]
# 71676: [250002, 8622, 509, 250003, 10, 117, 16498, 18770, 53, 49488, 4, 54433, 450, 70, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [8622, 509]
# 71677: [8622, 509, 10, 117, 16498, 18770, 53, 250002, 49488, 250003, 6, 4, 54433, 450, 70, 29041, 1902, 35839, 136, 80756, 1257, 5] => found span: [49488]

In [206]:
labels = pd.read_json("/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_zhen.jsonl",lines=True)

In [202]:
labels = pd.merge(labels, labels_char, on=['system', 'doc_id', 'seg_id', 'source', 'target_clean'], how='left', suffixes=['', '_char'])

In [212]:
labels.loc[labels['raw_no_error']!=labels['raw_no_error_char'], :] ## good!

Unnamed: 0,system,doc_id,seg_id,source,target_clean,raw_labels,raw_no_error,num_annotations,labels,no_error,raw_labels_char,raw_no_error_char,labels_char,no_error_char


In [None]:
labels.loc[labels['no_error']!=labels['no_error_char'], :] ## -> drop these rows.

In [208]:
## drop rows with no_error mismatch
print(f"# rows before drop no_error mismatch: {len(labels)}")
labels = labels.loc[labels['no_error']==labels['no_error_char'], :].copy()
print(f"# rows after drop no_error mismatch: {len(labels)}")

# rows before drop no_error mismatch: 10710
# rows after drop no_error mismatch: 10675


In [209]:
labels.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_zhen.jsonl",lines=True,orient="records")

In [210]:
labels_error = labels.loc[labels['no_error']==False,:]

In [211]:
labels_error.to_json(f"/home/s3/hyeryung/mucoco/new_module/locate/data/mqm_newstest2020_zhen_with_error.jsonl",lines=True,orient="records")