In [2]:
import os
os.chdir('/home/s3/hyeryung/mucoco')

In [12]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

from new_module.locate import evaluate_locate as eval_utils

In [23]:
primary_tokenizer = AutoTokenizer.from_pretrained("/shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/step_2800_best_checkpoint")
em_tokenizer = AutoTokenizer.from_pretrained('/shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-energy-training/step_600_best_checkpoint/')

In [24]:
def count_pad_token(x):
    return np.sum(np.array(x)==primary_tokenizer.pad_token_id)

def remove_label_for_pad_token(row, colname):
    return list(np.array(row[colname])[np.array(row['tokens'])!=primary_tokenizer.pad_token_id])

def remove_pad_token(x):
    return list(np.array(x)[np.array(x)!=primary_tokenizer.pad_token_id])

In [25]:
## load labels data
label_path = "new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl" #"new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl"
labels = pd.read_json(label_path, lines=True)
labels['prompt']=labels['prompt'].apply(lambda x: x['text'])

labels = labels.explode('generations')

labels['text']=labels['generations'].apply(lambda x: x['text'])
labels['tokens']=labels['generations'].apply(lambda x: x['tokens'])

labels['locate_labels']=labels['generations'].apply(lambda x: x.get('locate_labels', np.nan))

del labels['generations']

labels = labels.rename(columns={'locate_labels':'labels'})
labels = labels.dropna(subset='labels')

## correct minor errors -> remove trailing pad_token in the generations.
has_pad_token = labels['tokens'].apply(count_pad_token) > 0
labels.loc[has_pad_token, 'labels'] = labels.loc[has_pad_token,:].apply(lambda x: remove_label_for_pad_token(x, 'labels'),axis=1).values
labels.loc[has_pad_token, 'tokens'] = labels.loc[has_pad_token, 'tokens'].apply(remove_pad_token)

## similarly, clean text column -> remove "<|endoftext|>" text
labels['text']=labels['text'].str.replace("<|endoftext|>","")

In [33]:
labels.shape

(115, 5)

In [27]:

def get_labels_binary_word(row, dataset_type="gpt2"):

    if dataset_type == "gpt2":
        labels_token_index = np.where(np.array(row['labels_binary'])==1)[0]
        labels_word_index = list(set([row['tok2word'][id] for id in labels_token_index]))
    elif dataset_type == "tsd":
        labels_char_index = np.array(row['labels_index_char'])
        labels_word_index = list(set([row['char2word'][id] for id in labels_char_index]))

    return [1 if id in labels_word_index else 0 for id in range(len(row['words']))]

def get_labels_binary_char(row, dataset_type="gpt2"):

    if dataset_type == "gpt2":
        labels_token_index = np.where(np.array(row['labels_binary'])==1)[0]
        labels_char_index = list(set(sum([list(row['tok2char'][id]) for id in labels_token_index],[])))
        return [1 if id in labels_char_index else 0 for id in range(len(row['char']))]
    elif dataset_type == "tsd": ## for tsd, this function is not needed.
        raise NotImplementedError
    

In [30]:

def get_tok2char(row: pd.Series, dataset_type:str="gpt2") -> dict:
    """
    A function to convert a list of tokens into a mapping between each token's index and its corresponding character offsets.
    @param row: A row from dataframe
    @return tok2char: A dictionary with token's location index as keys and tuples of corresponding character offsets as values.

    Example:
    row=pd.Series()
    row['text']='wearing games and holy ****ing shit do I hate horse wearing games .'
    row['tokens']=[86, 6648, 1830, 290, 11386, 25998, 278, 7510, 466, 314, 5465, 8223, 5762, 1830, 764]
    tok2char=get_tok2char(row, "tsd")
    tok2char
    {0: (0,),
    1: (1, 2, 3, 4, 5, 6),
    2: (7, 8, 9, 10, 11, 12),
    3: (13, 14, 15, 16),
    ...
    13: (59, 60, 61, 62, 63, 64),
    14: (65,66)}
    """
    # if dataset_type == "gpt2":        
    #     tok2char=dict()
    #     token_offsets=[0]
    #     for i in range(1,len(row['tokens'])+1):
    #         decoded=tokenizer.decode(row['tokens'][:i])
    #         token_offsets.append(len(decoded))
    #         tok2char[i-1]=tuple(range(token_offsets[i-1],token_offsets[i]))
    #     return tok2char
    
    # elif dataset_type == "tsd":
    tok2char=dict()
    token_offsets=[0]
    j = 0
    for i in range(1,len(row['tokens'])+1):
        while True:
            if tokenizer.decode(tokenizer.encode(row['text'][:j],add_special_tokens=False)) != tokenizer.decode(row['tokens'][:i]):
                if tokenizer.decode(row['tokens'][:i])[-1]=='�':#handle a case where a character is split into multiple tokens
                    break
                j+=1
            else:
                token_offsets.append(j)
                tok2char[i-1]=tuple(range(token_offsets[-2],token_offsets[-1]))
                tmp_id = i-2
                while (tmp_id >= 0 and tmp_id not in tok2char):
                    tok2char[tmp_id]=tuple(range(token_offsets[-2],token_offsets[-1]))
                    tmp_id-=1
                j+=1
                break
    return tok2char

def get_word2char(row: pd.Series, ws: str) -> dict:
    """
    A function to convert a list of words into a mapping between each word's index and its corresponding character offsets.
    @param row: A row from dataframe
    @return word2char: A dictionary with word's location index as keys and tuples of corresponding character offsets as values.

    Caveat:
    This code assumes that words are separated by only one type of whitespace, e.g. space.

    Example:
    row=pd.Series()
    row['words']=['wearing', 'games', 'and', 'holy', '****ing', 'shit', 'do', 'I', 'hate', 'horse', 'wearing', 'games.']
    word2char=get_word2char(row)
    word2char
    {0: (0, 1, 2, 3, 4, 5, 6),
    1: (7, 8, 9, 10, 11, 12),...
    9: (45, 46, 47, 48, 49, 50),
    10: (51, 52, 53, 54, 55, 56, 57, 58),
    11: (59, 60, 61, 62, 63, 64, 65)}
    """
    
    word_offsets=[0]
    word2char=dict()
    for i in range(1,len(row['words'])+1):
        decoded=ws.join(row['words'][:i])
        word_offsets.append(len(decoded))
        word2char[i-1]=tuple(range(word_offsets[i-1],word_offsets[i]))
    return word2char

## group token indices that belong to the same word

def get_word2tok(row: pd.Series, ws: str=None) -> dict:
    """
    A function that take a list of words and a corresponding list of tokens 
    into a mapping between each word's index and its corresponding token indexes.
    @param row: A row from dataframe
    @return word2char: A dictionary with word's location index as keys and tuples of corresponding token location indexes as values.

    Example:
    row=pd.Series()
    row['words']=['wearing', 'games', 'and', 'holy', '****ing', 'shit', 'do', 'I', 'hate', 'horse', 'wearing', 'games.']
    row['tokens']=[86, 6648, 1830, 290, 11386, 25998, 278, 7510, 466, 314, 5465, 8223, 5762, 1830, 13]
    word2tok=get_word2tok(row)
    word2tok
    {0: [0, 1],
    1: [2],
    2: [3],
    ...
    10: [12],
    11: [13, 14]}
    """
    global tokenizer
    
    jl, jr, k = 0, 0, 0
    grouped_tokens = []
    if ws is not None:
        while jr <= len(row['tokens'])+1 and k < len(row['words']):
            # print(f"{jl}, {jr}, {k}: {tokenizer.decode(row['tokens'][jl:jr]).strip(' ')}")
            if tokenizer.decode(row['tokens'][jl:jr]).strip(' ') == row['words'][k]:
                grouped_tokens.append(list(range(jl,jr)))
                k += 1
                jl = jr
                jr += 1
            else:
                jr += 1
        word2tok = dict(zip(range(len(grouped_tokens)), grouped_tokens))
    else:
        while jr <= len(row['tokens'])+1 and k < len(row['words']):
            # print(f"{jl}, {jr}, {k}: {tokenizer.decode(row['tokens'][jl:jr]).strip()}")
            if tokenizer.decode(row['tokens'][jl:jr]).strip() == row['words'][k]:
                grouped_tokens.append(list(range(jl,jr)))
                k += 1
                jl = jr
                jr += 1
            else:
                jr += 1
        word2tok = dict(zip(range(len(grouped_tokens)), grouped_tokens))
    return word2tok


In [32]:
# ### Token ↔︎ Word ↔︎ Char 이 가능한 mapping 정의
sample_text = labels[['text','tokens']].copy()
sample_text['char']=sample_text['text'].apply(list)
sample_text['char_index']=sample_text['char'].apply(lambda x: list(range(len(x))))
assert (sample_text['char'].apply(len) != sample_text['char_index'].apply(len)).sum() == 0

sample_text['tokens_index']=sample_text['tokens'].apply(lambda x: list(range(len(x))))

# if args.dataset_type == "gpt2":
#     sample_text['words']=sample_text['text'].str.split()
# elif args.dataset_type == "tsd":
#     sample_text['words']=sample_text['text'].str.split(' ')
sample_text['words']=sample_text['text'].str.split(' ')
sample_text['words_index']=sample_text['words'].apply(lambda x: list(range(len(x))))

sample_text['tok2char']=sample_text.apply(lambda x: get_tok2char(x, "gpt2"),axis=1)
sample_text['word2char']=sample_text.apply(lambda x: get_word2char(x, " "),axis=1)
# if args.dataset_type == "gpt2":
#     sample_text['word2tok']=sample_text.apply(lambda x: get_word2tok(x),axis=1)
# elif args.dataset_type == "tsd":
#     sample_text['word2tok']=sample_text.apply(lambda x: get_word2tok(x, " "),axis=1)
sample_text['word2tok']=sample_text.apply(lambda x: get_word2tok(x, " "),axis=1)
sample_text['tok2word']=sample_text['word2tok'].apply(kv_swap)
sample_text['char2tok']=sample_text['tok2char'].apply(kv_swap)
sample_text['char2word']=sample_text['word2char'].apply(kv_swap)


KeyboardInterrupt: 

In [29]:
labels['labels_binary'] = labels['labels'].apply(lambda x: [1 if i >= 0.5 else 0 for i in x])
labels['labels_binary_word']=labels.apply(lambda x: get_labels_binary_word(x, dataset_type="gpt2"),axis=1)
labels['labels_binary_char']=labels.apply(get_labels_binary_char,axis=1)

KeyError: 'tok2word'