In [1]:
import pandas as pd
from transformers import AutoTokenizer
import re
from random import randint
import json
import os
from sklearn.model_selection import train_test_split
from nltk import sent_tokenize
from collections import Counter
# import nltk
# nltk.download('punkt_tab')

# Length limitations
max_input_len = 300  # token budget handed to preprocess_df as max_len
data_size = None     # optional cap on the number of training rows; falsy keeps all rows

INPUT = '../data/finecite/'
# Size-limited runs are written to their own sub-directory of the data folder.
OUTPUT = f'../data/finecite/{data_size}/' if data_size else '../data/finecite/'
SEED = 82

# Token counts do not differ hugely between tokenizers, so Mistral's tokenizer
# is used for the length evaluation.
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.3')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw annotation export.
full_raw = pd.read_csv(INPUT + "full_raw.csv")

# Keep only the columns used downstream and give them short names.
# rename() returns a new frame, so full_raw itself is left untouched.
full_df = full_raw[['paragraph', 'target_reference_location', 'context_location1']].rename(
    columns={
        'paragraph': 'text',
        'target_reference_location': 'target_loc',
        'context_location1': 'context',
    }
)

# Split data into train / test. The split is driven by the notebook-wide SEED
# so that changing SEED in the config cell changes every random step together.
train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=SEED)

In [3]:
#helper 
def preprocess_df(df, max_len, tokenize=None):
    """Turn raw paragraphs into word lists with aligned labels, trimmed to a token budget.

    For every row the ';'-separated ``text`` is split into a word list
    (semicolons inside ``<ref>`` elements are first turned into commas so they
    do not split a reference), the entry at ``target_loc`` is replaced by
    ``'#TARGET_REF'`` and every other ``<ref>`` element by ``'#REF'``.
    If the space-joined word list tokenizes to more than ``max_len`` tokens,
    words are dropped from either the start or the end of the paragraph,
    whichever side loses fewer annotated (non-zero label) tokens.

    Args:
        df: DataFrame with the columns ``'text'``, ``'target_loc'`` and
            ``'context'``; ``'context'`` holds the string form of a per-word
            label list.
        max_len: maximum number of tokens allowed per example.
        tokenize: optional callable mapping a string to a list of tokens.
            Defaults to ``tokenizer.tokenize`` of the notebook-level tokenizer.

    Returns:
        DataFrame with the columns ``'text'`` (list of words) and ``'context'``
        (list of labels), one row per input row.

    Raises:
        AssertionError: if the label list and the word list differ in length.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    ref_pattern = re.compile(r'<ref.*?>.*?</ref>')
    records = []
    for _, row in df.iterrows():

        # prepare text
        text = ref_pattern.sub(lambda m: m.group().replace(';', ','), row['text'])  # clean all ';' from references
        text = [word.strip() for word in text.split(';')]  # create word list of the paragraph
        text[row['target_loc']] = '#TARGET_REF'  # replace targeted reference
        text = ['#REF' if ref_pattern.search(word) else word for word in text]  # replace all other references

        # Process the context labels.
        # NOTE(review): eval() executes arbitrary code from the CSV; if the column only
        # ever contains plain list literals, ast.literal_eval would be the safe choice.
        context = eval(row["context"])

        # check whether context label and word list are same length
        assert len(context) == len(text), f'The labels are of length {len(context)}, while the word list is of length {len(text)}'

        # adjust context length to <= max_len
        if len(tokenize(' '.join(text))) > max_len:
            labels = []   # one label per token
            word_id = []  # index of the word each token belongs to

            # create word to token mapping
            for word_idx, (label, word) in enumerate(zip(context, text)):
                n_tokens = len(tokenize(word))
                labels.extend([label] * n_tokens)
                word_id.extend([word_idx] * n_tokens)

            # Word-by-word tokenization can yield fewer tokens than tokenizing the joined
            # text; without any excess there is nothing sensible to cut, so keep the row.
            excess_len = len(labels) - max_len
            if excess_len > 0:
                # reduce size to max_len by deleting as little annotated text as possible
                zero_count_start = labels[:excess_len].count(0)  # unannotated tokens lost when cutting the start
                zero_count_end = labels[max_len:].count(0)  # unannotated tokens lost when cutting the end
                if zero_count_end > zero_count_start:
                    end_id = word_id[max_len]
                    # NOTE(review): this drops one word more than strictly required,
                    # presumably as a safety margin - confirm. Clamped at 0 so the
                    # bound can never turn into a negative (from-the-end) slice index.
                    keep = max(end_id - 1, 0)
                    text, context = text[:keep], context[:keep]
                else:
                    begin_id = word_id[excess_len - 1]
                    text, context = text[begin_id + 1:], context[begin_id + 1:]

        # add text, context to result
        records.append({'text': text, 'context': context})

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=['text', 'context'])
    
def label_mapping(label):
    """Translate between integer label ids and label names.

    A plain ``int`` (1, 2 or 3) is mapped to its name; any other value is
    compared against the names and mapped back to the matching id.
    Values without a counterpart yield ``None``.
    """
    names_by_id = {1: 'INFORMATION', 2: 'PERCEPTION', 3: 'BACKGROUND'}

    # exact type check on purpose: only plain ints take the id -> name direction
    if type(label) is int:
        return names_by_id.get(label)

    for label_id, name in names_by_id.items():
        if label == name:
            return label_id
    return None
    
def return_xml_string(text, context):
    """Render a labelled word list as one string with XML-style span tags.

    Walks over the (label, word) pairs and, whenever the label changes, closes
    the tag of the previous non-zero label and opens a tag for the new non-zero
    label. Tag names come from ``label_mapping``. Words and tags are joined
    with single spaces.
    """
    def open_tag(lab):
        return f'<{label_mapping(lab)}>'

    def close_tag(lab):
        return f'</{label_mapping(lab)}>'

    pieces = []
    active = 0  # label of the previously visited word; 0 means no open tag

    for label, word in zip(context, text):
        if label != active:
            if active != 0:
                pieces.append(close_tag(active))
            if label != 0:
                pieces.append(open_tag(label))
        pieces.append(word)
        active = label

    # a span that runs to the end of the text still needs its closing tag
    if active != 0:
        pieces.append(close_tag(active))

    return ' '.join(pieces)

def return_json1_object(text, context):
    """Collect the labelled spans of a word list, grouped by label name.

    Consecutive words sharing a non-zero label form one span; each span is
    stored as a space-joined string in the list kept under the name that
    ``label_mapping`` returns for its label. Words labelled 0 are skipped.
    """
    spans = {name: [] for name in ("INFORMATION", "PERCEPTION", "BACKGROUND")}
    buffer = []  # words of the span currently being built
    last = 0     # label of the previously visited word

    def flush(lab):
        spans[label_mapping(lab)].append(' '.join(buffer))
        buffer.clear()

    # map labeled sequence to context type
    for label, word in zip(context, text):
        span_ended = label != last and last != 0
        if span_ended:
            flush(last)
        if label != 0:
            buffer.append(word)
        last = label

    # a span reaching the end of the text has not been stored yet
    if buffer:
        flush(last)

    return spans

def return_majority_token(sent_text, context):
    """Return the most frequent label of each sentence.

    ``sent_text`` is a list of sentences (each a list of words) and ``context``
    the flat label sequence covering them in order. The labels are consumed
    sentence by sentence and the most common one per sentence is collected.
    """
    majorities = []
    start = 0
    for sent in sent_text:
        stop = start + len(sent)
        window = context[start:stop]  # labels belonging to this sentence
        majorities.append(Counter(window).most_common(1)[0][0])
        start = stop
    return majorities

# def return_json2_object(context):
#     # create sentence majority label
#     res_output = {
#         "INFORMATION": [],
#         "PERCEPTION": [],
#         "BACKGROUND": [],
#     }
    
#     for idx, label in enumerate(context):
#         if label != 0:
#             res_output[label_mapping(label)].append(f'sent{idx}')
    
#     return res_output

In [4]:
#create different task schemata for acl-arc
'''data schema
    [{
        "gold": {
            "text": [<str>],
            "context": [<int>]
        },
        "input": <str>,
        "output": <str>
    }]
'''

def create_xml_data(df):
    """Build the XML-tagging examples for every row of a preprocessed frame.

    Each example holds the gold word list and labels, the space-joined words
    as ``input`` and the tagged string from ``return_xml_string`` as ``output``.
    """
    def build_example(words, labels):
        return {
            "gold": {
                "text": words,
                "context": labels
            },
            "input": ' '.join(words),
            # set XML tags
            "output": return_xml_string(words, labels)
        }

    return [build_example(row['text'], row['context']) for _, row in df.iterrows()]

def create_json_data(df):
    """Build the JSON-extraction examples for every row of a preprocessed frame.

    Each example holds the gold word list and labels, the space-joined words
    as ``input`` and the JSON-serialised result of ``return_json1_object``
    as ``output``.
    """
    def build_example(words, labels):
        joined = ' '.join(words)
        # create json object
        grouped = return_json1_object(words, labels)
        return {
            "gold": {
                "text": words,
                "context": labels
            },
            "input": joined,
            "output": json.dumps(grouped)
        }

    return [build_example(row['text'], row['context']) for _, row in df.iterrows()]

# def create_json_2_data(df):
#     res_data = []
#     for idx, row in df.iterrows():
        
#         #split text to sentences
#         text = [sent.split(' ') for sent in sent_tokenize(' '.join(row['text']))]
#         if len(row['text']) != sum([len(sent) for sent in text]):
#             #print(f"the length of text is {len(row['text'])}, while the sum of len of sent is {sum([len(sent) for sent in sents])}")
#             continue
#         input = [f"sent{idx}: {' '.join(sent)}\n" for idx, sent in enumerate(text)]
        
#         #find sentence majority token
#         context = return_majority_token(text, row['context'])
#         output = return_json2_object(context)
        
#         # add example to response
#         res_data.append({
#             "gold": {
#                 "text": text,
#                 "context": context
#             },
#             "input": ' '.join(input),
#             "output": json.dumps(output)
#         })
#     return res_data

In [5]:
# Prepare data: clean both splits and trim them to the token budget.
test_df_clean = preprocess_df(test_df, max_input_len)
train_df_clean = preprocess_df(train_df, max_input_len)

# Create train data with size data_size.
# The subsample is drawn with the notebook-wide SEED so that a re-run
# (Restart & Run All) produces the same training subset.
if data_size and data_size < len(train_df_clean):
    train_df_clean = train_df_clean.sample(n=data_size, random_state=SEED).reset_index(drop=True)

# Create XML data
test_xml = create_xml_data(test_df_clean)
train_xml = create_xml_data(train_df_clean)
XML_OUTPUT = OUTPUT + 'XML/'
os.makedirs(XML_OUTPUT, exist_ok=True)
with open(XML_OUTPUT + 'train.json', 'w', encoding='utf-8') as f:
    json.dump(train_xml, f, ensure_ascii=False, indent=4)
with open(XML_OUTPUT + 'test.json', 'w', encoding='utf-8') as f:
    json.dump(test_xml, f, ensure_ascii=False, indent=4)


# Create JSON 1 data
test_json1 = create_json_data(test_df_clean)
train_json1 = create_json_data(train_df_clean)
JSON1_OUTPUT = OUTPUT + 'JSON/'
os.makedirs(JSON1_OUTPUT, exist_ok=True)
# JSON1_OUTPUT already ends with '/', so the file name is appended without another slash.
with open(JSON1_OUTPUT + 'train.json', 'w', encoding='utf-8') as f:
    json.dump(train_json1, f, ensure_ascii=False, indent=4)
with open(JSON1_OUTPUT + 'test.json', 'w', encoding='utf-8') as f:
    json.dump(test_json1, f, ensure_ascii=False, indent=4)

# # create JSON 2 data
# test_json2 = create_json_2_data(test_df_clean)
# train_json2 = create_json_2_data(train_df_clean)
# JSON2_OUTPUT = OUTPUT + 'JSON2/'
# os.makedirs(JSON2_OUTPUT, exist_ok=True)
# with open(JSON2_OUTPUT + 'train.json', 'w', encoding='utf-8') as f:
#     json.dump(train_json2, f, ensure_ascii=False, indent=4)
# with open(JSON2_OUTPUT + 'test.json', 'w', encoding='utf-8') as f:
#     json.dump(test_json2, f, ensure_ascii=False, indent=4)