In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer
# Hook tqdm into pandas (enables the progress_* variants of apply & co.).
tqdm.pandas()

# Root folder of the local BERT assets. Keep the trailing slash: the path
# below is built by plain string concatenation.
bert_path = '../data/pretraining_models/bert/'

# Build the tokenizer from the locally stored vocabulary file.
vocab_file = bert_path + 'bert-base-uncased-tokenizer/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(vocab_file)

In [5]:
# Load the training split; inputs and targets are picked by column position.
df_train = pd.read_csv('../data/input/google-quest-challenge/train.csv')

train_columns = df_train.columns
# Positions 1, 2 and 5 are used as model inputs; everything from position 11
# onward is treated as an output column.
input_categories = list(train_columns[[1, 2, 5]])
output_categories = list(train_columns[11:])

print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)


output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer']


In [8]:
# Grab one question body (row label 13) as a sample for the tokenizer demo.
s = df_train.loc[13, 'question_body']
s

"I'm reading this description on sensor size:\n\n\n  Digital compact cameras have substantially smaller sensors offering a\n  similar number of pixels. As a consequence, the pixels are much\n  smaller, which is a key reason for the image quality difference,\n  especially in terms of noise and dynamic range.\n\n\nCould you please elaborate on the last sentence: what's the relation between sensor size and image quality? In particular, what are the advantages and disadvantages of a small sensor (of a compact camera, in contrast to a large sensor of a DSLR camera)?\n"

In [26]:
# Encode the sample as a single sequence with max_length=512 and inspect the
# length of each field the tokenizer returns.
res_dict = tokenizer.encode_plus(
    text=s,
    add_special_tokens=True,
    max_length=512,
    truncation_strategy='longest_first',
)
input_ids, token_type_ids, attention_mask = (
    res_dict[key] for key in ('input_ids', 'token_type_ids', 'attention_mask')
)
print(f'input_ids len: {len(input_ids)}\ntoken_type_ids len: {len(token_type_ids)}\nattention_mask len: {len(attention_mask)}')


input_ids len: 110
token_type_ids len: 110
attention_mask len: 110


In [30]:
def get_ids(sentence1, sentence2, truncation_strategy, max_length):
    """Encode a sentence (or sentence pair) and right-pad it to ``max_length``.

    Uses the module-level ``tokenizer``.

    Args:
        sentence1: First text, passed to ``encode_plus`` as ``text``.
        sentence2: Optional second text, passed as ``text_pair`` (may be None).
        truncation_strategy: Forwarded unchanged to ``encode_plus``.
        max_length: Forwarded to ``encode_plus`` and used as the padded length.

    Returns:
        ``[input_ids, input_masks, input_segments]``. Ids are padded with
        ``tokenizer.pad_token_id``; masks and segments are padded with 0.
        If the encoding is already longer than ``max_length`` nothing is
        appended (a negative repeat count yields an empty list).
    """
    encoded = tokenizer.encode_plus(
        text=sentence1,
        text_pair=sentence2,
        add_special_tokens=True,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
    )

    n_pad = max_length - len(encoded["input_ids"])

    # (field returned by the tokenizer, value used to fill the padding),
    # listed in the order the caller receives them.
    field_fill = (
        ("input_ids", tokenizer.pad_token_id),
        ("attention_mask", 0),
        ("token_type_ids", 0),
    )

    padded = []
    for field, fill in field_fill:
        padded.append(encoded[field] + [fill] * n_pad)
    return padded

# get_ids(s, None, 'longest_first', 512)

In [None]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for transformer (including bert)"""
    
    def return_id(str1, str2, truncation_strategy, length):

        inputs = tokenizer.encode_plus(str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation_strategy=truncation_strategy)
        
        input_ids =  inputs["input_ids"]
        input_masks = [1] * len(input_ids)
        input_segments = inputs["token_type_ids"]
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        return [input_ids, input_masks, input_segments]
    
    input_ids_q, input_masks_q, input_segments_q = return_id(
        title + ' ' + question, None, 'longest_first', max_sequence_length)
    
    input_ids_a, input_masks_a, input_segments_a = return_id(
        answer, None, 'longest_first', max_sequence_length)
    
    return [input_ids_q, input_masks_q, input_segments_q,
            input_ids_a, input_masks_a, input_segments_a]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Encode every row of ``df`` into six int32 input arrays.

    Args:
        df: DataFrame holding the text columns.
        columns: Column selection applied to ``df`` before iterating. It must
            include ``question_title``, ``question_body`` and ``answer``,
            because each row is read through those attribute names.
        tokenizer: Passed through to ``_convert_to_transformer_inputs``.
        max_sequence_length: Passed through to
            ``_convert_to_transformer_inputs``.

    Returns:
        A list of six ``np.int32`` arrays, one entry per row in each:
        ``[ids_q, masks_q, segments_q, ids_a, masks_a, segments_a]``.
    """
    # One accumulator per output stream, kept in return order.
    streams = [[] for _ in range(6)]

    rows = df[columns]
    # iterrows() is a generator with no length, so give tqdm the row count
    # explicitly; otherwise the bar cannot show a total or an ETA.
    for _, instance in tqdm(rows.iterrows(), total=len(rows)):
        encoded = _convert_to_transformer_inputs(
            instance.question_title,
            instance.question_body,
            instance.answer,
            tokenizer,
            max_sequence_length,
        )
        for stream, values in zip(streams, encoded):
            stream.append(values)

    return [np.asarray(stream, dtype=np.int32) for stream in streams]

def compute_output_arrays(df, columns):
    """Return the ``columns`` selection of ``df`` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)