In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
import os
from scipy.stats import spearmanr
from math import floor, ceil
from transformers import *

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.1.0


In [2]:
# Show which dataset folders are available under ../input (tokenizer vocabularies,
# model configs/weights and the competition data are all read from there below).
os.listdir('../input')

['blarge-token',
 'pretrained-models-1',
 'bert-large-config',
 'bert-base-uncased-huggingface-transformer',
 'google-quest-challenge',
 'blarge']

#### 1. Read data and tokenizer

Read the tokenizers and the data, and define the maximum sequence length used for the BERT inputs (the maximum is usually 512 tokens).

In [3]:
# Locations of the competition data and of the BERT-base vocabulary/weights.
PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-base-uncased-huggingface-transformer/'
# tokenizer0: BERT-base vocabulary, used later to build `test_inputs0` (512-token inputs).
tokenizer0 = BertTokenizer.from_pretrained(BERT_PATH+'bert-base-uncased-vocab.txt')
# tokenizer: BERT-large vocabulary, used later to build `test_inputs` (MAX_SEQUENCE_LENGTH tokens).
tokenizer = BertTokenizer.from_pretrained('../input/blarge-token/bert_large_vocab/vocab.txt')


# Sequence length of the inputs built for create_model() (the BERT-large variant).
MAX_SEQUENCE_LENGTH = 384

df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_sub = pd.read_csv(PATH+'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

# Columns are selected by position: everything from index 11 onwards is a target,
# and positions 1, 2 and 5 are question_title, question_body and answer
# (see the printed lists below this cell).
output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1,2,5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

train shape = (6079, 41)
test shape = (476, 11)

output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer']


#### 2. Preprocessing functions

These functions preprocess the raw text data into usable BERT inputs.<br>

*update 4:* credits to [Minh](https://www.kaggle.com/dathudeptrai) for this implementation. If I'm not mistaken, it could be used directly with other Huggingface transformers too! Note that due to the 2 x 512 input *(update 5: 2 x 384)*, it will require significantly more memory when finetuning BERT.

In [4]:
def _convert_to_transformer_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Converts tokenized input to ids, masks and segments for transformer (including bert)"""
    
    def return_id(str1, str2, truncation_strategy, length):

        inputs = tokenizer.encode_plus(str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation_strategy=truncation_strategy)
        
        input_ids =  inputs["input_ids"]
        input_masks = [1] * len(input_ids)
        input_segments = inputs["token_type_ids"]
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        return [input_ids, input_masks, input_segments]
    
    input_ids_q, input_masks_q, input_segments_q = return_id(
        title + ' ' + question, None, 'longest_first', max_sequence_length)
    
    input_ids_a, input_masks_a, input_segments_a = return_id(
        answer, None, 'longest_first', max_sequence_length)
    
    return [input_ids_q, input_masks_q, input_segments_q,
            input_ids_a, input_masks_a, input_segments_a]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Tokenize every row of `df` into the six model input arrays.

    Args:
        df: DataFrame holding the text columns.
        columns: columns to select before iterating. The loop reads
            `question_title`, `question_body` and `answer` by name, so these
            three must be included.
        tokenizer: tokenizer forwarded to `_convert_to_transformer_inputs`.
        max_sequence_length: target length forwarded to
            `_convert_to_transformer_inputs`.

    Returns:
        A list of six int32 NumPy arrays (one row per DataFrame row), ordered as
        question ids, question masks, question segments,
        answer ids, answer masks, answer segments.
    """
    # One accumulator per feature, in the order produced by
    # _convert_to_transformer_inputs.
    features = [[] for _ in range(6)]

    # iterrows() is a generator without a length, so pass the row count
    # explicitly; otherwise the progress bar cannot show progress or an ETA.
    for _, instance in tqdm(df[columns].iterrows(), total=len(df)):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        encoded = _convert_to_transformer_inputs(t, q, a, tokenizer, max_sequence_length)
        for bucket, values in zip(features, encoded):
            bucket.append(values)

    return [np.asarray(bucket, dtype=np.int32) for bucket in features]

def compute_output_arrays(df, columns):
    """Return the `columns` selection of `df` converted to a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

#### 3. Create model

`compute_spearmanr_ignore_nan()` is used to compute the competition metric for the validation set
<br><br>
`create_model()` contains the actual architecture that will be used to finetune BERT to our dataset.


In [5]:
#config = BertConfig.from_pretrained('../input/bert-large-config/config.json')
#bert_model = TFBertModel.from_pretrained('../input/bert-large-config/bert_model.h5',config = config)

In [6]:
def compute_spearmanr_ignore_nan(trues, preds):
    """Mean column-wise Spearman correlation between `trues` and `preds`.

    Both inputs are transposed so that each target column is compared with the
    matching prediction column. Columns whose correlation is NaN are left out
    of the mean (`np.nanmean`).
    """
    column_pairs = zip(np.transpose(trues), np.transpose(preds))
    per_column_rho = [
        spearmanr(true_col, pred_col).correlation
        for true_col, pred_col in column_pairs
    ]
    return np.nanmean(per_column_rho)

def create_model():
    """Build the two-tower BERT regression model loaded from the bert-large-config dataset.

    Six int32 inputs of length MAX_SEQUENCE_LENGTH are created (ids, mask and
    token-type ids for the question and for the answer). The same `bert_model`
    instance encodes both texts, its first output is average-pooled over the
    sequence axis, the two pooled vectors are concatenated, and a 30-unit
    sigmoid layer produces one value per output category.

    Returns:
        A `tf.keras.models.Model` whose inputs are ordered
        [q_id, q_mask, q_atn, a_id, a_mask, a_atn] -- the same order in which
        `compute_input_arrays` returns its arrays.
    """
    # Token ids for question and answer.
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    
    # Attention masks (passed to BERT as `attention_mask`).
    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    
    # Despite the `_atn` suffix, these are passed to BERT as `token_type_ids`.
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    
    config = BertConfig.from_pretrained('../input/bert-large-config/config.json')
    bert_model = TFBertModel.from_pretrained('../input/bert-large-config/bert_model.h5',config = config)
    # caution: when using e.g. XLNet, XLNetConfig() will automatically use xlnet-large config
    
    # normally ".from_pretrained('bert-base-uncased')", but because of no internet, the 
    # pretrained model has been downloaded manually and uploaded to kaggle. 
#   bert_model = TFBertModel.from_pretrained(
#       BERT_PATH+'bert-base-uncased-tf_model.h5', config=config)
    
    # if config.output_hidden_states = True, obtain hidden states via bert_model(...)[-1]
    # The single bert_model instance is called twice, so question and answer share weights.
    q_embedding = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
    a_embedding = bert_model(a_id, attention_mask=a_mask, token_type_ids=a_atn)[0]
    # Identity activation declared with dtype='float16'.
    # NOTE(review): presumably a memory-saving cast of the embeddings -- confirm.
    q_embedding = tf.keras.layers.Activation('linear', dtype='float16')(q_embedding)
    a_embedding = tf.keras.layers.Activation('linear', dtype='float16')(a_embedding)

    # NOTE(review): no mask is passed to the pooling layers, so padded positions
    # presumably contribute to the average -- confirm this is intended.
    q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    a = tf.keras.layers.GlobalAveragePooling1D()(a_embedding)
    
    x = tf.keras.layers.Concatenate()([q, a])
    x = tf.keras.layers.Dropout(0.2)(x)
    # 30 units: one sigmoid output per entry of `output_categories`.
    x = tf.keras.layers.Dense(30, activation='sigmoid')(x)
   #x = tf.keras.layers.Activation('linear', dtype='float32')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn,], outputs=x)
    
    return model

In [7]:
# Sequence length of the Input layers built by create_model0() (the BERT-base variant).
MAX_SEQUENCE_LENGTH0 = 512

In [8]:
def create_model0():
    """Build the two-tower BERT regression model from the bert-base-uncased weights.

    Same architecture as `create_model()`, but with inputs of length
    MAX_SEQUENCE_LENGTH0, a default-constructed `BertConfig()` and weights read
    from `BERT_PATH`.

    Returns:
        A `tf.keras.models.Model` whose inputs are ordered
        [q_id, q_mask, q_atn, a_id, a_mask, a_atn].
    """
    # Token ids for question and answer.
    q_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH0,), dtype=tf.int32)
    a_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH0,), dtype=tf.int32)
    
    # Attention masks (passed to BERT as `attention_mask`).
    q_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH0,), dtype=tf.int32)
    a_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH0,), dtype=tf.int32)
    
    # Despite the `_atn` suffix, these are passed to BERT as `token_type_ids`.
    q_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH0,), dtype=tf.int32)
    a_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH0,), dtype=tf.int32)
    
    #config = BertConfig.from_pretrained('../input/bert-large-config/config.json')
    #bert_model = TFBertModel.from_pretrained('../input/bert-large-config/bert_model.h5',config = config)
    # caution: when using e.g. XLNet, XLNetConfig() will automatically use xlnet-large config
    
    # normally ".from_pretrained('bert-base-uncased')", but because of no internet, the 
    # pretrained model has been downloaded manually and uploaded to kaggle. 
    # NOTE(review): relies on BertConfig() defaults matching the bert-base-uncased
    # checkpoint -- confirm against the installed transformers version.
    config0 = BertConfig()
    bert_model = TFBertModel.from_pretrained(
        BERT_PATH+'bert-base-uncased-tf_model.h5', config=config0)
    
    # if config.output_hidden_states = True, obtain hidden states via bert_model(...)[-1]
    # The single bert_model instance is called twice, so question and answer share weights.
    q_embedding = bert_model(q_id, attention_mask=q_mask, token_type_ids=q_atn)[0]
    a_embedding = bert_model(a_id, attention_mask=a_mask, token_type_ids=a_atn)[0]
    # Identity activation declared with dtype='float16'.
    # NOTE(review): presumably a memory-saving cast of the embeddings -- confirm.
    q_embedding = tf.keras.layers.Activation('linear', dtype='float16')(q_embedding)
    a_embedding = tf.keras.layers.Activation('linear', dtype='float16')(a_embedding)

    # NOTE(review): no mask is passed to the pooling layers, so padded positions
    # presumably contribute to the average -- confirm this is intended.
    q = tf.keras.layers.GlobalAveragePooling1D()(q_embedding)
    a = tf.keras.layers.GlobalAveragePooling1D()(a_embedding)
    
    x = tf.keras.layers.Concatenate()([q, a])
    x = tf.keras.layers.Dropout(0.2)(x)
    # 30 units: one sigmoid output per entry of `output_categories`.
    x = tf.keras.layers.Dense(30, activation='sigmoid')(x)
   #x = tf.keras.layers.Activation('linear', dtype='float32')(x)

    model = tf.keras.models.Model(inputs=[q_id, q_mask, q_atn, a_id, a_mask, a_atn,], outputs=x)
    
    return model

#### 4. Obtain inputs and targets, as well as the indices of the train/validation splits

In [9]:
# Training targets, one column per output category.
outputs = compute_output_arrays(df=df_train, columns=output_categories)
# Training inputs are only needed when fine-tuning, so that call stays disabled:
#inputs = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
# Test inputs tokenized with the BERT-large vocabulary at MAX_SEQUENCE_LENGTH tokens.
test_inputs = compute_input_arrays(
    df=df_test,
    columns=input_categories,
    tokenizer=tokenizer,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Test inputs tokenized with the BERT-base vocabulary for the create_model0() ensemble.
# The length comes from MAX_SEQUENCE_LENGTH0 rather than a literal 512, so the arrays
# always match the width of the Input layers that create_model0() builds.
test_inputs0 = compute_input_arrays(df_test, input_categories, tokenizer0, MAX_SEQUENCE_LENGTH0)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




#### 5. Testing

In [11]:
#gkf = GroupKFold(n_splits=5).split(X=df_train.question_body, groups=df_train.question_body)
#
#valid_preds = []
#test_preds = []
#for fold, (train_idx, valid_idx) in enumerate(gkf):
#    
#    # will actually only do 2 folds (out of 5) to manage < 2h
#    if fold in [0, 2]:
#
#        train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
#        train_outputs = outputs[train_idx]
#
#        valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
#        valid_outputs = outputs[valid_idx]
#        
#        K.clear_session()
#        model = create_model()
#        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
#        model.compile(loss='binary_crossentropy', optimizer=optimizer)
#        model.fit(train_inputs, train_outputs, epochs=3, batch_size=6)
#        # model.save_weights(f'bert-{fold}.h5')
#        valid_preds.append(model.predict(valid_inputs))
#        test_preds.append(model.predict(test_inputs))
#        
#        rho_val = compute_spearmanr_ignore_nan(valid_outputs, valid_preds[-1])
#        print('validation score = ', rho_val)

In [12]:
# Collects one test-set prediction array per model; the arrays are averaged in the
# submission cell. Re-run this cell before re-running the prediction loop below,
# otherwise the same predictions are appended a second time.
test_preds = []
# Disabled: predictions from the BERT-large checkpoints (would use create_model()
# together with `test_inputs`).
#number_of_models = 2
#for i in range(number_of_models):
#    model = create_model()
#    model.load_weights('../input/blarge/large_models/best_modellarge_batch'+str(i)+'.h5')
#    test_preds.append(model.predict(test_inputs))    

In [13]:
# Predict the test set with each of the fine-tuned BERT-base checkpoints and collect
# the results in `test_preds` (initialised in the previous cell).
number_of_models = 5
for i in range(number_of_models):
    # Release the Keras state left behind by the previous iteration's model before
    # building the next one; without this, every loop pass keeps another full BERT
    # graph alive. Predictions already stored are plain arrays and are unaffected.
    K.clear_session()
    model = create_model0()
    model.load_weights(f'../input/pretrained-models-1/best_model3_batch{i}.h5')
    test_preds.append(model.predict(test_inputs0))

#### 6. Process and submit test predictions

Average fold predictions, then save as `submission.csv`

In [14]:
# Unweighted mean over the per-model prediction arrays collected in `test_preds`.
# The first column of the sample submission is left untouched (presumably the row
# id -- confirm); the predictions are written by position, so their column order
# must match the remaining columns of the sample submission.
df_sub.iloc[:, 1:] = np.average(test_preds, axis=0) # for weighted average set weights=[...]
# Write the file without the DataFrame index.
df_sub.to_csv('submission.csv', index=False)