## Import Packages & Libraries

In [3]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import *
from sklearn.model_selection import GroupKFold
import tensorflow.keras.backend as K

## Import Data

In [4]:
def read_data(train, test, fraction = 0):
    train_data = pd.read_csv(train + '.csv')
    test_data = pd.read_csv(test + '.csv')
    if fraction != 0:
        train_data = train_data.sample(frac=fraction)
        test_data = test_data.sample(frac=fraction)
    return train_data, test_data

In [5]:
# Use 0.5% for run-time purposes.
train, test = read_data('train', 'test', 0.005)

## Pre-process Data

In [6]:
# https://huggingface.co/transformers/pretrained_models.html
def preprocess():
    return BertTokenizer.from_pretrained('bert-base-uncased', additional_special_tokens=['<END_TITLE>'])

## Encode Data

In [38]:
# Transfer text data to vector series for training purposes.
def encode_func(dataframe):
    question_title = dataframe['question_title']
    question_body = dataframe['question_body']
    answer = dataframe['answer']

    question_encoded_dict = preprocess().encode_plus(question_title + ' <END_TITLE> ' + question_body,
                                                     None,
                                                     max_length=450,
                                                     pad_to_max_length=True,
                                                     add_special_tokens=True)

    answer_encoded_dict = preprocess().encode_plus(answer,
                                                   None,
                                                   max_length=450,
                                                   pad_to_max_length=True,
                                                   add_special_tokens=True)

    return pd.Series([question_encoded_dict['input_ids'],
                      question_encoded_dict['attention_mask'],
                      question_encoded_dict['token_type_ids'],
                      answer_encoded_dict['input_ids'],
                      answer_encoded_dict['attention_mask'],
                      answer_encoded_dict['token_type_ids']])

In [39]:
def get_token(data):
    returnme = data[['qa_id']].copy()
    returnme[['q_enc',
              'q_mask',
              'q_type_ids',
              'a_enc',
              'a_mask',
              'a_type_ids']] = data.apply(encode_func,
                                          axis=1)
    return returnme

In [40]:
train_tok = get_token(train)
test_tok = get_token(test)

In [37]:
train_tok.iloc[:5]

Unnamed: 0,qa_id,q_enc,q_mask,q_type_ids,a_enc,a_mask,a_type_ids
4898,7795,"[101, 16487, 2013, 10200, 4654, 8586, 2000, 24...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 2005, 2005, 2216, 6603, 2054, 1037, 1376...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4893,7788,"[101, 2478, 1037, 8301, 10412, 2005, 3467, 288...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 13433, 2278, 2036, 3084, 1037, 10412, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3081,4905,"[101, 2054, 2515, 1523, 10930, 1011, 7570, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 10930, 1011, 7570, 1011, 7570, 2003, 314...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4855,7725,"[101, 2129, 2064, 1045, 4638, 18833, 3298, 841...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 2008, 1005, 1055, 1037, 2204, 2801, 1998...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3793,6030,"[101, 6358, 3527, 21318, 8370, 3769, 6279, 208...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1999, 18315, 11022, 3087, 2842, 5927, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
def column_spit(data, type = 'target'):
    column = data.columns[11 : 41]
    if type == ('question'):
        return column[0 : 21]
    else:
        return column[21 : ]

In [11]:
column = train.columns[11 : 41]
question_score = column_spit(train, 'question')
answer_score = column_spit(train)

In [53]:
# pre-train stage: pre-process and clean data and change type to int32, this is to increase the speed 
# of training and also save the memory (64 bit to 32 bit) since our data does not need 64 bit.
def pre_train():
    
    #maxinum length of sentence
    leng = 450
    #set config to false
    config = BertConfig()
    #For right now, use no hidden states (runtime)
    config.output_hidden_states = False
    
    #call model
    bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)
    
    #call encode, mask, type_ids in int32 format of keras tensorflow layers
    ecocoded = tf.keras.layers.Input((leng,), dtype=tf.int32)
    
    mask = tf.keras.layers.Input((leng,), dtype=tf.int32)
    
    type_ids = tf.keras.layers.Input((leng,), dtype=tf.int32)
    
    bert = bert_model(ecocoded, attention_mask=mask, token_type_ids=type_ids)[0]
    
    bert_summary = tf.keras.layers.Flatten()(tf.keras.layers.AveragePooling1D(leng)(bert))
    
    return bert_model, ecocoded, mask, type_ids, bert, bert_summary

In [56]:
a = train_tok.index.isin(train_tok.iloc[train_index].index)

In [57]:
a

array([ True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True])

In [13]:
#tuning parameter
k_m = GroupKFold(5)

for i, (train_index,
        test_index) in enumerate(k_m.split(train_tok,
                                           groups=train['question_title'])):

    train_boolean = train_tok.index.isin(train_tok.iloc[train_index].index)

    K.clear_session()


    question_bert_model, question_encoded, question_mask, question_type_ids, question_bert, question_bert_summary = pre_train()

    answer_bert_model, answer_encoded, answer_mask, answer_type_ids, answer_bert, answer_bert_summary = pre_train()

    bert_summary = tf.keras.layers.Concatenate()([question_bert_summary, answer_bert_summary])

    output = tf.keras.layers.Dense(30, activation='sigmoid')(tf.keras.layers.Dropout(0.2)(bert_summary))

    model = tf.keras.models.Model(inputs=[question_encoded,
                                          question_mask,
                                          question_type_ids,
                                          answer_encoded,
                                          answer_mask,
                                          answer_type_ids],
                                  outputs=output)

    model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
                  loss='binary_crossentropy')
    model.fit([np.array(list(train_tok.loc[train_boolean, c].values)) for c in
               ['q_enc',
                'q_mask',
                'q_type_ids',
                'a_enc',
                'a_mask',
                'a_type_ids']],
              train.loc[train_boolean,
                        column].values,
              epochs=3,
              verbose=2,
              batch_size=6)

    if not os.path.exists('model_{}'.format(i)):
        os.mkdir('model_{}'.format(i))
    model.save_weights(os.path.join('model_{}'.format(i),
                                    'model.h5'))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Train on 24 samples
Epoch 1/3
24/24 - 135s - loss: 0.6395
Epoch 2/3
24/24 - 98s - loss: 0.4933
Epoch 3/3
24/24 - 103s - loss: 0.4404
Train on 24 samples
Epoch 1/3
24/24 - 167s - loss: 0.6381
Epoch 2/3
24/24 - 139s - loss: 0.4950
Epoch 3/3
24/24 - 118s - loss: 0.4339
Train on 24 samples
Epoch 1/3
24/24 - 149s - loss: 0.6260
Epoch 2/3
24/24 - 138s - loss: 0.4707
Epoch 3/3
24/24 - 153s - loss: 0.4161
Train on 24 samples
Epoch 1/3
24/24 - 164s - loss: 0.6568
Epoch 2/3
24/24 - 145s - loss: 0.4831
Epoch 3/3
24/24 - 1180s - loss: 0.4238
Train on 24 samples
Epoch 1/3
24/24 - 183s - loss: 0.6196
Epoch 2/3
24/24 - 164s - loss: 0.4690
Epoch 3/3
24/24 - 164s - loss: 0.4201
