In [1]:
!pip install ../input/sacremoses/sacremoses-master/ > /dev/null
!pip install ../input/transformers-master/transformers-master/ > /dev/null

## 改定履歴
1. Blend Ver2 First Submit (0.424)
2. Change Xlnet to two branch Model
3. BugFix output_base (0.426)
4. Update Tricks (0.435)
5. Add Large Back (0.440)
6. Update Trick with rescaling -- submission trick2 (0.447)
7. Update XlNet, Change Weight -> 0.3 (submission Overtime)
8. More Efficiency (0.449)
9. Back to Origin Weight
10. Change Bert-Large to Roberta-Large (0.452)
11. Change Bert-Finetune-1 to 0.396 New Structure (0.451)
12. Trick Test (add limit 0.453)
13. New Trick Test 
14. Change Activation of Bert-Base 2 (0.455) from (tanh => SELU)
15. round up
16. round down (0.443)
17. np.around -> np.floor （0.459)
18. Change Bert-Finetune-1 to 0.396 New Structure
19. Change Bert-Finetune-1 to 0.398 Model （0.458)
20. 

In [2]:
from torch.utils.data import DataLoader, Dataset
import numpy as np 
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import datetime
import seaborn as sns
import time
import scipy.stats as stats
import gc
import re
import operator 
import sys
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from tqdm import tqdm, tqdm_notebook
from scipy.stats import spearmanr
import os
import warnings
warnings.filterwarnings('ignore')
import pickle
import random
import shutil
import transformers
from math import floor, ceil
from glob import glob
#from xml.sax.saxutils import unescape
import tensorflow as tf
import tensorflow_hub as hub
import keras.backend as K
from numba import cuda
from nltk.corpus import stopwords
# English stop-word set; the word-overlap features filter tokens against it.
eng_stopwords = set(stopwords.words("english"))
# Registers `progress_apply` on pandas objects (tqdm integration).
tqdm.pandas()

Using TensorFlow backend.


In [3]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, and PyTorch (CPU and every visible CUDA device), and asks cuDNN
    for deterministic kernels.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs; pin it off
    # explicitly so determinism does not depend on an earlier cell's setting.
    torch.backends.cudnn.benchmark = False

In [4]:
# Global configuration for the inference run.
SEED                = 2020
# Competition data and scratch directories (relative paths).
DATA_DIR            = "../input/google-quest-challenge/"
WORK_DIR            = "../working/"
# bert-base-uncased vocabulary, weights and config.
BERT_VOCAB_PATH     = "../input/pretrained-bert-models-for-pytorch/bert-base-uncased-vocab.txt"
BERT_MODEL_PATH     = "../input/pretrained-bert-models-for-pytorch/bert-base-uncased/pytorch_model.bin"
BERT_CONFIG_PATH    = "../input/pretrained-bert-models-for-pytorch/bert-base-uncased/bert_config.json"
# XLNet-large assets.
# NOTE(review): not referenced in the cells visible here — confirm they are still needed.
XLNET_VOCAB_PATH    = "../input/xlnetlargecased/xlnet_cased_L-24_H-1024_A-16/spiece.model"
XLNET_MODEL_PATH    = "../input/xlnetlargecased/xlnet_cased_L-24_H-1024_A-16/xlnet_model.ckpt.index"
XLNET_CONFIG_PATH   = "../input/xlnetlargecased/xlnet_cased_L-24_H-1024_A-16/xlnet_config.json"
# Text columns that get a sentence embedding in the external-features section.
input_columns       = ['question_title', 'question_body', 'answer']
# Seed all RNGs before any data or model object is created.
seed_everything(SEED)
# Batch size used by the DataLoaders and as predict_result's default argument.
batch_size          = 64

In [5]:
# Load the test split; every later cell adds feature columns to this frame in place.
test_df  = pd.read_csv(os.path.join(DATA_DIR,"test.csv"))

## Bert Tokenizer

In [6]:
# BERT-base configuration and WordPiece tokenizer loaded from local files.
bert_config = transformers.BertConfig.from_json_file(BERT_CONFIG_PATH)
# The model heads below read the last four hidden layers, so the encoder must return them.
bert_config.output_hidden_states = True
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_VOCAB_PATH)

In [7]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize title/question/answer and trim them to fit one BERT input.

    Tokenization uses the module-level ``tokenizer``. Trimming only happens
    when the three token lists plus 4 special tokens ([CLS] and three [SEP],
    added by ``convert_lines``) exceed ``max_sequence_length``.

    Args:
        title, question, answer: raw text strings.
        max_sequence_length: total token budget including special tokens.
        t_max_len, q_max_len, a_max_len: base budgets for each part. The
            defaults add up to 512 together with the 4 special tokens, so a
            different ``max_sequence_length`` needs matching budgets.

    Returns:
        Tuple ``(t, q, a)`` of token lists.

    Raises:
        ValueError: if the redistributed budgets plus 4 do not equal
            ``max_sequence_length``.
    """

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        # A short title hands its unused budget to the answer (floor half)
        # and the question (ceil half).
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        # A short answer hands its slack to the question, otherwise a short
        # question hands its slack to the answer; if both are long, both are
        # cut to their budgets.
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        # Keep the first third of the budget from the head and the rest from
        # the tail of the question and of the answer.
        # NOTE(review): when the title is longer than t_max_len, q_new_len or
        # a_new_len can exceed the real token count (e.g. t_len=100,
        # q_len=320, a_len=100 with the defaults and max_sequence_length=512
        # gives q_new_len=378). The head and tail slices then overlap and
        # tokens are repeated. Left unchanged here; confirm whether the
        # checkpoints were trained with the same behavior before fixing.
        t = t[:t_new_len]
        q_len_head = round(q_new_len/3)
        q_len_tail = -1* (q_new_len -q_len_head)
        a_len_head = round(a_new_len/3)
        a_len_tail = -1* (a_new_len -a_len_head) 
        
        q = q[:q_len_head]+q[q_len_tail:]
        a = a[:a_len_head]+a[a_len_tail:]
    
    return t, q, a

def convert_lines(title, question, answer, max_sequence_length, tokenizer, t_max_len_seq=30, q_max_len_seq=239, a_max_len_seq=239):
    """Convert (title, question, answer) triples into padded BERT inputs.

    Args:
        title, question, answer: equally long iterables of raw text.
        max_sequence_length: length every output row is padded to.
        tokenizer: object providing ``convert_tokens_to_ids``. Tokenization
            itself happens inside ``_trim_input``, which uses the
            module-level ``tokenizer``.
        t_max_len_seq, q_max_len_seq, a_max_len_seq: token budgets forwarded
            to ``_trim_input``. They were previously accepted but ignored;
            the defaults match ``_trim_input``'s own defaults, so existing
            calls behave as before.

    Returns:
        Three ``np.ndarray`` objects: token ids, attention masks and
        segment ids, one row per input triple.
    """
    all_tokens   = []
    all_masks    = []
    all_segments = []

    for t, q, a in tqdm(zip(title, question, answer)):

        tokens_t, tokens_q, tokens_a = _trim_input(t, q, a,
                                                   max_sequence_length=max_sequence_length,
                                                   t_max_len=t_max_len_seq,
                                                   q_max_len=q_max_len_seq,
                                                   a_max_len=a_max_len_seq)

        stoken = ["[CLS]"] + tokens_t + ["[SEP]"] + tokens_q + ["[SEP]"] + tokens_a + ["[SEP]"]

        # Token ids, right-padded with id 0.
        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))

        # 1 for real tokens, 0 for padding.
        attention_masks = _get_masks(stoken, max_sequence_length)

        # 0 for title and question, 1 for the answer.
        input_segments = _get_segments(stoken, max_sequence_length)

        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [8]:
# Tokenize the whole test set for the BERT-base models (512-token inputs).
test_tokens, test_masks, test_segments = convert_lines(test_df["question_title"],
                                                       test_df["question_body"], 
                                                       test_df["answer"],
                                                       max_sequence_length=512, 
                                                       tokenizer=tokenizer)

476it [00:05, 80.62it/s]


## External Features 

In [9]:
# Load the sentence-encoder module from a local directory via TensorFlow Hub.
module_url = "../input/universalsentenceencoderlarge4/"
embed = hub.load(module_url)

In [10]:
# Sentence embeddings for each text column, keyed as '<column>_embedding'.
embeddings_test = {}

for text in input_columns:
    print(text)
    # '?' and '!' are replaced literally. regex=False is passed explicitly
    # because '?' is a regex metacharacter and the pandas default for `regex`
    # differs between versions.
    test_text = (test_df[text]
                 .str.replace('?', '.', regex=False)
                 .str.replace('!', '.', regex=False)
                 .tolist())
    
    # Embed in small batches and stack the per-batch arrays.
    curr_test_emb = []
    batch_size_ = 4
    ind = 0
    while ind*batch_size_ < len(test_text):
        curr_test_emb.append(embed(test_text[ind*batch_size_: (ind + 1)*batch_size_])["outputs"].numpy())
        ind += 1
        
    embeddings_test[text + '_embedding'] = np.vstack(curr_test_emb)
    
# Drop the encoder and reset the Keras session before the PyTorch models run.
del embed
K.clear_session()
gc.collect()

question_title
question_body
answer


302230

In [11]:
# Select GPU 0 through numba and close its context; per the original note this
# is meant to clear GPU memory (presumably what TensorFlow allocated — TODO confirm).
cuda.select_device(0) #clear GPU memory 
cuda.close()

In [12]:
# Re-select GPU 0 after the context was closed in the previous cell.
cuda.select_device(0) #restart cuda

<weakproxy at 0x7ff69030a458 to Device at 0x7ff72a3f9a90>

In [13]:
# Row-wise squared L2 distance between two embedding matrices (no square root is taken).
l2_dist = lambda x, y: np.power(x - y, 2).sum(axis=1)
# Row-wise dot product. NOTE(review): despite the name this is a similarity, and
# it equals cosine similarity only if the embeddings are unit-norm — TODO confirm.
cos_dist = lambda x, y: (x*y).sum(axis=1)

# One row per test example, one column per (pair, metric) combination.
dist_features_test = np.array([
    l2_dist(embeddings_test['question_title_embedding'], embeddings_test['answer_embedding']),
    l2_dist(embeddings_test['question_body_embedding'], embeddings_test['answer_embedding']),
    l2_dist(embeddings_test['question_body_embedding'], embeddings_test['question_title_embedding']),
    cos_dist(embeddings_test['question_title_embedding'], embeddings_test['answer_embedding']),
    cos_dist(embeddings_test['question_body_embedding'], embeddings_test['answer_embedding']),
    cos_dist(embeddings_test['question_body_embedding'], embeddings_test['question_title_embedding'])
]).T

idxs = range(len(test_df))
# Column order must match the row order of the array built above.
columns = ['l2_qt_a', 'l2_qb_a', 'l2_qb_qt',
           'cos_qt_a', 'cos_qb_a', 'cos_qb_qt',
          ]
dist_features_df = pd.DataFrame(index=idxs, columns=columns)
dist_features_df[columns] = dist_features_test
# Copy by position (.values) so test_df's own index does not affect alignment.
for col in columns: test_df[col] = dist_features_df[col].values

In [14]:
def _overlap_ratio(n_shared, n_left, n_right):
    """Return n_shared / (n_left + n_right - n_shared), or 0.0 if that denominator is 0."""
    denominator = n_left + n_right - n_shared
    return n_shared / denominator if denominator else 0.0


def add_external_features(df):
    """Add hand-crafted length and word-overlap features to ``df``.

    The frame is modified in place and also returned. ``question_body`` and
    ``answer`` are cast to ``str``. Requires ``tqdm.pandas()`` to have been
    called (for ``progress_apply``) and the module-level ``eng_stopwords``.

    Added columns: word counts, unique-word counts and character counts for
    title/body/answer, ``qa_word_overlap`` / ``qt_word_overlap`` (number of
    distinct non-stopword tokens shared with the answer) and their normalised
    versions ``qa_word_overlap_norm`` / ``qta_word_overlap_norm``.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns.

    Returns:
        The same DataFrame object with the new columns.
    """
    # Question length features.
    df['question_body']      = df['question_body'].progress_apply(lambda x:str(x))
    # Raw string: '\S' in a normal literal is an invalid escape sequence.
    df['question_num_words'] = df.question_body.str.count(r'\S+')
    df['question_title_num_words'] = df.question_title.str.count(r'\S+')

    # Answer length features.
    df['answer']            = df['answer'].progress_apply(lambda x:str(x))
    df['answer_num_words']  = df.answer.str.count(r'\S+')

    df["question_title_num_unique_words"] = df["question_title"].progress_apply(lambda x: len(set(str(x).split())))
    df["question_body_num_unique_words"]  = df["question_body"].progress_apply(lambda x: len(set(str(x).split())))
    df["answer_num_unique_words"]         = df["answer"].progress_apply(lambda x: len(set(str(x).split())))

    df["question_title_num_chars"] = df["question_title"].apply(lambda x: len(str(x)))
    df["question_body_num_chars"]  = df["question_body"].apply(lambda x: len(str(x)))
    df["answer_num_chars"]         = df["answer"].apply(lambda x: len(str(x)))

    # Temporary non-stopword token lists. str() keeps a non-string title from
    # crashing .split(), matching the casts used for the counts above.
    df['qt_words'] = df['question_title'].apply(lambda s: [f for f in str(s).split() if f not in eng_stopwords])
    df['q_words'] = df['question_body'].apply(lambda s: [f for f in str(s).split() if f not in eng_stopwords])
    df['a_words'] = df['answer'].apply(lambda s: [f for f in str(s).split() if f not in eng_stopwords])
    df['qa_word_overlap'] = df.apply(lambda s: len(np.intersect1d(s['q_words'], s['a_words'])), axis = 1)
    df['qt_word_overlap'] = df.apply(lambda s: len(np.intersect1d(s['qt_words'], s['a_words'])), axis = 1)

    # The denominator is 0 when both token lists are empty; report 0.0 then
    # instead of dividing by zero.
    df['qa_word_overlap_norm'] = df.apply(
        lambda s: _overlap_ratio(s['qa_word_overlap'], len(s['a_words']), len(s['q_words'])), axis = 1)
    df['qta_word_overlap_norm'] = df.apply(
        lambda s: _overlap_ratio(s['qt_word_overlap'], len(s['a_words']), len(s['qt_words'])), axis = 1)
    df.drop(['q_words', 'a_words', 'qt_words'], axis = 1, inplace = True)

    return df

In [15]:
# Adds the hand-crafted count/overlap columns to the test frame.
test_df = add_external_features(test_df)

100%|██████████| 476/476 [00:00<00:00, 213004.24it/s]
100%|██████████| 476/476 [00:00<00:00, 205717.54it/s]
100%|██████████| 476/476 [00:00<00:00, 152905.62it/s]
100%|██████████| 476/476 [00:00<00:00, 38240.32it/s]
100%|██████████| 476/476 [00:00<00:00, 30374.55it/s]


In [16]:
# Count features that are rescaled with the pre-fitted scaler below.
handmade_cols = ["question_body_num_unique_words", 'question_num_words',
                 "question_title_num_unique_words", "question_title_num_words", 
                 "answer_num_unique_words", "answer_num_words"]

# NOTE(review): pickle.load executes arbitrary code from the file; only safe
# because this artefact comes from the author's own dataset.
with open('../input/quest-bert-3/scaler.pickle', mode='rb') as f:
    num_words_scaler = pickle.load(f)

# The column order passed to transform() presumably has to match the order the
# scaler was fitted with — TODO confirm against the training notebook.
test_df[handmade_cols]=  num_words_scaler.transform(test_df[handmade_cols].values)
#test_handmade_features= test_df[handmade_cols + ['l2_qt_a', 'l2_qb_a', 'cos_qt_a', 'cos_qb_a']].values

In [17]:
def label_encoder(x, dict_reverse):
    """Map a raw label to its integer id, falling back to 0 for unknown labels.

    Args:
        x: raw label value (e.g. a category or host string).
        dict_reverse: mapping from raw label to integer id.

    Returns:
        ``dict_reverse[x]`` if present, otherwise 0.
    """
    try:
        return dict_reverse[x]
    except (KeyError, TypeError):
        # KeyError: label not seen when the mapping was built.
        # TypeError: unhashable label. Anything else is a real error and is
        # no longer hidden by a bare ``except``.
        return 0

# Load the label -> id mappings and encode the two categorical columns in place.
# Labels missing from a mapping are encoded as 0 by label_encoder.
# NOTE(review): pickle.load is only safe on trusted files.
with open('../input/quest-bert-2/category.pickle', mode='rb') as f:
    category_dict_reverse = pickle.load(f)
test_df['category'] = test_df['category'].apply(lambda x: label_encoder(x, category_dict_reverse))

with open('../input/quest-bert-2/host.pickle', mode='rb') as f:
    host_dict_reverse = pickle.load(f)
test_df['host'] = test_df['host'].apply(lambda x: label_encoder(x, host_dict_reverse))

# Embedding table sizes; the +1 presumably reserves id 0 for unknown labels — TODO confirm.
n_cat    = len(category_dict_reverse) + 1
cat_emb  = 128
n_host   = len(host_dict_reverse) + 1
host_emb = 128

## DataLoader

In [18]:
class QuestDataset_test(Dataset):
    """Inference dataset yielding one example's model inputs as a list.

    Each item is ``[token_id, mask, segment, host, category, handmade]``,
    taken at the same index from the six arrays given to the constructor.
    """

    def __init__(self, token_ids, masks, segments, hosts, categories, handmade_features):
        self.token_ids, self.masks, self.segments = token_ids, masks, segments
        self.hosts, self.categories = hosts, categories
        self.handmades = handmade_features

    def __len__(self):
        # The number of examples is the leading dimension of the token-id array.
        return self.token_ids.shape[0]

    def __getitem__(self, idx):
        sources = (self.token_ids, self.masks, self.segments,
                   self.hosts, self.categories, self.handmades)
        return [source[idx] for source in sources]

## Model1 -- Bert-Base Finetune(LB:0.398)

In [19]:
def children(m):
    """Return ``m`` itself if it is a list/tuple, else a list of ``m.children()``."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())

def set_trainable_attr(m, b):
    """Set ``m.trainable`` and the ``requires_grad`` flag of all of ``m``'s parameters to ``b``."""
    m.trainable = b
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call ``f`` on ``m`` (if it is a module) and recursively on all its descendants.

    ``m`` may be a module or a list/tuple of modules. Despite the name, ``f``
    is applied to every ``torch.nn.Module`` visited, not only to leaves.
    """
    # Children are collected before f runs, then visited depth-first.
    descendants = children(m)
    if isinstance(m, torch.nn.Module):
        f(m)
    for item in descendants:
        apply_leaf(item, f)

def set_trainable(l, b):
    """Freeze (``b=False``) or unfreeze (``b=True``) ``l`` and every sub-module below it."""
    def _toggle(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _toggle)

In [20]:
class QuestModel(nn.Module):
    """BERT-base model with mean-pooled hidden states plus external features.

    Concatenates (a) a dense projection of the mean over tokens of the last
    four hidden layers, (b) BERT's pooled output, and (c) category/host
    embeddings with the hand-crafted features, then applies two linear heads
    of 21 and 9 outputs (30 in total).

    Attribute names and layer shapes must stay as they are: saved state dicts
    are loaded into this class with ``load_state_dict``.
    """

    def __init__(self, n_cat, cat_emb, n_host, host_emb, num_labels):
        # n_cat / n_host: embedding table sizes; cat_emb / host_emb: embedding widths.
        # num_labels is accepted but not used; the head sizes are fixed at 21 and 9.
        super().__init__()
        BERT_DIMS = 768
        model_path = os.path.join('../input/pretrained-bert-models-for-pytorch/bert-base-uncased/')
        # Uses the module-level bert_config (hidden states enabled).
        self.bert_model = transformers.BertModel.from_pretrained(model_path, config=bert_config)
        set_trainable(self.bert_model.embeddings.word_embeddings, False)
        
        self.category_embedding = nn.Embedding(n_cat, cat_emb)
        self.host_embedding     = nn.Embedding(n_host, host_emb)
        
        # dropout1 is created but not applied in forward().
        self.dropout1 = nn.Dropout(0.1)
        self.fc1  = nn.Linear(BERT_DIMS*4, BERT_DIMS*4)
        # Head input: 4*768 (projected mean pool) + 768 (pooled output)
        # + both embeddings + 12 hand-crafted features.
        self.fc2  = nn.Linear(BERT_DIMS*5 + int(cat_emb) + int(host_emb) + 12, 21)
        self.fc3  = nn.Linear(BERT_DIMS*5 + int(cat_emb) + int(host_emb) + 12, 9)
        self.activation = nn.Tanh()
        
        self._init_weights(self.category_embedding)
        self._init_weights(self.host_embedding)
        self._init_weights(self.fc1)
        self._init_weights(self.fc2)
        self._init_weights(self.fc3)
    
    def _init_weights(self, module):
        """ Initialize the weights """
        # Normal(0, 0.02) for Linear/Embedding weights, zeros for Linear biases.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            print("initailize weight")
            module.weight.data.normal_(mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            print("initailize bias")
            module.bias.data.zero_()

        
    def forward(self, token_ids, masks, segments, hosts, categories, handmades):
        """Return the raw (pre-sigmoid) scores, 21 + 9 columns per example."""
        
        category_embed      = self.category_embedding(categories)
        host_embed          = self.host_embedding(hosts)
        
        external_features   = torch.cat((category_embed, host_embed, handmades), 1)
        # NOTE(review): assumes the encoder returns a 3-tuple
        # (sequence output, pooled output, all hidden states) — confirm for
        # the installed transformers version.
        _, pooled_output, hidden_states = self.bert_model(input_ids=token_ids, 
                                                          token_type_ids=segments, 
                                                          attention_mask=masks)
        
        # Concatenate the last four layers on the feature axis and average over
        # dim 1. The mean is taken over all positions; `masks` is not applied here.
        meanpooled_output = torch.mean(torch.cat((hidden_states[-1], 
                                                  hidden_states[-2],
                                                  hidden_states[-3], 
                                                  hidden_states[-4],), 2) , 1)
        
        meanpooled_output = self.fc1(meanpooled_output)
        meanpooled_output = self.activation(meanpooled_output)
        
        pooled_output     = torch.cat((meanpooled_output,
                                       pooled_output, 
                                       external_features), 1)
    
        # Two heads over the same features (presumably question and answer
        # targets, going by the variable names).
        q_results           = self.fc2(pooled_output)
        a_results           = self.fc3(pooled_output)
        
        results             = torch.cat((q_results, a_results), 1)
        
        return results

In [21]:
def predict_result(model, test_loader, batch_size=batch_size):
    """Run ``model`` over ``test_loader`` and return sigmoid probabilities.

    The output size is taken from ``test_loader.dataset`` and rows are filled
    by a running offset based on each batch's real length. The previous
    version read the global ``test_set`` and assumed every batch held
    exactly ``batch_size`` rows.

    Args:
        model: module called as ``model(token_ids, masks, segments, hosts,
            categories, handmades)`` and returning 30 scores per example.
        test_loader: non-shuffled DataLoader yielding those six inputs.
        batch_size: kept for backward compatibility; no longer used.

    Returns:
        ``np.ndarray`` of shape ``(len(test_loader.dataset), 30)``. Rows the
        loader does not yield stay at 0.
    """
    n_rows = len(test_loader.dataset)
    output = np.zeros((n_rows, 30))

    # Run on the model's own device; fall back to CUDA (the old hard-coded
    # behavior) for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    model.eval()
    offset = 0
    with torch.no_grad():
        for inputs in test_loader:
            token_ids, masks, segments, hosts, categories, handmades = inputs
            token_ids   = token_ids.long().to(device)
            masks       = masks.long().to(device)
            segments    = segments.long().to(device)
            hosts       = hosts.long().to(device)
            categories  = categories.long().to(device)
            handmades   = handmades.float().to(device)

            predictions = model(token_ids, masks, segments, hosts, categories, handmades)
            predictions = torch.sigmoid(predictions)

            n_batch = predictions.shape[0]
            output[offset:offset + n_batch, :] = predictions.detach().cpu().numpy()
            offset += n_batch

    return output

In [22]:
# Per-checkpoint predictions for model 1, and the checkpoint files to average over.
results = []
pretrain_weighted =  glob('../input/quest-bert-5/*.pt')

In [23]:
# 12 numeric features per example: 6 scaled counts, 4 embedding distances and
# 2 overlap ratios — this matches the "+ 12" in the model head input size.
test_handmade_features= test_df[handmade_cols + ['l2_qt_a', 'l2_qb_a', 'cos_qt_a', 'cos_qb_a',
                                                 'qa_word_overlap_norm',
                                                 'qta_word_overlap_norm']].values
test_set    = QuestDataset_test(test_tokens, test_masks, test_segments, 
                                test_df['host'].values,
                                test_df['category'].values,
                                test_handmade_features)
# shuffle=False keeps predictions aligned with test_df row order.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [24]:
# Build model 1 once; the loop below loads each checkpoint's weights into it.
model = QuestModel(n_cat, cat_emb, n_host, host_emb, num_labels=30)
model.cuda()

initailize weight
initailize weight
initailize weight
initailize bias
initailize weight
initailize bias
initailize weight
initailize bias


QuestModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [25]:
# Predict with every checkpoint of model 1.
for i, weight in tqdm(enumerate(pretrain_weighted)):
    model.load_state_dict(torch.load(weight))
    results.append(predict_result(model, test_loader))
    
# Plain average over checkpoints.
# NOTE(review): if the glob matched no files, this divides by zero, and `result`
# (deleted in a later cell) is never defined.
output_base = np.zeros((len(test_set),30))
for result in results:
    output_base += result
output_base /= len(results)

10it [01:30,  9.06s/it]


In [26]:
# Sanity check: number of checkpoints that were averaged.
len(results)

10

In [27]:
# Inspect the averaged predictions for the first test row.
output_base[0]

array([9.45578533e-01, 6.71771759e-01, 2.00605123e-01, 4.14248723e-01,
       6.44091034e-01, 4.84917250e-01, 6.85563320e-01, 6.66393864e-01,
       6.76841265e-01, 1.26994071e-03, 7.70822793e-01, 6.89096743e-01,
       4.94433356e-03, 2.47739451e-01, 1.95695126e-03, 2.10819520e-03,
       9.48266493e-02, 6.05030615e-02, 7.64178061e-01, 6.41298998e-04,
       9.33368802e-01, 9.21338159e-01, 5.54943672e-01, 9.68319619e-01,
       9.65477335e-01, 8.22495657e-01, 3.56030266e-02, 1.78340932e-02,
       8.96994001e-01, 9.29834127e-01])

In [28]:
# Release the per-checkpoint arrays; `result` is the loop variable left over
# from the averaging cell.
del results
del result
gc.collect()

254

## Model2: Bert-base Li's Method (LB:0.400)

In [29]:
# Free cached GPU memory before building the next model.
torch.cuda.empty_cache() ## Clear_Memory

In [30]:
class QuestModel(nn.Module):
    """BERT-base model using the first-token vectors of the last four layers.

    This definition replaces the earlier ``QuestModel`` in the notebook
    namespace. It concatenates the position-0 hidden vector of the last four
    layers, projects it through ``fc1`` + SELU, appends category/host
    embeddings and the hand-crafted features, and applies two linear heads of
    21 and 9 outputs (30 in total).

    Attribute names and layer shapes must stay as they are: saved state dicts
    are loaded into this class with ``load_state_dict``.
    """
    
    def __init__(self, n_cat, cat_emb, n_host, host_emb, num_labels):
        # num_labels is accepted but not used; the head sizes are fixed at 21 and 9.
        super().__init__()
        BERT_DIMS = 768
        model_path = os.path.join('../input/pretrained-bert-models-for-pytorch/bert-base-uncased/')
        # Uses the module-level bert_config (hidden states enabled).
        self.bert_model = transformers.BertModel.from_pretrained(model_path, config=bert_config)
        set_trainable(self.bert_model.embeddings.word_embeddings, False)
        
        self.category_embedding = nn.Embedding(n_cat, cat_emb)
        self.host_embedding     = nn.Embedding(n_host, host_emb)
        
        #self.dropout1 = nn.Dropout(0.2)
        self.fc1  = nn.Linear(BERT_DIMS*4, BERT_DIMS*4)
        # Head input: 4*768 (projected first-token vectors) + both embeddings
        # + 12 hand-crafted features.
        self.fc2  = nn.Linear(BERT_DIMS*4 + int(cat_emb) + int(host_emb) + 12, 21)
        self.fc3  = nn.Linear(BERT_DIMS*4 + int(cat_emb) + int(host_emb) + 12, 9)
        # The attribute is named `tanh` but holds a SELU activation.
        self.tanh = nn.SELU()
        
        self._init_weights(self.category_embedding)
        self._init_weights(self.host_embedding)
        self._init_weights(self.fc1)
        self._init_weights(self.fc2)
        self._init_weights(self.fc3)
        
    def _init_weights(self, module):
        """ Initialize the weights """
        # Normal(0, 0.02) for Linear/Embedding weights, zeros for Linear biases.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            print("initailize weight")
            module.weight.data.normal_(mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            print("initailize bias")
            module.bias.data.zero_()

        
    def forward(self, token_ids, masks, segments, hosts, categories, handmades):
        """Return the raw (pre-sigmoid) scores, 21 + 9 columns per example."""
        
        category_embed      = self.category_embedding(categories)
        host_embed          = self.host_embedding(hosts)
        
        external_features   = torch.cat((category_embed, host_embed, handmades), 1)
        # NOTE(review): assumes the encoder returns a 3-tuple whose third item
        # holds all hidden states — confirm for the installed transformers version.
        _, _, hidden_layers = self.bert_model(input_ids=token_ids, token_type_ids=segments, attention_mask=masks)
        # Position 0 (the [CLS] slot in convert_lines' layout) of the last four layers.
        hidden_input  = [hidden_layers[-1][:, 0, :],
                         hidden_layers[-2][:, 0, :],
                         hidden_layers[-3][:, 0, :], 
                         hidden_layers[-4][:, 0, :]]
        
        cls           = torch.cat(hidden_input, dim = -1)
        cls           = self.fc1(cls)
        cls           = self.tanh(cls)
        cls           = torch.cat((cls, external_features), 1)
       # cls           = self.dropout1(cls)
        q_results     = self.fc2(cls)
        a_results     = self.fc3(cls)
        
        results       = torch.cat((q_results, a_results), 1)
        
        return results
        
        

In [31]:
# Same inputs as for model 1 (BERT-base tokens + 12 hand-crafted features);
# the dataset and loader are simply rebuilt.
test_handmade_features= test_df[handmade_cols + ['l2_qt_a', 'l2_qb_a', 'cos_qt_a', 'cos_qb_a',
                                                 'qa_word_overlap_norm',
                                                 'qta_word_overlap_norm']].values
test_set    = QuestDataset_test(test_tokens, test_masks, test_segments, 
                                test_df['host'].values,
                                test_df['category'].values,
                                test_handmade_features)
# shuffle=False keeps predictions aligned with test_df row order.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [32]:
# Per-checkpoint predictions for model 2, and its checkpoint files.
results = []
pretrain_weighted = glob('../input/quest-bert-3/*.pt')

In [33]:
# Instantiates the most recently defined QuestModel (model 2) and moves it to the GPU.
model = QuestModel(n_cat, cat_emb, n_host, host_emb, num_labels=30)
model.cuda()

initailize weight
initailize weight
initailize weight
initailize bias
initailize weight
initailize bias
initailize weight
initailize bias


QuestModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [34]:
# Predict with every checkpoint of model 2.
for i, weight in tqdm(enumerate(pretrain_weighted)):
    model.load_state_dict(torch.load(weight))
    results.append(predict_result(model, test_loader))
    
# Plain average over checkpoints.
# NOTE(review): divides by zero if the glob matched no files.
output_base2 = np.zeros((len(test_set),30))
for result in results:
    output_base2 += result
output_base2 /= len(results)

10it [01:29,  8.99s/it]


In [35]:
# Sanity check: number of checkpoints that were averaged.
len(results)

10

In [36]:
# Inspect the averaged predictions for the first test row.
output_base2[0]

array([9.53080994e-01, 6.55766457e-01, 1.39395885e-01, 4.74947986e-01,
       5.87175775e-01, 5.77031636e-01, 7.00860041e-01, 7.08363593e-01,
       5.21874544e-01, 1.11731946e-03, 8.26921171e-01, 6.31747839e-01,
       2.39375678e-03, 2.92256103e-01, 2.34182907e-03, 1.28975821e-03,
       7.69624813e-02, 6.44144397e-02, 7.56942523e-01, 4.25323534e-04,
       9.22005981e-01, 9.26921600e-01, 5.80926031e-01, 9.75345409e-01,
       9.73542809e-01, 8.21196079e-01, 1.42902730e-02, 1.88925107e-02,
       9.37293965e-01, 9.29328126e-01])

In [37]:
# Release the per-checkpoint arrays; `result` is the loop variable left over
# from the averaging cell.
del results
del result
gc.collect()

0

## Model3 -- RoBerta-Large Feature_based （LB：0.403)

In [38]:
# Free cached GPU memory before building the next model.
torch.cuda.empty_cache() ## Clear_Memory

In [39]:
# From here on the BERT_* names are rebound to the RoBERTa-large assets;
# despite the prefix they no longer refer to BERT.
BERT_VOCAB_PATH     = "../input/roberta-large/roberta-large-vocab.json"
BERT_MERGES_PATH    = "../input/roberta-large/roberta-large-merges.txt"
BERT_MODEL_PATH     = "../input/roberta-large/roberta-large-pytorch_model.bin"
BERT_CONFIG_PATH    = "../input/roberta-large/roberta-large-config.json"

In [40]:
# Rebinds the module-level bert_config and tokenizer to RoBERTa-large.
# Functions that read these globals (e.g. _trim_input) use RoBERTa from now on.
bert_config = transformers.RobertaConfig.from_json_file(BERT_CONFIG_PATH)
bert_config.output_hidden_states = True
tokenizer = transformers.RobertaTokenizer(BERT_VOCAB_PATH,BERT_MERGES_PATH)

In [41]:
def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=238, a_max_len=238):
    """Tokenize title/question/answer and trim them to fit one RoBERTa input.

    This redefinition replaces the BERT variant. It differs only in the
    default budgets (238 instead of 239) and in reserving 6 special tokens
    (``<s>`` plus five ``</s>``, added by ``convert_lines``) instead of 4.
    Tokenization uses the module-level ``tokenizer``.

    Args:
        title, question, answer: raw text strings.
        max_sequence_length: total token budget including special tokens.
        t_max_len, q_max_len, a_max_len: base budgets for each part. The
            defaults add up to 512 together with the 6 special tokens.

    Returns:
        Tuple ``(t, q, a)`` of token lists.

    Raises:
        ValueError: if the redistributed budgets plus 6 do not equal
            ``max_sequence_length``.
    """

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+6) > max_sequence_length:
        
        # A short title hands its unused budget to the answer (floor half)
        # and the question (ceil half).
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        # A short answer hands its slack to the question, otherwise a short
        # question hands its slack to the answer; if both are long, both are
        # cut to their budgets.
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+6 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+6)))
        
        # Keep the first third of the budget from the head and the rest from
        # the tail of the question and of the answer.
        # NOTE(review): when the title is longer than t_max_len, q_new_len or
        # a_new_len can exceed the real token count; the head and tail slices
        # then overlap and tokens are repeated. Left unchanged here; confirm
        # whether the checkpoints were trained with the same behavior.
        t = t[:t_new_len]
        q_len_head = round(q_new_len/3)
        q_len_tail = -1* (q_new_len -q_len_head)
        a_len_head = round(a_new_len/3)
        a_len_tail = -1* (a_new_len -a_len_head) 
        
        q = q[:q_len_head]+q[q_len_tail:]
        a = a[:a_len_head]+a[a_len_tail:]
    
    return t, q, a

def convert_lines(title, question, answer, max_sequence_length, tokenizer, t_max_len_seq=30, q_max_len_seq=238, a_max_len_seq=238):
    """Convert (title, question, answer) triples into padded RoBERTa inputs.

    Layout per example: ``<s> title </s></s> question </s></s> answer </s>``.

    Args:
        title, question, answer: equally long iterables of raw text.
        max_sequence_length: length every output row is padded to.
        tokenizer: object providing ``convert_tokens_to_ids``. Tokenization
            itself happens inside ``_trim_input``, which uses the
            module-level ``tokenizer``.
        t_max_len_seq, q_max_len_seq, a_max_len_seq: token budgets forwarded
            to ``_trim_input``. They were previously accepted but ignored;
            the defaults match ``_trim_input``'s own defaults, so existing
            calls behave as before.

    Returns:
        Three ``np.ndarray`` objects: token ids, attention masks and
        segment ids, one row per input triple.
    """
    all_tokens   = []
    all_masks    = []
    all_segments = []

    for t, q, a in tqdm(zip(title, question, answer)):

        tokens_t, tokens_q, tokens_a = _trim_input(t, q, a,
                                                   max_sequence_length=max_sequence_length,
                                                   t_max_len=t_max_len_seq,
                                                   q_max_len=q_max_len_seq,
                                                   a_max_len=a_max_len_seq)

        stoken = ["<s>"] + tokens_t + ["</s>"] + ["</s>"] + tokens_q + ["</s>"] + ["</s>"] + tokens_a + ["</s>"]

        # Token ids, right-padded with id 0.
        # NOTE(review): 0 is kept from the BERT version; confirm it matches
        # the padding id the RoBERTa checkpoints were trained with.
        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))

        # 1 for real tokens, 0 for padding.
        attention_masks = _get_masks(stoken, max_sequence_length)

        # _get_segments switches segment on the literal "[SEP]" token. The
        # separators added here are "</s>", so the ids are all 0 unless the
        # tokenizer itself emits that literal token.
        input_segments = _get_segments(stoken, max_sequence_length)

        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [42]:
# Model 3 uses 8 numeric features (6 scaled counts + 2 overlap ratios) and no
# embedding distances; this matches the "+ 8" in its head input size.
test_handmade_features= test_df[handmade_cols + ['qa_word_overlap_norm','qta_word_overlap_norm']].values

In [43]:
# Re-tokenize the test set with the RoBERTa tokenizer; this overwrites the BERT token arrays.
test_tokens, test_masks, test_segments = convert_lines(test_df["question_title"],
                                                       test_df["question_body"], 
                                                       test_df["answer"],
                                                       max_sequence_length=512, 
                                                       tokenizer=tokenizer)

476it [00:02, 165.12it/s]


In [44]:
# Dataset/loader for model 3: RoBERTa tokens plus the 8-column feature matrix.
test_set    = QuestDataset_test(test_tokens, test_masks, test_segments, 
                                test_df['host'].values,
                                test_df['category'].values,
                                test_handmade_features)
# shuffle=False keeps predictions aligned with test_df row order.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [45]:
class SpatialDropout(nn.Dropout2d):
    """Dropout that removes whole feature channels of a (N, T, K) sequence.

    The input is rearranged so the feature axis K sits in ``Dropout2d``'s
    channel position. A dropped feature is therefore zeroed across all T
    time steps of an example.
    """

    def forward(self, x):
        channels_first = x.permute(0, 2, 1).unsqueeze(2)   # (N, T, K) -> (N, K, 1, T)
        dropped = super().forward(channels_first)          # masks entire K channels
        return dropped.squeeze(2).permute(0, 2, 1)         # back to (N, T, K)

class Attention(nn.Module):
    """Additive attention pooling over the step axis of a (N, step_dim, feature_dim) tensor.

    Each step is scored by ``tanh(x @ weight + b)``. The exponentiated scores
    are optionally multiplied by ``mask``, normalised over the steps and used
    as weights for a sum over the step axis.

    Args:
        feature_dim: size of the last axis of the input.
        step_dim: number of steps (axis 1) of the input; fixed at construction.
        bias: whether to learn a per-step bias ``b``.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # Parameter names `weight` and `b` are part of the state-dict keys.
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Return the attention-weighted sum over steps, shape (N, feature_dim).

        Args:
            x: tensor of shape (N, step_dim, feature_dim).
            mask: optional (N, step_dim) tensor multiplied into the
                unnormalised weights (0 removes a step).
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # One scalar score per (example, step).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator. Adding it after the division
        # (as before) left a fully masked row as 0/0 = NaN. For ordinary rows
        # the weights change by at most ~1e-10.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
        weighted_input = x * torch.unsqueeze(a, -1)

        return torch.sum(weighted_input, 1)
    
class QuestModel(nn.Module):
    """RoBERTa encoder followed by a BiLSTM/BiGRU stack and two output heads.

    Two intermediate encoder hidden states are concatenated, passed through
    spatial dropout, a bidirectional LSTM and a bidirectional GRU, pooled
    (mean, max, attention) and joined with category/host embeddings plus the
    handmade features. ``fc1`` emits 21 values and ``fc2`` emits 9; they are
    concatenated into a 30-wide output.

    NOTE(review): ``num_labels`` is accepted but not used; the head widths are
    fixed at 21 + 9. A later cell defines another class with this same name,
    which shadows this one from that point on.
    """

    def __init__(self, n_cat, cat_emb, n_host, host_emb, num_labels):
        super().__init__()
        BERT_DIMS = 1024
        LSTM_UNITS = 512

        self.bert_model = transformers.RobertaModel.from_pretrained(
            os.path.join(BERT_MODEL_PATH), config=bert_config)
        # presumably freezes the encoder parameters -- see set_trainable.
        set_trainable(self.bert_model, False)

        # Submodule names and creation order are kept as-is: they define the
        # state-dict keys expected by the saved checkpoints.
        self.category_embedding = nn.Embedding(n_cat, cat_emb)
        self.host_embedding     = nn.Embedding(n_host, host_emb)
        self.embedding_dropout  = SpatialDropout(0.5)
        self.lstm1              = nn.LSTM(BERT_DIMS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2              = nn.GRU(LSTM_UNITS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.atten1             = Attention(LSTM_UNITS*2, 512)
        self.atten2             = Attention(LSTM_UNITS*2, 512)
        self.dropout1           = nn.Dropout(0.2)

        # mean + max + attention pooling of the bidirectional GRU output
        # (3 * 2 * LSTM_UNITS), both embeddings, and 8 extra columns
        # (presumably the handmade features -- TODO confirm).
        head_in_features = LSTM_UNITS*6 + int(cat_emb) + int(host_emb) + 8
        self.fc1 = nn.Linear(head_in_features, 21)
        self.fc2 = nn.Linear(head_in_features, 9)

        for module in (self.category_embedding, self.host_embedding, self.fc1, self.fc2):
            self._init_weights(module)

    def _init_weights(self, module):
        """Normal(0, 0.02) weights for Linear/Embedding; zero bias for Linear."""
        is_linear = isinstance(module, nn.Linear)
        if is_linear or isinstance(module, nn.Embedding):
            print("initailize weight")
            module.weight.data.normal_(mean=0.0, std=0.02)
        if is_linear and module.bias is not None:
            print("initailize bias")
            module.bias.data.zero_()

    def forward(self, token_ids, masks, segment, hosts, categories, handmades):
        """Return the concatenated outputs of both heads, (batch, 21 + 9)."""
        external_features = torch.cat(
            (self.category_embedding(categories),
             self.host_embedding(hosts),
             handmades),
            1)

        # The encoder is expected to return exactly three items; only the
        # per-layer hidden states (third item) are used.
        _, _, hidden_states = self.bert_model(input_ids=token_ids,
                                              attention_mask=masks,
                                              token_type_ids=segment)

        # Concatenate two intermediate layers along the feature axis.
        stacked_layers = torch.cat((hidden_states[-5], hidden_states[-6]), 2)
        rnn_output, _ = self.lstm1(self.embedding_dropout(stacked_layers))
        rnn_output, _ = self.lstm2(rnn_output)

        mean_pool = torch.mean(rnn_output, 1)
        max_pool  = torch.max(rnn_output, 1)[0]

        # Each head gets its own attention pooling and its own dropout draw.
        head_outputs = []
        for attention, head in ((self.atten1, self.fc1), (self.atten2, self.fc2)):
            pooled = torch.cat((mean_pool, max_pool, attention(rnn_output), external_features), 1)
            head_outputs.append(head(self.dropout1(pooled)))

        return torch.cat(head_outputs, 1)

In [46]:
# Per-checkpoint prediction arrays are appended here by the inference loop.
results = []
# Every *.pt checkpoint from the two RoBERTa-large weight datasets.
# NOTE(review): glob() does not guarantee an order; the later averaging does
# not depend on it.
pretrain_weighted = glob('../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-1/*.pt') + glob('../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-2/*.pt') 
# Embedding widths handed to QuestModel for the category / host lookups.
cat_emb = 256
host_emb = 256

In [47]:
# Show which checkpoint files were picked up.
pretrain_weighted

['../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-1/C_1.pt',
 '../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-1/C_2.pt',
 '../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-1/C_0.pt',
 '../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-2/C_3.pt',
 '../input/large-roberta-b32-steplr3-drop-tb-rmf-h56-2/C_4.pt']

In [48]:
# Build the network once; checkpoint weights are loaded into it later.
# NOTE(review): `n_cat` / `n_host` come from earlier cells.
model = QuestModel(n_cat, cat_emb, n_host, host_emb, num_labels=30)
# Move parameters to the GPU; as the cell's last expression this also echoes
# the module repr.
model.cuda()

initailize weight
initailize weight
initailize weight
initailize bias
initailize weight
initailize bias


QuestModel(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [49]:
# Run inference once per checkpoint (presumably one per CV fold -- confirm):
# load the weights into the shared model and keep that checkpoint's predictions.
for i, weight in tqdm(enumerate(pretrain_weighted)):
    model.load_state_dict(torch.load(weight))
    results.append(predict_result(model, test_loader))
    
# Element-wise mean of the per-checkpoint predictions, shape (n_test, 30).
output_large = np.zeros((len(test_set),30))
for result in results:
    output_large += result
output_large /= len(results)

5it [02:27, 29.52s/it]


In [50]:
# Sanity check: number of checkpoints that contributed to the average.
len(results)

5

In [51]:
# Peek at the averaged 30 predictions for the first test row.
output_large[0]

array([9.46754766e-01, 8.00233185e-01, 1.00257943e-01, 7.24978971e-01,
       7.70410395e-01, 8.02530169e-01, 6.78361404e-01, 5.70091641e-01,
       5.60915333e-01, 2.85077139e-04, 5.07255709e-01, 7.94069052e-01,
       7.81184603e-03, 4.85957157e-02, 2.53684779e-03, 7.96837751e-03,
       2.11237656e-01, 7.43329749e-02, 5.58339846e-01, 2.37485243e-04,
       9.45630944e-01, 9.25696194e-01, 5.91083348e-01, 9.62860894e-01,
       9.65110230e-01, 8.37498164e-01, 7.71636002e-02, 4.70619030e-02,
       7.97724640e-01, 9.17057133e-01])

In [52]:
# Drop the per-checkpoint arrays (and the leftover loop variable) now that the
# average is stored in output_large, then force a garbage-collection pass.
del results
del result
gc.collect()

808

## Model4: DeepAnxiety's XLNet

In [53]:
torch.cuda.empty_cache()  # release PyTorch's cached, unused GPU memory before the next model

In [54]:
# Re-seed the RNGs covered by seed_everything() before the XLNet stage.
SEED= 1414
seed_everything(SEED)

In [55]:
# Handmade feature matrix for the XLNet model: the columns in `handmade_cols`
# plus the two normalised word-overlap columns.
# NOTE(review): the model head adds 8 extra input features -- confirm this
# selection yields 8 columns.
test_handmade_features= test_df[handmade_cols + ['qa_word_overlap_norm', 'qta_word_overlap_norm']].values

In [56]:
# Load the XLNet config and ask the encoder to return every layer's hidden
# states (the model below reads hidden_states[-5] and hidden_states[-6]).
xlent_config = transformers.XLNetConfig.from_json_file(XLNET_CONFIG_PATH)
xlent_config.output_hidden_states = True
# Rebinds the global `tokenizer`, which _trim_input() reads.
tokenizer = transformers.XLNetTokenizer(XLNET_VOCAB_PATH)

In [57]:
def _get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def _get_segments2(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q_len_head = round(q_new_len/3)
        q_len_tail = -1* (q_new_len -q_len_head)
        a_len_head = round(a_new_len/3)
        a_len_tail = -1* (a_new_len -a_len_head) 
        
        q = q[:q_len_head]+q[q_len_tail:]
        a = a[:a_len_head]+a[a_len_tail:]
    
    return t, q, a


def convert_xlnet(title, question, answer, max_sequence_length, tokenizer, 
                  t_max_len_seq=30, q_max_len_seq=239, a_max_len_seq=239):
    """Turn (title, question, answer) triples into fixed-length model inputs.

    Each triple is trimmed with ``_trim_input``, joined as
    ``title [SEP] question [SEP] answer [SEP] [CLS]``, converted to ids with
    ``tokenizer`` and right-padded with 0 to ``max_sequence_length``.

    Args:
        title, question, answer: parallel iterables of strings.
        max_sequence_length: length of every returned row.
        tokenizer: object providing ``convert_tokens_to_ids``. Note that
            ``_trim_input`` tokenizes with the module-level ``tokenizer``.
        t_max_len_seq, q_max_len_seq, a_max_len_seq: token budgets for title,
            question and answer, forwarded to ``_trim_input``.

    Returns:
        Three ``np.ndarray`` objects: token ids, attention masks, segment ids.

    NOTE(review): "[SEP]" / "[CLS]" are not XLNet's own special tokens
    (``<sep>`` / ``<cls>``), and 0 is used as the pad id. Both are kept because
    the fine-tuned checkpoints were presumably trained with this exact
    preprocessing -- confirm before changing.
    """
    all_tokens   = []
    all_masks    = []
    all_segments = []

    for t, q, a in tqdm(zip(title, question, answer)):

        # Forward the caller's budgets; previously these arguments were
        # accepted but ignored, so _trim_input always used its own defaults.
        tokens_t, tokens_q, tokens_a = _trim_input(
            t, q, a,
            max_sequence_length=max_sequence_length,
            t_max_len=t_max_len_seq,
            q_max_len=q_max_len_seq,
            a_max_len=a_max_len_seq)

        stoken = tokens_t + ["[SEP]"] + tokens_q + ["[SEP]"] + tokens_a + ["[SEP]"] + ["[CLS]"]

        # Token ids, right-padded with 0.
        token_ids = tokenizer.convert_tokens_to_ids(stoken)
        input_ids = token_ids + [0] * (max_sequence_length-len(token_ids))

        # 1 for real tokens, 0 for padding.
        attention_masks = _get_masks(stoken, max_sequence_length)

        # 0 for title + question, 1 for the answer part.
        input_segments = _get_segments2(stoken, max_sequence_length)

        all_tokens.append(input_ids)
        all_masks.append(attention_masks)
        all_segments.append(input_segments)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [58]:
# Re-encode the test set for XLNet; this rebinds test_tokens / test_masks /
# test_segments that were previously built for the RoBERTa model.
test_tokens, test_masks, test_segments = convert_xlnet(test_df["question_title"],
                                                       test_df["question_body"], 
                                                       test_df["answer"],
                                                       max_sequence_length=512, 
                                                       tokenizer=tokenizer)

476it [00:01, 410.03it/s]


In [59]:
class QuestModel(nn.Module):
    """XLNet encoder followed by a BiLSTM/BiGRU stack and two output heads.

    Same head architecture as the earlier RoBERTa ``QuestModel`` (which this
    definition replaces in the notebook namespace): two intermediate hidden
    states are concatenated, run through spatial dropout, a bidirectional LSTM
    and a bidirectional GRU, pooled (mean, max, attention) and joined with the
    category/host embeddings and handmade features. ``fc1`` emits 21 values,
    ``fc2`` emits 9.

    NOTE(review): ``num_labels`` is accepted but unused.
    """

    def __init__(self, n_cat, cat_emb, n_host, host_emb, num_labels):
        super().__init__()
        BERT_DIMS = 1024
        LSTM_UNITS = 512

        # The XLNet weights are read from a TensorFlow checkpoint (from_tf=True).
        self.bert_model = transformers.XLNetModel.from_pretrained(
            os.path.join(XLNET_MODEL_PATH), from_tf=True, config=xlent_config)
        # presumably freezes the encoder parameters -- see set_trainable.
        set_trainable(self.bert_model, False)

        # Attribute names and creation order define the checkpoint's
        # state-dict layout; do not reorder or rename.
        self.category_embedding = nn.Embedding(n_cat, cat_emb)
        self.host_embedding     = nn.Embedding(n_host, host_emb)
        self.embedding_dropout  = SpatialDropout(0.5)
        self.lstm1              = nn.LSTM(BERT_DIMS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2              = nn.GRU(LSTM_UNITS*2, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.atten1             = Attention(LSTM_UNITS*2, 512)
        self.atten2             = Attention(LSTM_UNITS*2, 512)
        self.dropout1           = nn.Dropout(0.2)

        # 3 pooled views of the bidirectional GRU output, both embeddings and
        # 8 extra columns (presumably the handmade features -- TODO confirm).
        pooled_width = LSTM_UNITS*6 + int(cat_emb) + int(host_emb) + 8
        self.fc1 = nn.Linear(pooled_width, 21)
        self.fc2 = nn.Linear(pooled_width, 9)

        for layer in (self.category_embedding, self.host_embedding, self.fc1, self.fc2):
            self._init_weights(layer)

    def _init_weights(self, module):
        """Normal(0, 0.02) weights for Linear/Embedding; zero bias for Linear."""
        linear = isinstance(module, nn.Linear)
        if linear or isinstance(module, nn.Embedding):
            print("initailize weight")
            module.weight.data.normal_(mean=0.0, std=0.02)
        if linear and module.bias is not None:
            print("initailize bias")
            module.bias.data.zero_()

    def forward(self, token_ids, masks, segment, hosts, categories, handmades):
        """Return the concatenated outputs of both heads, (batch, 21 + 9)."""
        external_features = torch.cat(
            (self.category_embedding(categories),
             self.host_embedding(hosts),
             handmades),
            1)

        # The encoder is expected to return exactly two items here; only the
        # per-layer hidden states (second item) are used.
        _, hidden_states = self.bert_model(input_ids=token_ids,
                                           attention_mask=masks,
                                           token_type_ids=segment)

        # Concatenate two intermediate layers along the feature axis.
        layer_pair = torch.cat((hidden_states[-5], hidden_states[-6]), 2)
        sequence, _ = self.lstm1(self.embedding_dropout(layer_pair))
        sequence, _ = self.lstm2(sequence)

        mean_pool = torch.mean(sequence, 1)
        max_pool  = torch.max(sequence, 1)[0]

        # Each head has its own attention pooling and its own dropout draw.
        per_head = []
        for attend, project in ((self.atten1, self.fc1), (self.atten2, self.fc2)):
            pooled = torch.cat((mean_pool, max_pool, attend(sequence), external_features), 1)
            per_head.append(project(self.dropout1(pooled)))

        return torch.cat(per_head, 1)

In [60]:
# Collect the XLNet checkpoints from both weight datasets; this rebinds
# `pretrain_weighted`, which previously held the RoBERTa checkpoint paths.
pretrain_weighted_1 = glob('../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-1/*.pt')
pretrain_weighted_2 = glob('../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/*.pt')
pretrain_weighted = pretrain_weighted_1 + pretrain_weighted_2

In [61]:
# Show which XLNet checkpoint files were picked up.
pretrain_weighted

['../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-1/C_1.pt',
 '../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-1/C_2.pt',
 '../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-1/C_0.pt',
 '../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/C_3.pt',
 '../input/large-xlnet-b32-steplr3-changedrop-tb-rmf-h56-2/C_4.pt']

In [62]:
# Rebuild the test dataset / loader around the XLNet-encoded inputs and the
# XLNet handmade feature matrix. A fixed batch size of 4 is used here instead
# of the global `batch_size`.
test_set    = QuestDataset_test(test_tokens, test_masks, test_segments,
                                test_df['host'].values,
                                test_df['category'].values,
                                test_handmade_features)
test_loader = DataLoader(test_set, batch_size=4, shuffle=False)

In [63]:
# Instantiate the XLNet variant of QuestModel (the most recent definition of
# that name) and move it to the GPU; rebinds `model`.
model = QuestModel(n_cat, cat_emb, n_host, host_emb, num_labels=30)
model.cuda()

initailize weight
initailize weight
initailize weight
initailize bias
initailize weight
initailize bias


QuestModel(
  (bert_model): XLNetModel(
    (word_embedding): Embedding(32000, 1024)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=1024, out_features=4096, bias=True)
          (layer_2): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwi

In [64]:
# One prediction array per XLNet checkpoint: load the weights into the shared
# model, predict the test set, and keep the result for averaging.
results = []
for i, weight in tqdm(enumerate(pretrain_weighted)):
    model.load_state_dict(torch.load(weight))
    # batch_size is passed explicitly to match the loader's batch size of 4.
    results.append(predict_result(model, test_loader, batch_size=4))

5it [06:14, 74.98s/it]


In [65]:
# Element-wise mean of the per-checkpoint XLNet predictions, shape (n_test, 30).
output_xlent = np.zeros((len(test_set),30))
for result in results:
    output_xlent += result
output_xlent /= len(results)

In [66]:
# Peek at the averaged 30 XLNet predictions for the first test row.
output_xlent[0]

array([9.56954038e-01, 7.53987896e-01, 2.35867222e-01, 7.33301818e-01,
       6.63828313e-01, 6.98445797e-01, 7.09063935e-01, 6.02101219e-01,
       4.84228837e-01, 3.81849470e-04, 6.31998843e-01, 7.74403799e-01,
       2.44912617e-03, 6.06968559e-02, 7.66801025e-04, 3.93287488e-03,
       3.79134666e-02, 2.21464265e-02, 7.71949387e-01, 1.33851613e-04,
       9.52072585e-01, 9.35963130e-01, 6.14697981e-01, 9.70498788e-01,
       9.80776143e-01, 8.62422705e-01, 1.71736382e-02, 1.02984903e-02,
       9.34757733e-01, 9.32515740e-01])

In [67]:
# Drop the per-checkpoint arrays and the leftover loop variable; the average
# lives on in output_xlent.
del results
del result
gc.collect()

0

## Part4: Ensemble

In [68]:
# Reload the CSVs from disk, rebinding test_df / train_df to freshly read
# frames with the default index.
test_df  = pd.read_csv(os.path.join(DATA_DIR,"test.csv"))
train_df  = pd.read_csv(os.path.join(DATA_DIR,"train.csv"))
# Target column names: every train.csv column from position 11 onwards.
output_categories = list(train_df.columns[11:])

In [69]:
# Index labels of the test rows hosted on english.stackexchange.com or
# ell.stackexchange.com. The submission tricks compare these labels with row
# positions, which matches only while test_df keeps its default 0..n-1 index.
culture_category_list = list(test_df[(test_df['host'] == 'english.stackexchange.com') | (test_df['host'] == 'ell.stackexchange.com')].index)

In [70]:
# Per-target grid denominator used by submssion_trick1 / submssion_trick2:
# predictions for a column are snapped to multiples of 1 / value. Columns with
# 18 and 30 additionally get a lower clamp there (1/3 and 1/5).
# NOTE(review): presumably derived from the distinct label levels seen in the
# training targets -- confirm the origin of these numbers.
num_raters_dict = {
 'question_asker_intent_understanding': 18,
 'question_body_critical': 18,
 'question_conversational': 6,
 'question_expect_short_answer': 6,
 'question_fact_seeking': 6,
 'question_has_commonly_accepted_answer': 6,
 'question_interestingness_others': 18,
 'question_interestingness_self': 18,
 'question_multi_intent': 6,
 'question_not_really_a_question': 6,
 'question_opinion_seeking': 6,
 'question_type_choice': 6,
 'question_type_compare': 6,
 'question_type_consequence': 6,
 'question_type_definition': 6,
 'question_type_entity': 6,
 'question_type_instructions': 6,
 'question_type_procedure': 6,
 'question_type_reason_explanation': 6,
 'question_type_spelling': 3,
 'question_well_written': 18,
 'answer_helpful': 18,
 'answer_level_of_information': 18,
 'answer_plausible': 18,
 'answer_relevance': 18,
 'answer_satisfaction': 30,
 'answer_type_instructions': 6,
 'answer_type_procedure': 6,
 'answer_type_reason_explanation': 6,
 'answer_well_written': 18
 }


In [71]:
def submssion_trick1(output):
    """Snap predictions to each target's rating grid, rounding to nearest.

    For every cell the value is rounded (``np.around``) to the nearest
    multiple of ``1 / num_raters_dict[column]``. Column 19 is forced to 0.0
    for rows that are not listed in ``culture_category_list``. Columns whose
    grid size is 18 or 30 are clamped from below at 1/3 and 1/5 respectively.

    Reads the globals ``output_categories``, ``num_raters_dict`` and
    ``culture_category_list``. Returns a new array shaped like ``output``.
    """
    lower_bounds = {18: 1/3, 30: 1/5}
    snapped = np.zeros_like(output)
    n_rows, n_cols = output.shape[0], output.shape[1]

    for row in range(n_rows):
        for col_idx in range(n_cols):
            raters = num_raters_dict[output_categories[col_idx]]

            if col_idx == 19 and row not in culture_category_list:
                snapped[row][col_idx] = 0.0
            else:
                snapped[row][col_idx] = np.around(output[row][col_idx] * raters) / raters

            bound = lower_bounds.get(raters)
            if bound is not None:
                snapped[row][col_idx] = max(snapped[row][col_idx], bound)

    return snapped

In [72]:
def submssion_trick2(output):
    """Snap predictions to each target's rating grid, rounding down.

    For every cell the value is floored (``np.floor``) to a multiple of
    ``1 / num_raters_dict[column]``. Column 19 is forced to 0.0 for rows that
    are not listed in ``culture_category_list``. Columns whose grid size is 18
    or 30 are clamped from below at 1/3 and 1/5 respectively.

    Reads the globals ``output_categories``, ``num_raters_dict`` and
    ``culture_category_list``. Returns a new array shaped like ``output``.
    """
    lower_bounds = {18: 1/3, 30: 1/5}
    snapped = np.zeros_like(output)
    n_rows, n_cols = output.shape[0], output.shape[1]

    for row in range(n_rows):
        for col_idx in range(n_cols):
            raters = num_raters_dict[output_categories[col_idx]]

            if col_idx == 19 and row not in culture_category_list:
                snapped[row][col_idx] = 0.0
            else:
                snapped[row][col_idx] = np.floor(output[row][col_idx] * raters) / raters

            bound = lower_bounds.get(raters)
            if bound is not None:
                snapped[row][col_idx] = max(snapped[row][col_idx], bound)

    return snapped

In [73]:
# Weighted blend of the four models' predictions. Expanded, the weights are
#   output_base 0.154, output_base2 0.231, output_xlent 0.315, output_large 0.300
# (sum = 1.0).
output = 0.7*(0.55*(output_base*0.4 + output_base2*0.6) + 0.45*output_xlent) + 0.3*output_large

In [74]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every target column of the blend independently with min-max scaling
# (default feature range), overwriting `output` in place. The scaler is re-fit
# on each column, so after the loop it holds the fit for column 29 only.
scaler = MinMaxScaler()
for i in range(30):
    output[:, i] = scaler.fit_transform(output[:, i].reshape(-1, 1)).reshape(-1,)

In [75]:
# Final post-processing: snap the rescaled blend to the rating grid (floor variant).
predictions = submssion_trick2(output)

In [76]:
#predictions1 = submssion_trick1(output)

In [77]:
# Load the sample submission as the template frame that receives the predictions.
submission = pd.read_csv("/kaggle/input/google-quest-challenge/sample_submission.csv")
#best_submission_right_now =  pd.read_csv("../input/quest-test/submission.csv")
#best_submission_right_now =  pd.read_csv("../input/quest-best-single-model-2/submission.csv")

In [78]:
#best_score = best_submission_right_now.iloc[: ,1:].values

In [79]:
#for i in range(30):
   #print(submission.columns[i+1], len(np.unique(best_score[:, i])))

In [80]:
#from collections import Counter

In [81]:
#for i in range(30):
    #print(submission.columns[i+1])
    #print(np.unique(predictions1[:, i]))
    #print(Counter(predictions1[:, i]))
    #print(np.unique(predictions[:, i]))
    #print(Counter(predictions[:, i]))
    #print()

In [82]:
## Submission Checker
# Report every target column whose predictions collapsed to a single value,
# printing the column name (offset by 1 to skip the first submission column)
# and that value. Prints nothing when all 30 columns have more than one value.
for i in range(30): 
    if len(np.unique(predictions[:, i])) == 1:
        print(submission.columns[i+1])
        print(np.unique(predictions[:, i]))

In [83]:
#Submission Checker2
##Please Comment Out When you would like to submit.!!!!!
#total_score = 0
#for i in range(30):
    #score = spearmanr(predictions[:, i], best_score[:, i]).correlation
    #total_score += score
    #print(submission.columns[i+1], score)
    
#print(total_score / 30)

In [84]:
# Overwrite every column from 'question_asker_intent_understanding' to the last
# one with the prediction matrix (column order must match `predictions`).
submission.loc[:, 'question_asker_intent_understanding':] = predictions

In [85]:
# Write the submission file without the index column, then preview the first rows.
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.833333,0.666667,0.166667,0.5,0.5,0.5,0.777778,0.611111,0.5,...,0.888889,0.666667,0.333333,0.722222,0.722222,0.533333,0.0,0.0,0.833333,0.722222
1,46,0.5,0.333333,0.0,0.666667,0.666667,0.833333,0.333333,0.333333,0.166667,...,0.333333,0.833333,0.388889,0.833333,0.833333,0.7,0.833333,0.333333,0.0,0.333333
2,70,0.666667,0.611111,0.0,0.666667,0.833333,0.833333,0.388889,0.333333,0.0,...,0.777778,0.555556,0.333333,0.611111,0.555556,0.466667,0.0,0.166667,0.833333,0.444444
3,132,0.611111,0.333333,0.0,0.666667,0.5,0.833333,0.333333,0.333333,0.0,...,0.333333,0.833333,0.555556,0.833333,0.888889,0.766667,0.833333,0.333333,0.666667,0.555556
4,200,0.722222,0.333333,0.0,0.833333,0.5,0.833333,0.611111,0.611111,0.0,...,0.333333,0.555556,0.388889,0.722222,0.611111,0.5,0.166667,0.333333,0.5,0.555556
5,245,0.777778,0.888889,0.0,0.5,0.833333,0.833333,0.5,0.333333,0.166667,...,0.888889,0.777778,0.388889,0.722222,0.722222,0.733333,0.0,0.5,0.833333,0.555556
6,257,0.611111,0.333333,0.0,0.666667,0.5,0.833333,0.333333,0.333333,0.0,...,0.444444,0.888889,0.611111,0.888889,0.944444,0.833333,0.833333,0.333333,0.5,0.777778
7,267,0.888889,0.777778,0.333333,0.666667,0.5,0.666667,0.722222,0.722222,0.0,...,0.833333,0.722222,0.611111,0.833333,0.833333,0.7,0.0,0.0,0.833333,0.777778
8,284,0.555556,0.333333,0.0,0.666667,0.666667,0.833333,0.333333,0.333333,0.333333,...,0.5,0.888889,0.555556,0.888889,0.944444,0.8,0.833333,0.333333,0.333333,0.722222
9,292,0.833333,0.5,0.0,0.833333,0.833333,0.833333,0.666667,0.5,0.0,...,0.666667,0.833333,0.444444,0.888889,0.888889,0.766667,0.333333,0.166667,0.833333,0.777778


In [86]:
import pandas as pd
submission = pd.read_csv("../input/quest-best-single-model-2/submission.csv")