In [24]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np

from tqdm import tqdm
import torch.nn as nn
import joblib

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
import sys

https://www.kaggle.com/abhishek/bert-inference-of-tpu-model/notebook?select=model.bin

In [25]:
class BERTBaseUncased(nn.Module):
    def __init__(self, bert_path):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 30)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        _, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)

        bo = self.bert_drop(o2)
        p2 = self.out(bo)
        return p2

In [26]:
class BERTDatasetTest:
    def __init__(self, qtitle, qbody, answer, tokenizer, max_length):
        self.qtitle = qtitle
        self.qbody = qbody
        self.answer = answer
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.answer)

    def __getitem__(self, item):
        question_title = str(self.qtitle[item])
        question_body = str(self.qbody[item])
        answer_text = str(self.answer[item])

        question_title = " ".join(question_title.split())
        question_body = " ".join(question_body.split())
        answer_text = " ".join(answer_text.split())

        inputs = self.tokenizer.encode_plus(
            question_title + " " + question_body,
            answer_text,
            add_special_tokens=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [27]:
def predict():
    DEVICE = torch.device("cuda")
    TEST_BATCH_SIZE = 8
    TEST_DATASET = "input/google-quest-challenge/test.csv"
    df = pd.read_csv(TEST_DATASET).fillna("none")

    qtitle = df.question_title.values.astype(str).tolist()
    qbody = df.question_body.values.astype(str).tolist()
    answer = df.answer.values.astype(str).tolist()
    category = df.category.values.astype(str).tolist()

    tokenizer = transformers.BertTokenizer.from_pretrained("input/bert-base-uncased/", 
                                                           do_lower_case=True)
    maxlen = 512
    
    predictions = []

    test_dataset = BERTDatasetTest(
        qtitle=qtitle,
        qbody=qbody,
        answer=answer,
        tokenizer=tokenizer,
        max_length=maxlen
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=TEST_BATCH_SIZE,
        shuffle=False,
        num_workers=4
    )

    model = BERTBaseUncased("input/bert-base-uncased/")
    model.to(DEVICE)
    model.load_state_dict(torch.load("input/tpubert/model.bin"))
    model.eval()

    tk0 = tqdm(test_data_loader, total=int(len(test_dataset) / test_data_loader.batch_size))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(DEVICE, dtype=torch.long)
        mask = mask.to(DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
        
        with torch.no_grad():
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            outputs = torch.sigmoid(outputs).cpu().numpy()
            predictions.append(outputs)

    return np.vstack(predictions)

In [28]:
preds = predict()

  0%|          | 0/59 [00:00<?, ?it/s]Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strate

In [30]:
SAMPLE_SUBMISSION = "input/google-quest-challenge/sample_submission.csv"
sample = pd.read_csv(SAMPLE_SUBMISSION)
target_cols = list(sample.drop("qa_id", axis=1).columns)

In [31]:
sample[target_cols] = preds

In [32]:
sample.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.923931,0.653783,0.152824,0.721629,0.573089,0.499188,0.661548,0.632832,0.437972,...,0.899432,0.947756,0.722048,0.974332,0.986972,0.915564,0.128217,0.040347,0.863238,0.95955
1,46,0.89083,0.599214,0.005323,0.786957,0.901266,0.938664,0.489424,0.372734,0.032964,...,0.714049,0.945723,0.570837,0.960279,0.977561,0.883028,0.953711,0.102567,0.102702,0.920847
2,70,0.919508,0.681343,0.025285,0.820728,0.929896,0.937346,0.623493,0.476292,0.248541,...,0.901854,0.950572,0.651936,0.976627,0.987081,0.917535,0.076725,0.034214,0.912564,0.945181
3,132,0.888797,0.438063,0.005873,0.697785,0.845358,0.951296,0.458818,0.365853,0.083867,...,0.669545,0.957263,0.711225,0.985153,0.992892,0.91467,0.864915,0.095046,0.680777,0.925022
4,200,0.927154,0.672999,0.050325,0.860924,0.788344,0.825573,0.594543,0.552498,0.101299,...,0.843456,0.948001,0.650135,0.977154,0.988778,0.927008,0.180683,0.01901,0.759257,0.959902


In [33]:
sample.shape
#31 columns, 1 for qa_id, 30 are scores (targets)

(476, 31)

In [None]:
sample.to_csv("submission.csv", index=False)