In [89]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, Trainer, EvalPrediction
import os
#%pip install scipy
import scipy
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm as tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import spearmanr

# Output sizes of the two prediction heads (see Head2): 21 values from the
# question head and 9 from the answer head; N_TARGETS is their total (30).
N_Q_TARGETS = 21
N_A_TARGETS = 9
N_TARGETS = N_Q_TARGETS + N_A_TARGETS

In [90]:
# Data import: read the train/test CSVs from the local `data` directory and
# replace every missing value with a single-space string (presumably so the
# text columns never contain NaN when they are tokenized later).
train_df = pd.read_csv('data/train.csv').fillna(' ')
test_df = pd.read_csv('data/test.csv').fillna(' ')
#display (train_df)
# Print the contents of the data directory as a quick sanity check.
print (os.listdir('data'))
# Disabled alternative: loading the same files through the `datasets` API.
#dataset = Dataset.from_pandas(train_df)
#data_files = {"train": "train.csv", "test": "test.csv"}
#dataset = load_dataset("data", data_files=data_files)
#print (dataset)

['test.csv', 'train.csv']


In [91]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, DefaultDataCollator, XLNetConfig, XLNetLMHeadModel, XLNetModel
# Model identifier. Alternatives noted by the author:
# 'xlnet-large-cased', 'tiny-xlnet-base-cased', 'jkgrad/xlnet-base-squadv2'.
# NOTE(review): in the code visible here, get_xlnet_preds hard-codes its own
# tokenizer/model names and does not read this variable — confirm it is still needed.
model_name = 'xlnet-base-cased'


'data_collator = DefaultDataCollator()\ntokenizer = XLNetTokenizer.from_pretrained(model_name)\nconfig = XLNetConfig.from_pretrained(\n    "roberta-base",\n    num_labels=1,\n    id2label={ 0: "👎", 1: "👍"},\n)\nmodel = XLNetLMHeadModel.from_pretrained(\n    "roberta-base",\n    config=config,\n)'

In [92]:
class TextDataset5(Dataset):
    """Dataset pairing extra features with question/answer token and segment ids.

    Every input array is indexed with ``idxs`` and cast once at construction
    time: id arrays to ``int64``, features and targets to ``float32``.  When no
    ``targets`` are supplied, an all-zero ``(n_rows, N_TARGETS)`` array is used
    so that items always have the same ``(inputs, target)`` structure.
    """

    def __init__(self, x_features, question_ids, answer_ids, seg_question_ids,
                 seg_answer_ids, idxs, targets=None):
        def take(array, dtype):
            # Select the requested rows and normalise the dtype in one step.
            return array[idxs].astype(dtype)

        self.question_ids = take(question_ids, np.int64)
        self.answer_ids = take(answer_ids, np.int64)
        self.seg_question_ids = take(seg_question_ids, np.int64)
        self.seg_answer_ids = take(seg_answer_ids, np.int64)
        self.x_features = take(x_features, np.float32)
        if targets is None:
            # Placeholder targets (e.g. for inference, where labels are unknown).
            n_rows = self.x_features.shape[0]
            self.targets = np.zeros((n_rows, N_TARGETS), dtype=np.float32)
        else:
            self.targets = take(targets, np.float32)

    def __getitem__(self, idx):
        """Return ``((x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids), target)`` for row ``idx``."""
        inputs = (
            self.x_features[idx],
            self.question_ids[idx],
            self.answer_ids[idx],
            self.seg_question_ids[idx],
            self.seg_answer_ids[idx],
        )
        return inputs, self.targets[idx]

    def __len__(self):
        """Number of selected rows."""
        return len(self.x_features)
#features
def get_categorical_features(train, test, feature):
    """Encode a categorical column of ``train`` and ``test`` as 1-based integer ids.

    The vocabulary is the union of the values seen in both frames.  Ids are
    assigned in sorted order so that the mapping is identical on every run;
    iterating a plain ``set`` (as was done previously) can yield a different
    order in each interpreter session, which silently permutes any one-hot
    columns derived from these ids.

    NOTE(review): checkpoints trained with a different id assignment would see
    permuted category features — confirm the training pipeline used the same
    ordering.

    Args:
        train: DataFrame containing column ``feature``.
        test: DataFrame containing column ``feature``.
        feature: Name of the categorical column to encode.

    Returns:
        Tuple ``(train_feat, test_feat, feat_dict, feat_dict_reverse)`` where
        the first two are arrays of ids aligned with the rows of ``train`` /
        ``test``, ``feat_dict`` maps id -> value and ``feat_dict_reverse`` maps
        value -> id.
    """
    seen = set(train[feature].unique().tolist() + test[feature].unique().tolist())
    # Sort on (type name, string form) so mixed-type columns still order
    # deterministically instead of raising on incomparable values.
    unique_vals = sorted(seen, key=lambda v: (type(v).__name__, str(v)))
    feat_dict = dict(enumerate(unique_vals, start=1))
    feat_dict_reverse = {value: idx for idx, value in feat_dict.items()}

    train_feat = train[feature].map(feat_dict_reverse).values
    test_feat = test[feature].map(feat_dict_reverse).values

    return train_feat, test_feat, feat_dict, feat_dict_reverse

In [94]:
def to_cpu(x):
    """Return a contiguous, graph-detached copy/view of tensor ``x`` on the CPU."""
    detached = x.contiguous().detach()
    return detached.cpu()


def to_numpy(x):
    """Convert tensor ``x`` to a NumPy array after passing it through ``to_cpu``."""
    cpu_tensor = to_cpu(x)
    return cpu_tensor.numpy()


def to_device(xs, device):
    """Move one tensor, or a tuple/list of tensors, to ``device``.

    Always returns a list, even when a single tensor is passed in.
    """
    if not isinstance(xs, (tuple, list)):
        # Wrap a lone tensor so both cases share the same code path.
        xs = [xs]
    return [item.to(device) for item in xs]
    
def infer_batch(inputs, model, device, to_numpy=True):
    """Run ``model`` on one batch and return sigmoid-activated predictions.

    Args:
        inputs: A tensor or a tuple/list of tensors; they are moved to
            ``device`` and passed to ``model`` as positional arguments.
        model: Callable returning raw (pre-sigmoid) outputs.
        device: Device the inputs are moved to before the forward pass.
        to_numpy: When true (default) the result is returned as a ``float32``
            NumPy array; otherwise the tensor is returned as produced.  Note
            that inside this function the parameter shadows the module-level
            ``to_numpy`` helper.

    Returns:
        Predictions after ``torch.sigmoid``, as an array or tensor.
    """
    inputs = to_device(inputs, device)
    # The inputs are not copied back to the host afterwards: that copy was
    # discarded immediately and only cost a device-to-host transfer per batch.
    preds = torch.sigmoid(model(*inputs))
    if to_numpy:
        preds = preds.cpu().detach().numpy().astype(np.float32)
    return preds


def infer(model, loader, checkpoint_file=None, device=torch.device('cuda')):
    """Predict every sample of ``loader`` and return an ``(n_obs, N_TARGETS)`` array.

    Args:
        model: Module to evaluate.  It is cast to float32, moved to ``device``
            and put in eval mode (all in place).
        loader: DataLoader yielding ``(inputs, target)`` batches; it must not
            shuffle if the row order of the result matters.
        checkpoint_file: Optional path to a checkpoint containing a
            ``'model_state_dict'`` entry that is loaded into ``model`` first.
        device: Device used for the model and the batches.

    Returns:
        NumPy array of sigmoid predictions, one row per dataset sample, in the
        order the loader produced them.
    """
    n_obs = len(loader.dataset)
    predictions = np.zeros((n_obs, N_TARGETS))

    if checkpoint_file is not None:
        print(f'Starting inference for model: {checkpoint_file}')
        # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
        # map_location lets a checkpoint saved on one device (e.g. a GPU) be
        # restored onto whichever device inference actually runs on.
        checkpoint = torch.load(checkpoint_file, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
    model.float()
    model.to(device)
    model.eval()

    # Track the write position from the actual batch lengths rather than
    # i * loader.batch_size, which is None when a batch_sampler is used.
    offset = 0
    with torch.no_grad():
        for inputs, _ in tqdm(loader):
            batch_preds = infer_batch(inputs, model, device)
            next_offset = offset + len(batch_preds)
            predictions[offset:next_offset, :] = batch_preds
            offset = next_offset

    return predictions
def init_seed(seed=100):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.

    Side effects:
        Sets ``os.environ['PYTHONHASHSEED']`` and the global cuDNN flags.

    NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    here is only expected to influence child processes, not string hashing in
    the current process — confirm if hash-order determinism is required.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run; disable it
    # explicitly in case it was switched on elsewhere in the session.
    torch.backends.cudnn.benchmark = False


class GELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        gate = torch.sigmoid(x * 1.702)
        return gate * x


def lin_layer(n_in, n_out, dropout):
    """Build a ``Linear(n_in, n_out) -> GELU -> Dropout(dropout)`` block."""
    stages = [nn.Linear(n_in, n_out), GELU(), nn.Dropout(dropout)]
    return nn.Sequential(*stages)

class Head2(nn.Module):
    """Prediction head combining extra features with question/answer encodings.

    Three hidden branches of width ``n_h`` are computed: one from the features
    plus both encodings (joint), one from features plus the question encoding
    and one from features plus the answer encoding.  The question outputs
    (``N_Q_TARGETS``) use the joint and question branches; the answer outputs
    (``N_A_TARGETS``) use the joint and answer branches.  The result is the
    question outputs followed by the answer outputs along dim 1.
    """

    def __init__(self, n_h=512, n_feats=74, n_bert=768, dropout=0.2):
        super().__init__()
        single_in = n_feats + n_bert   # features + one encoding
        joint_in = n_feats + 2 * n_bert  # features + both encodings
        # Submodules are created in this order on purpose: it fixes both the
        # parameter-initialisation order and the state_dict key order.
        self.lin = lin_layer(n_in=joint_in, n_out=n_h, dropout=dropout)
        self.lin_q = lin_layer(n_in=single_in, n_out=n_h, dropout=dropout)
        self.lin_a = lin_layer(n_in=single_in, n_out=n_h, dropout=dropout)
        self.head_q = nn.Linear(2 * n_h, N_Q_TARGETS)
        self.head_a = nn.Linear(2 * n_h, N_A_TARGETS)

    def forward(self, x_feats, x_q_bert, x_a_bert):
        """Return raw outputs: question targets first, then answer targets (dim 1)."""
        q_hidden = self.lin_q(torch.cat((x_feats, x_q_bert), dim=1))
        a_hidden = self.lin_a(torch.cat((x_feats, x_a_bert), dim=1))
        joint_hidden = self.lin(torch.cat((x_feats, x_q_bert, x_a_bert), dim=1))
        q_out = self.head_q(torch.cat((joint_hidden, q_hidden), dim=1))
        a_out = self.head_a(torch.cat((joint_hidden, a_hidden), dim=1))
        return torch.cat((q_out, a_out), dim=1)


In [95]:
class AvgPooledXLNet(XLNetModel):
    """XLNetModel whose forward returns a masked mean over the sequence axis.

    NOTE(review): the mask marks positions with ``ids > 0`` as real tokens,
    i.e. it treats id 0 as padding.  Verify this against the tokenizer's pad
    token id (the stock XLNet vocabulary is believed to use a non-zero pad
    id).  Do not change it without retraining: the saved checkpoints were
    presumably trained with exactly this mask.
    """

    def forward(self, ids, seg_ids=None):
        mask = (ids > 0).float()
        # First element of the base model's output is the per-token hidden states.
        hidden = super().forward(ids, mask, token_type_ids=seg_ids)[0]
        weights = mask.unsqueeze(-1)
        summed = (hidden * weights).sum(dim=1)
        counts = weights.sum(dim=1)
        return summed / counts
class CustomXLNet(nn.Module):
    """Siamese model: one shared XLNet encoder for question and answer, plus Head2.

    The encoder is built from an explicit config (not from pretrained
    weights), so its parameters are expected to come from a checkpoint loaded
    afterwards.
    """

    def __init__(self, n_h, n_feats, head_dropout=0.2):
        super().__init__()
        # Dimensions given explicitly instead of reading a config.json file.
        xlnet_config = XLNetConfig(d_inner=3072, d_model=768, n_head=12, n_layer=12)
        self.xlnet = AvgPooledXLNet(xlnet_config)
        self.head = Head2(n_h, n_feats, n_bert=768, dropout=head_dropout)

    def forward(self, x_feats, q_ids, a_ids, seg_q_ids=None, seg_a_ids=None):
        """Encode question and answer with the shared encoder and apply the head."""
        pooled_q = self.xlnet(q_ids, seg_q_ids)
        pooled_a = self.xlnet(a_ids, seg_a_ids)
        return self.head(x_feats, pooled_q, pooled_a)
def _encode_text_pairs(df, cols, tokenizer, max_seq_len):
    """Tokenize each text pair in ``df[cols]`` into fixed-length id / segment-id arrays."""
    ids, seg_ids = [], []
    for x1, x2 in tqdm(df[cols].values):
        encoded_inputs = tokenizer.encode_plus(
            x1, x2, add_special_tokens=True, max_length=max_seq_len,
            truncation=True, padding='max_length', return_token_type_ids=True
        )
        ids.append(encoded_inputs['input_ids'])
        seg_ids.append(encoded_inputs['token_type_ids'])
    return np.array(ids), np.array(seg_ids)


def get_preds(train, test, ModelClass, tokenizer, model_name, checkpoint_dir, folds):
    """Return fold-averaged test predictions of shape ``(len(test), N_TARGETS)``.

    Steps: tokenize the (title, body) and (title, answer) pairs of ``test``,
    one-hot encode the ``category`` column (encoder fitted on ``train`` only;
    categories unseen there encode as all zeros), then run every fold
    checkpoint over the test set and average the results.

    Args:
        train: Training DataFrame; only its ``category`` column is used.
        test: Test DataFrame with ``question_title``, ``question_body``,
            ``answer`` and ``category`` columns.
        ModelClass: Callable ``ModelClass(n_h, n_feats)`` building the model.
        tokenizer: Tokenizer providing ``encode_plus``.
        model_name: Prefix of the checkpoint file names.
        checkpoint_dir: Directory that holds the checkpoints.
        folds: Iterable of 0-based fold ids; fold ``k`` is read from
            ``{model_name}_fold_{k + 1}_best.pth``.

    Returns:
        NumPy array with the mean prediction over ``folds``.
    """
    max_seq_len = 512
    text_columns = {
        'question': ['question_title', 'question_body'],
        'answer': ['question_title', 'answer'],
    }
    ids_test, seg_ids_test = {}, {}
    for text, cols in text_columns.items():
        ids_test[text], seg_ids_test[text] = _encode_text_pairs(
            test, cols, tokenizer, max_seq_len)

    train_category, test_category, _, _ = get_categorical_features(train, test, 'category')
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(train_category.reshape(-1, 1))
    cat_features_test = ohe.transform(test_category.reshape(-1, 1)).toarray()

    num_workers = 8
    bs_test = 2
    # Use the GPU when present, otherwise fall back to the CPU instead of failing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The arrays above are positional, so select rows by position; using
    # test.index would pick wrong rows (or raise) for a non-default index.
    row_positions = np.arange(len(test))
    test_loader = DataLoader(
        TextDataset5(cat_features_test, ids_test['question'], ids_test['answer'],
                     seg_ids_test['question'], seg_ids_test['answer'], row_positions),
        batch_size=bs_test, shuffle=False, num_workers=num_workers
    )

    init_seed()
    preds = np.zeros((len(test), N_TARGETS))
    for fold_id in folds:
        checkpoint_file = os.path.join(
            checkpoint_dir, f'{model_name}_fold_{fold_id + 1}_best.pth')
        # 256 is the head's hidden width; presumably it must match the value
        # used when the checkpoints were trained — TODO confirm.
        model = ModelClass(256, cat_features_test.shape[1]).to(device)
        preds += infer(model, test_loader, checkpoint_file, device) / len(folds)
        # Drop the reference before building the next fold's model so two
        # models are never alive at the same time.
        del model

    return preds

def get_xlnet_preds(train, test):
    """Run the siamese XLNet checkpoints on ``test`` and return averaged predictions.

    Uses the 'xlnet-base-cased' tokenizer and the 'siamese_xlnet_1_comb'
    checkpoints under 'xlnet-model/'.
    """
    # 0-based fold ids to ensemble; ids 3 and 6 are not in the list (the
    # reason is not visible in this file).
    fold_ids = [0, 1, 2, 4, 5, 7, 8]
    xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    return get_preds(
        train, test, CustomXLNet, xlnet_tokenizer,
        model_name='siamese_xlnet_1_comb',
        checkpoint_dir='xlnet-model/',
        folds=fold_ids,
    )

In [96]:
#torch_dict = torch.load('xlnet-model/siamese_xlnet_1_comb_fold_1_best.pth')
#print (torch_dict['model_state_dict'].keys())
#trainer

odict_keys(['xlnet.mask_emb', 'xlnet.word_embedding.weight', 'xlnet.layer.0.rel_attn.q', 'xlnet.layer.0.rel_attn.k', 'xlnet.layer.0.rel_attn.v', 'xlnet.layer.0.rel_attn.o', 'xlnet.layer.0.rel_attn.r', 'xlnet.layer.0.rel_attn.r_r_bias', 'xlnet.layer.0.rel_attn.r_s_bias', 'xlnet.layer.0.rel_attn.r_w_bias', 'xlnet.layer.0.rel_attn.seg_embed', 'xlnet.layer.0.rel_attn.layer_norm.weight', 'xlnet.layer.0.rel_attn.layer_norm.bias', 'xlnet.layer.0.ff.layer_norm.weight', 'xlnet.layer.0.ff.layer_norm.bias', 'xlnet.layer.0.ff.layer_1.weight', 'xlnet.layer.0.ff.layer_1.bias', 'xlnet.layer.0.ff.layer_2.weight', 'xlnet.layer.0.ff.layer_2.bias', 'xlnet.layer.1.rel_attn.q', 'xlnet.layer.1.rel_attn.k', 'xlnet.layer.1.rel_attn.v', 'xlnet.layer.1.rel_attn.o', 'xlnet.layer.1.rel_attn.r', 'xlnet.layer.1.rel_attn.r_r_bias', 'xlnet.layer.1.rel_attn.r_s_bias', 'xlnet.layer.1.rel_attn.r_w_bias', 'xlnet.layer.1.rel_attn.seg_embed', 'xlnet.layer.1.rel_attn.layer_norm.weight', 'xlnet.layer.1.rel_attn.layer_norm.bi

In [97]:
import transformers
# Restrict transformers log output to error level to keep the cell output short.
transformers.logging.set_verbosity_error()
# Tokenize the test set and average the predictions of the fold checkpoints;
# the result is an array of shape (len(test_df), N_TARGETS).
xlnet_pred = get_xlnet_preds(train_df, test_df)

  0%|          | 0/476 [00:00<?, ?it/s]

  0%|          | 0/476 [00:00<?, ?it/s]

Starting inference for model: xlnet-model/siamese_xlnet_1_comb_fold_1_best.pth


  0%|          | 0/238 [00:00<?, ?it/s]