In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, Trainer, EvalPrediction
import os
#%pip install scipy
import scipy
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm as tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import spearmanr
from transformers import *

# Number of label columns: all targets, the question-level ones, and the
# answer-level ones. TARGETS below lists them in that order (questions first).
N_TARGETS = 30
N_Q_TARGETS = 21
N_A_TARGETS = 9

# Label column names. The first list holds the question_* targets, the second
# the answer_* targets; the concatenation order matters because the model head
# emits question logits followed by answer logits.
TARGETS = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
] + [
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]



In [2]:
# Data import: read the train/test CSVs from the local data/ directory and
# replace every missing value with a single space.
train_df = pd.read_csv('data/train.csv')
train_df = train_df.fillna(' ')
test_df = pd.read_csv('data/test.csv')
test_df = test_df.fillna(' ')

# Show which files are present in data/.
print(os.listdir('data'))

['sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, DefaultDataCollator, XLNetConfig, XLNetLMHeadModel, XLNetModel
# Model identifier selected for this notebook; the trailing comment lists the
# alternatives the author noted. NOTE(review): the functions visible below
# hard-code 'xlnet-base-cased' or take their own `model_name` argument, so this
# global does not appear to be read by them - confirm before relying on it.
model_name = 'xlnet-base-cased' # 'xlnet-large-cased', 'tiny-xlnet-base-cased', 'jkgrad/xlnet-base-squadv2'


In [4]:
class TextDataset5(Dataset):
    """Dataset yielding ``((x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids), target)``.

    Every input array is indexed with ``idxs`` once, at construction time.
    Token-id and segment-id arrays are cast to int64, the extra features and
    the targets to float32. When ``targets`` is omitted, an all-zero float32
    array with ``N_TARGETS`` columns is used as a placeholder.
    """

    def __init__(self, x_features, question_ids, answer_ids, seg_question_ids, 
                    seg_answer_ids, idxs, targets=None):
        def pick(array, dtype):
            # Select the requested rows, then cast to the dtype used downstream.
            return array[idxs].astype(dtype)

        self.question_ids = pick(question_ids, np.int64)
        self.answer_ids = pick(answer_ids, np.int64)
        self.seg_question_ids = pick(seg_question_ids, np.int64)
        self.seg_answer_ids = pick(seg_answer_ids, np.int64)
        self.x_features = pick(x_features, np.float32)

        if targets is None:
            n_rows = self.x_features.shape[0]
            self.targets = np.zeros((n_rows, N_TARGETS), dtype=np.float32)
        else:
            self.targets = pick(targets, np.float32)

    def __getitem__(self, idx):
        model_inputs = (
            self.x_features[idx],
            self.question_ids[idx],
            self.answer_ids[idx],
            self.seg_question_ids[idx],
            self.seg_answer_ids[idx],
        )
        return model_inputs, self.targets[idx]

    def __len__(self):
        return len(self.x_features)
#features
def get_categorical_features(train, test, feature):
    """Encode a categorical column of ``train`` and ``test`` as integer codes.

    The vocabulary is the union of the values seen in both frames. Codes start
    at 1 and are assigned in sorted order of the values, so the mapping is the
    same on every run. (Iterating a plain ``set`` of strings can yield a
    different order in each interpreter session, which made codes - and any
    one-hot layout built from them - differ between a training run and a later
    inference run.)

    Args:
        train: DataFrame containing ``feature``.
        test: DataFrame containing ``feature``.
        feature: name of the column to encode.

    Returns:
        Tuple ``(train_feat, test_feat, feat_dict, feat_dict_reverse)`` where
        the first two are arrays of integer codes, ``feat_dict`` maps
        code -> value and ``feat_dict_reverse`` maps value -> code.
    """
    observed = set(train[feature].unique().tolist()) | set(test[feature].unique().tolist())
    # key=str keeps the sort well-defined even if the column mixes value types.
    unique_vals = sorted(observed, key=str)

    feat_dict = {code: value for code, value in enumerate(unique_vals, start=1)}
    feat_dict_reverse = {value: code for code, value in feat_dict.items()}

    train_feat = train[feature].map(feat_dict_reverse).values
    test_feat = test[feature].map(feat_dict_reverse).values

    return train_feat, test_feat, feat_dict, feat_dict_reverse

In [5]:
def to_cpu(x):
    """Return a contiguous, detached copy of tensor ``x`` on the CPU."""
    dense = x.contiguous()
    no_grad = dense.detach()
    return no_grad.cpu()


def to_numpy(x):
    """Return tensor ``x`` as a NumPy array (contiguous, detached, on CPU)."""
    host_tensor = x.contiguous().detach().cpu()
    return host_tensor.numpy()


def to_device(xs, device):
    """Move one tensor, or a tuple/list of tensors, to ``device``.

    Always returns a list, even when a single tensor is passed in.
    """
    if not isinstance(xs, (tuple, list)):
        xs = (xs,)
    return [tensor.to(device) for tensor in xs]
    
def infer_batch(inputs, model, device, to_numpy=True):
    """Run ``model`` on one batch and return sigmoid-activated predictions.

    Args:
        inputs: a tensor or a tuple/list of tensors; they are moved to
            ``device`` and passed to ``model`` as positional arguments.
        model: callable returning logits.
        device: device the inputs are moved to.
        to_numpy: when true (default) the result is returned as a float32
            NumPy array on the host, otherwise as a tensor on ``device``.
            (Inside this function the name shadows the module-level
            ``to_numpy`` helper; it is kept because callers may pass it by
            keyword.)

    Returns:
        Predictions in [0, 1] with the same shape as the model output.
    """
    inputs = to_device(inputs, device)
    # The inputs are not needed after the forward pass, so they are not copied
    # back to the host (the previous version did that and discarded the copy).
    preds = torch.sigmoid(model(*inputs))
    if to_numpy:
        preds = preds.detach().cpu().numpy().astype(np.float32)
    return preds


def infer(model, loader, checkpoint_file=None, device=torch.device('cuda')):
    """Predict every sample served by ``loader`` and return an array of scores.

    Args:
        model: module to evaluate; it is cast to float32, moved to ``device``
            and switched to eval mode (and left in that state).
        loader: DataLoader yielding ``(inputs, target)`` batches in a fixed
            order (i.e. not shuffled); targets are ignored.
        checkpoint_file: optional path to a checkpoint whose
            ``'model_state_dict'`` entry is loaded into ``model`` first.
        device: device used for inference.

    Returns:
        Array of shape ``(len(loader.dataset), N_TARGETS)`` with sigmoid
        predictions, rows in loader order.
    """
    n_obs = len(loader.dataset)
    predictions = np.zeros((n_obs, N_TARGETS))

    if checkpoint_file is not None:
        print(f'Starting inference for model: {checkpoint_file}')
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        # map_location='cpu' lets checkpoints saved on a GPU be restored on a
        # machine without one; the weights are moved to `device` below.
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        model.load_state_dict(checkpoint['model_state_dict'])
    model.float()
    model.to(device)
    model.eval()

    # Rows are placed by a running offset based on the actual size of each
    # batch, so this also works when loader.batch_size is None (batch sampler).
    offset = 0
    with torch.no_grad():
        for inputs, _ in tqdm(loader):
            batch_preds = infer_batch(inputs, model, device)
            next_offset = offset + len(batch_preds)
            predictions[offset:next_offset, :] = batch_preds
            offset = next_offset

    return predictions
def init_seed(seed=100):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic algorithms.
    NOTE(review): per the Python docs PYTHONHASHSEED is read at interpreter
    start-up, so setting it here should only affect processes launched
    afterwards, not hashing in the current kernel - confirm if that matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


class GELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        gate = torch.sigmoid(1.702 * x)
        return x * gate


def lin_layer(n_in, n_out, dropout):
    """Build a ``Linear -> GELU -> Dropout`` stack as an ``nn.Sequential``.

    The sub-modules are passed positionally, so their state-dict keys stay
    index based ("0" for the linear layer).
    """
    stages = [nn.Linear(n_in, n_out), GELU(), nn.Dropout(dropout)]
    return nn.Sequential(*stages)

class Head2(nn.Module):
    """Prediction head combining extra features with question/answer embeddings.

    Three hidden projections are computed: one from (features, question), one
    from (features, answer) and one from (features, question, answer). The
    question logits use the joint and question projections, the answer logits
    the joint and answer projections.

    Args:
        n_h: width of each hidden projection.
        n_feats: number of extra (non-text) features per sample.
        n_bert: size of each pooled text embedding.
        dropout: dropout probability used in the hidden projections.
    """

    def __init__(self, n_h=512, n_feats=74, n_bert=768, dropout=0.2):
        super().__init__()
        branch_in = n_feats + n_bert      # features + one text embedding
        joint_in = n_feats + 2 * n_bert   # features + both text embeddings
        # Creation order is kept as-is so seeded initialisation is unchanged.
        self.lin = lin_layer(n_in=joint_in, n_out=n_h, dropout=dropout)
        self.lin_q = lin_layer(n_in=branch_in, n_out=n_h, dropout=dropout)
        self.lin_a = lin_layer(n_in=branch_in, n_out=n_h, dropout=dropout)
        head_in = 2 * n_h
        self.head_q = nn.Linear(head_in, N_Q_TARGETS)
        self.head_a = nn.Linear(head_in, N_A_TARGETS)

    def forward(self, x_feats, x_q_bert, x_a_bert):
        """Return raw logits: question targets first, then answer targets."""
        q_hidden = self.lin_q(torch.cat((x_feats, x_q_bert), dim=1))
        a_hidden = self.lin_a(torch.cat((x_feats, x_a_bert), dim=1))
        joint_hidden = self.lin(torch.cat((x_feats, x_q_bert, x_a_bert), dim=1))
        q_logits = self.head_q(torch.cat((joint_hidden, q_hidden), dim=1))
        a_logits = self.head_a(torch.cat((joint_hidden, a_hidden), dim=1))
        return torch.cat((q_logits, a_logits), dim=1)


In [6]:
class AvgPooledXLNet(XLNetModel):
    """XLNet encoder returning the mask-weighted mean of the token states."""

    def forward(self, ids, seg_ids=None):
        # Tokens with id > 0 are attended to and included in the average.
        # NOTE(review): this treats id 0 as padding; the xlnet-base-cased
        # tokenizer appears to use a different pad id (and 0 for <unk>) -
        # confirm against the tokenizer. Left unchanged here because saved
        # checkpoints were presumably trained with this exact mask.
        token_mask = (ids > 0).float()
        hidden_states = super().forward(ids, token_mask, token_type_ids=seg_ids)[0]
        weights = token_mask.unsqueeze(-1)
        masked_sum = (hidden_states * weights).sum(dim=1)
        token_count = weights.sum(dim=1)
        return masked_sum / token_count
class CustomXLNet(nn.Module):
    """Siamese model: one shared XLNet encodes question and answer, then Head2.

    The encoder is built from an explicit ``XLNetConfig`` rather than from a
    pretrained checkpoint. NOTE(review): that presumably means the encoder
    weights start out randomly initialised until a state dict is loaded -
    confirm this is intended for the training path.

    Args:
        n_h: hidden width passed to the head.
        n_feats: number of extra features passed to the head.
        head_dropout: dropout probability used inside the head.
    """

    def __init__(self, n_h, n_feats, head_dropout=0.2):
        super().__init__()
        # Architecture sizes written out explicitly (alternative noted by the
        # author: XLNetConfig.from_json_file('xlnet-base-cased/config.json')).
        encoder_dims = dict(d_inner=3072, d_model=768, n_head=12, n_layer=12)
        self.xlnet = AvgPooledXLNet(XLNetConfig(**encoder_dims))
        self.head = Head2(n_h, n_feats, n_bert=768, dropout=head_dropout)

    def forward(self, x_feats, q_ids, a_ids, seg_q_ids=None, seg_a_ids=None):
        """Return the head's logits for a batch of (features, question, answer)."""
        question_vec = self.xlnet(q_ids, seg_q_ids)
        answer_vec = self.xlnet(a_ids, seg_a_ids)
        return self.head(x_feats, question_vec, answer_vec)
def get_preds(train, test, ModelClass, tokenizer, model_name, checkpoint_dir, folds):
    """Average the test predictions of several fold checkpoints.

    Args:
        train: training DataFrame; only its 'category' column is used, to fit
            the one-hot encoder.
        test: DataFrame to predict (needs 'question_title', 'question_body',
            'answer' and 'category').
        ModelClass: callable ``ModelClass(n_h, n_feats)`` building the model.
        tokenizer: tokenizer handed to ``load_prep``.
        model_name: checkpoint file-name prefix.
        checkpoint_dir: directory prefix of the checkpoints (include the
            trailing separator).
        folds: iterable of zero-based fold ids; fold ``k`` is read from
            ``{checkpoint_dir}{model_name}_fold_{k + 1}_best.pth``.

    Returns:
        Array of shape ``(len(test), N_TARGETS)``: the mean over ``folds`` of
        the per-fold sigmoid predictions.
    """
    # Tokenisation is shared with the training path. load_prep is defined in a
    # later cell; the name is resolved when this function is called.
    seg_ids_test, ids_test = load_prep(test, tokenizer)

    train_category, test_category, _, _ = \
        get_categorical_features(train, test, 'category')
    # The encoder is fitted on the training categories only; categories unseen
    # in train become all-zero rows (handle_unknown='ignore').
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(train_category.reshape(-1, 1))
    cat_features_test = ohe.transform(test_category.reshape(-1, 1)).toarray()

    num_workers = 8
    # Fall back to the CPU instead of failing when no GPU is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    bs_test = 2
    # Positional row numbers: the arrays above are in row order, whatever the
    # DataFrame's index labels are.
    row_ids = np.arange(len(test))
    test_loader = DataLoader(
        TextDataset5(cat_features_test, ids_test['question'], ids_test['answer'], 
                        seg_ids_test['question'], seg_ids_test['answer'], row_ids),
        batch_size=bs_test, shuffle=False, num_workers=num_workers
    )

    init_seed()
    preds = np.zeros((len(test), N_TARGETS))
    for fold_id in folds:
        checkpoint_file = f'{checkpoint_dir}{model_name}_fold_{fold_id + 1}_best.pth'
        model = ModelClass(256, cat_features_test.shape[1]).to(device)
        test_preds = infer(model, test_loader, checkpoint_file, device)
        preds += test_preds / len(folds)

    return preds

def get_xlnet_preds(train, test):
    """Predict ``test`` with the saved siamese-XLNet fold checkpoints.

    Uses the 'xlnet-base-cased' tokenizer and the checkpoints named
    'siamese_xlnet_1_comb_fold_<k>_best.pth' under 'xlnet-model/' for the
    zero-based folds 0, 1, 2, 4, 5, 7 and 8.
    """
    return get_preds(
        train,
        test,
        CustomXLNet,
        tokenizer=XLNetTokenizer.from_pretrained('xlnet-base-cased'),
        model_name='siamese_xlnet_1_comb',
        checkpoint_dir='xlnet-model/',
        folds=[0, 1, 2, 4, 5, 7, 8],
    )

In [7]:
def compute_spearmanr(trues, preds, n_bins=None):
    """Mean column-wise Spearman rank correlation between ``trues`` and ``preds``.

    Args:
        trues: 2-D array of shape (n_samples, n_targets) with reference values.
        preds: 2-D array of the same shape with predictions. It is NOT
            modified (earlier versions perturbed it in place through a view).
        n_bins: unused; kept so existing callers keep working.

    Returns:
        The mean of the per-column correlations.
    """
    rhos = []
    for col_trues, col_pred in zip(trues.T, preds.T):
        if len(np.unique(col_pred)) == 1:
            # A constant column has no defined rank correlation. Bump one
            # randomly chosen entry - on a private copy - so a value can be
            # computed. max(..., 1) keeps randint valid for a single row.
            col_pred = col_pred.copy()
            bump_at = np.random.randint(0, max(len(col_pred) - 1, 1))
            col_pred[bump_at] = col_pred.max() + 1
        rhos.append(spearmanr(col_trues, col_pred).correlation)
    return np.mean(rhos)



def compute_loss(outputs, targets, alpha=0.5, margin=0.1, question_only=False):
    """Weighted BCE blended with a pairwise margin ranking loss.

    Args:
        outputs: logits of shape (batch, n_targets).
        targets: reference values with the same shape as ``outputs``.
        alpha: weight of the BCE term; the ranking term gets ``1 - alpha``.
        margin: margin of the ranking term.
        question_only: when true, only the first ``N_Q_TARGETS`` columns (the
            question targets) are scored.

    Returns:
        Scalar tensor ``alpha * bce + (1 - alpha) * margin_rank_loss``.

    Both terms are weighted per column with the module-level ``LABEL_WEIGHTS``.
    """
    if question_only:
        # Use the shared constant rather than a literal column count.
        outputs = outputs[:, :N_Q_TARGETS]
        targets = targets[:, :N_Q_TARGETS]

    n_cols = outputs.size(-1)
    col_weights = LABEL_WEIGHTS[:n_cols]

    bce = F.binary_cross_entropy_with_logits(outputs, targets, reduction="none")
    bce = (bce * col_weights).mean()

    batch_size = outputs.size(0)
    if batch_size % 2 == 0:
        half = batch_size // 2
        # Pair sample i of the first half with sample i of the second half.
        outputs1, outputs2 = outputs.sigmoid().contiguous().view(2, half, n_cols)
        targets1, targets2 = targets.contiguous().view(2, half, n_cols)
        # 1 if the first target is larger, -1 if the second is, 0 if equal.
        ordering = (targets1 > targets2).float() - (targets1 < targets2).float()
        margin_rank_loss = (-ordering * (outputs1 - outputs2) + margin).clamp(min=0.0)
        margin_rank_loss = (margin_rank_loss * col_weights).mean()
    else:
        # An odd batch cannot be split into pairs, so the ranking term is skipped.
        margin_rank_loss = 0.0

    return alpha * bce + (1 - alpha) * margin_rank_loss



In [8]:
# Label columns are everything from column 11 onwards; the three text inputs
# sit at column positions 1, 2 and 5 of train_df.
output_categories = train_df.columns[11:].tolist()
input_categories = train_df.columns[[1, 2, 5]].tolist()
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Per-label loss weights: inverse standard deviation of each label column,
# rescaled so that the weights sum to 30.
inverse_std = 1.0 / train_df[output_categories].std().values
LABEL_WEIGHTS = torch.tensor(inverse_std, dtype=torch.float32).to(device)
LABEL_WEIGHTS = LABEL_WEIGHTS / LABEL_WEIGHTS.sum() * 30
for name, weight in zip(output_categories, LABEL_WEIGHTS.cpu().numpy()):
    print(name, "\t", weight)

def load_prep(test_df, tokenizer, max_seq_len=512):
    """Tokenise the question and answer text pairs of a DataFrame.

    Two sequence pairs are encoded per row: ('question_title',
    'question_body') under the key 'question' and ('question_title', 'answer')
    under the key 'answer'. Every pair is truncated/padded to ``max_seq_len``.

    Args:
        test_df: DataFrame with the columns named above (despite the name it
            is used for any split).
        tokenizer: object exposing ``encode_plus`` that returns 'input_ids'
            and 'token_type_ids'.
        max_seq_len: fixed encoded length (default 512, the previous
            hard-coded value).

    Returns:
        Tuple ``(seg_ids, ids)`` - note the order - of dicts keyed by
        'question' and 'answer', each holding an array with one row per
        DataFrame row.
    """
    seg_ids_test, ids_test = {}, {}
    text_pairs = [
        ('question', ['question_title', 'question_body']),
        ('answer', ['question_title', 'answer']),
    ]
    for text, cols in text_pairs:
        ids, seg_ids = [], []
        for x1, x2 in tqdm(test_df[cols].values):
            encoded_inputs = tokenizer.encode_plus(
                x1, x2, add_special_tokens=True, max_length=max_seq_len,
                truncation=True, padding='max_length',
                return_token_type_ids=True
            )
            ids.append(encoded_inputs['input_ids'])
            seg_ids.append(encoded_inputs['token_type_ids'])
        ids_test[text] = np.array(ids)
        seg_ids_test[text] = np.array(seg_ids)
    return seg_ids_test, ids_test


output categories:
	 ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

input categories:
	 ['question_title', 'question_body', 'answer']
cuda
question_asker_intent_understanding 	 0.9666648
question_body_cr

In [9]:
#torch_dict = torch.load('xlnet-model/siamese_xlnet_1_comb_fold_1_best.pth')
#print (torch_dict['model_state_dict'].keys())
#trainer
def train_and_predict(train_data, test_data, epochs, batch_size):
    """Train ``CustomXLNet`` on ``train_data`` and predict ``test_data`` each epoch.

    Args:
        train_data: DataFrame with the text columns, 'category' and every
            column listed in ``TARGETS``.
        test_data: DataFrame with the text columns and 'category'.
        epochs: number of passes over the training data.
        batch_size: batch size for both the training and the test loader.

    Returns:
        List with one array per epoch, each of shape
        ``(len(test_data), n_targets)``, holding the sigmoid test predictions
        made after that epoch.

    Uses the module-level ``device`` (and, through ``compute_loss``,
    ``LABEL_WEIGHTS``), so the cell defining them must have been run.
    """
    import time  # only needed here; imported once instead of once per epoch

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    num_workers = 8

    # --- features -----------------------------------------------------------
    seg_ids_train, ids_train = load_prep(train_data, tokenizer)
    seg_ids_test, ids_test = load_prep(test_data, tokenizer)
    train_category, test_category, _, _ = \
        get_categorical_features(train_data, test_data, 'category')
    cat_features_train = train_category.reshape(-1, 1)
    cat_features_test = test_category.reshape(-1, 1)
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(cat_features_train)
    cat_features_train = ohe.transform(cat_features_train).toarray()
    cat_features_test = ohe.transform(cat_features_test).toarray()
    print (len(ids_train['question'][0]))
    print (len(cat_features_train))

    # Targets and row ids come from the function arguments, not from the
    # notebook globals, so the function works for any frames passed in.
    all_train_targets = train_data[TARGETS].to_numpy()
    train_rows = np.arange(len(train_data))
    test_rows = np.arange(len(test_data))

    dataloader = DataLoader(
        TextDataset5(cat_features_train, ids_train['question'], ids_train['answer'], 
                        seg_ids_train['question'], seg_ids_train['answer'], train_rows,
                        targets=all_train_targets),
        batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    test_dataloader = DataLoader(
        TextDataset5(cat_features_test, ids_test['question'], ids_test['answer'], 
                        seg_ids_test['question'], seg_ids_test['answer'], test_rows),
        batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    # --- model, optimizer, schedule ------------------------------------------
    # NOTE(review): CustomXLNet builds its encoder from a bare config, so it
    # presumably starts from random weights here - confirm whether pretrained
    # weights should be loaded before fine-tuning.
    model = CustomXLNet(256, cat_features_test.shape[1]).to(device)
    test_predictions = []

    # Every trainable parameter is optimised (previously the head and all bias
    # parameters were left out and therefore never updated). Biases and
    # layer-norm weights are excluded from weight decay; 'layer_norm.weight'
    # is included because XLNet modules appear to use that attribute name.
    # TODO(review): the freshly initialised head may want a higher learning rate.
    no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")
    trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    params = [
        {
            "params": [p for n, p in trainable if not any(nd in n for nd in no_decay)],
            "weight_decay": 1e-2,
            "lr": 5e-5
        },
        {
            "params": [p for n, p in trainable if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr": 5e-5
        }
    ]
    params = [group for group in params if group["params"]]
    # torch's AdamW replaces the deprecated transformers implementation that
    # was previously picked up through a star import.
    optimizer = torch.optim.AdamW(params)
    # Linear warmup over the first 5% of steps, then linear decay.
    total_steps = len(dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(total_steps * 0.05),
        num_training_steps=total_steps
    )

    for epoch in range(epochs):
        start = time.time()

        # --- train -----------------------------------------------------------
        model.train()
        train_losses = []
        train_preds = []
        train_targets = []
        # The dataset yields ((x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids), target).
        for inputs, targets in tqdm(dataloader, total=len(dataloader)):
            x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids = to_device(inputs, device)
            targets = targets.to(device)
            outputs = model(x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids)
            train_preds.extend(outputs.detach().sigmoid().cpu().numpy())
            train_targets.extend(targets.detach().cpu().numpy())
            loss = compute_loss(outputs, targets)
            model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_losses.append(loss.detach().cpu().item())

        # --- predict the test set with the current weights --------------------
        model.eval()
        with torch.no_grad():
            test_preds = []
            for inputs, _ in tqdm(test_dataloader, total=len(test_dataloader)):
                x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids = to_device(inputs, device)
                outputs = model(x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids)
                test_preds.extend(outputs.sigmoid().cpu().numpy())
            test_predictions.append(np.stack(test_preds))
            print()

        print("Epoch {}: Train Loss {}".format(epoch + 1, np.mean(train_losses)))
        print("\t Train Spearmanr {:.4f}".format(
            compute_spearmanr(np.stack(train_targets), np.stack(train_preds))
        ))
        print("\t elapsed: {}s".format(time.time() - start))

    return test_predictions

In [10]:
import transformers
# Lower the transformers logger to error level to keep the cell output short.
transformers.logging.set_verbosity_error()

# Train for 2 epochs with batch size 2 on the frames loaded earlier; the result
# is the list of per-epoch test predictions returned by train_and_predict.
trainer_preds = train_and_predict(train_df, test_df, epochs = 2, batch_size=2)

# Alternative path (disabled): ensemble predictions from saved fold checkpoints.
#xlnet_pred = get_xlnet_preds(train_df, test_df)

  0%|          | 0/6079 [00:00<?, ?it/s]

  0%|          | 0/6079 [00:00<?, ?it/s]

  0%|          | 0/476 [00:00<?, ?it/s]

  0%|          | 0/476 [00:00<?, ?it/s]

512
6079




  0%|          | 0/3040 [00:00<?, ?it/s]