
The basic idea is from my kernel (https://www.kaggle.com/chanhu/bert-score-layer-lb-0-475).
In this kernel, I changed the following points:
* Keras -> PyTorch (this is my second kernel written in PyTorch)
* Use pretrained BERT, EndpointSpanExtractor, and weight decay
(similar to Lee's work: https://www.kaggle.com/ceshine/pytorch-bert-endpointspanextractor-kfold)
* Use k-fold cross-validation to get a more robust score (following the comments from Matei Ionita and huiqin. Thanks!)

P.S.: the best score I could get is 0.486.

In [1]:
import numpy as np
import pandas as pd
import spacy
from keras.preprocessing.sequence import pad_sequences
import os
from tqdm import tqdm
import torch
# List the attached input directories so the available files show up in the cell output.
print(os.listdir('../input/bert-score-layer-lb-0-475'))
print(os.listdir('../input/gap-coreference'))

Using TensorFlow backend.


['best_model_1.hdf5', 'best_model_2.hdf5', '__output__.json', 'contextual_embeddings_gap_validation.json', 'contextual_embeddings_gap_train.json', 'train_dist_df.csv', 'tokenization.py', 'uncased_L-12_H-768_A-12', 'best_model_4.hdf5', 'contextual_embeddings_gap_test.json', '__results___files', '__notebook__.ipynb', 'test_dist_df.csv', 'uncased_L-12_H-768_A-12.zip', 'submission.csv', '__results__.html', 'modeling.py', 'custom.css', 'extract_features.py', '__pycache__', 'val_dist_df.csv', 'best_model_5.hdf5', 'best_model_3.hdf5']
['gap-development.tsv', 'gap-test.tsv', 'gap-validation.tsv']


In [2]:
# Environment setup: shell commands executed from the notebook.
# NOTE(review): the reason for removing greenlet is not recorded here — presumably it
# conflicts with the pip installs below; confirm before changing.
!conda remove -y greenlet
# NOTE(review): both installs are unpinned, so the resolved versions depend on when
# this cell is run.
!pip install pytorch-pretrained-bert
!pip install allennlp

Collecting package metadata: - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - 

In [3]:
from allennlp.modules.span_extractors import EndpointSpanExtractor 
from pytorch_pretrained_bert import BertTokenizer, BertModel
from spacy.lang.en import English
# Bare English pipeline; in this notebook it is only used to split texts into sentences.
nlp = English()
# WordPiece tokenizer for the 'bert-base-uncased' vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Register the 'sentencizer' component on the pipeline (create_inputs iterates doc.sents).
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


100%|██████████| 231508/231508 [00:00<00:00, 490376.16B/s]


In [4]:
def candidate_length(candidate):
    """Return how many characters of ``candidate`` are not a plain space (" ")."""
    return sum(1 for ch in candidate if ch != " ")

def count_char(text, offset):
    """Count the non-space characters among the first ``offset`` characters of ``text``.

    Characters are read by index, so an ``offset`` larger than ``len(text)``
    raises IndexError.
    """
    return sum(1 for pos in range(offset) if text[pos] != " ")

def count_token_length_special(token):
    """Return the length of ``token`` ignoring every "#" and " " character."""
    ignored = ("#", " ")
    return len([ch for ch in token if ch not in ignored])

def find_word_index(tokenized_text, char_start, target):
    """Locate the tokens that cover ``target`` inside ``tokenized_text``.

    ``char_start`` is the number of non-space characters that precede the
    target; the target itself spans ``candidate_length(target)`` characters.
    Tokens are walked left to right while a running character count is kept
    (via ``count_token_length_special``); "[CLS]" and "[SEP]" add nothing to
    the count and are never selected.

    Returns ``[first_token_index, last_token_index]`` (both inclusive).
    Raises IndexError when no token starts inside the target window.
    """
    markers = ("[CLS]", "[SEP]")
    char_end = char_start + candidate_length(target)
    consumed = 0
    hits = []
    for position, token in enumerate(tokenized_text):
        is_marker = token in markers
        # A token belongs to the mention when it starts inside the window;
        # markers are skipped, e.g. for the case like "[SEP]. she".
        if char_start <= consumed < char_end and not is_marker:
            hits.append(position)
        if not is_marker:
            consumed += count_token_length_special(token)

    # For a single hit the first and last entries are the same index.
    return [hits[0], hits[-1]]

def create_tokenizer_input(sents):
    """Join sentences into one string: "[CLS] s1 [SEP] s2 [SEP] ... sN [SEP]".

    Each element of ``sents`` must expose a ``.text`` attribute. Every
    sentence is followed by " [SEP] ", except the last of several, which is
    followed by " [SEP]" with no trailing space. An empty ``sents`` gives "".
    """
    last = len(sents) - 1
    pieces = []
    for pos, sent in enumerate(sents):
        prefix = "[CLS] " if pos == 0 else ""
        # Only a final sentence that is not also the first drops the trailing space.
        suffix = " [SEP]" if (pos == last and pos != 0) else " [SEP] "
        pieces.append(prefix + sent.text + suffix)
    return "".join(pieces)

def create_inputs(dataframe):
    """Convert raw GAP rows into BERT token ids plus token-level mention spans.

    Reads the columns 'Text', 'A', 'B', 'Pronoun', 'A-offset', 'B-offset' and
    'Pronoun-offset'. For every row the text is split into sentences with the
    module-level ``nlp`` pipeline, wrapped with [CLS]/[SEP] markers, and
    tokenized with the module-level ``tokenizer``.

    Rows are read by position, so the frame does not need a default
    0..n-1 index; the result reuses ``dataframe.index``.

    Returns a DataFrame with two object columns:
      * 'indexed_token' - list of vocabulary ids for the row's tokens;
      * 'offset'        - [[start, end], [start, end], [start, end]] token
                          spans for A, B and the pronoun, in that order.

    Side effect: prints the longest token sequence seen.
    """
    source_columns = ['Text', 'A', 'B', 'Pronoun',
                      'A-offset', 'B-offset', 'Pronoun-offset']
    rows = zip(*(dataframe[col].tolist() for col in source_columns))

    token_id_rows = []
    span_rows = []
    max_len = 0
    for text, A, B, Pronoun, A_offset, B_offset, Pronoun_offset in tqdm(rows, total=len(dataframe)):
        sents = list(nlp(text).sents)
        # "#" is ignored when token lengths are counted (see
        # count_token_length_special), so a literal "#" in the text is swapped
        # for "*" to keep the character counts of text and tokens aligned.
        token_input = create_tokenizer_input(sents).replace("#", "*")
        tokenized_text = tokenizer.tokenize(token_input)
        max_len = max(max_len, len(tokenized_text))
        token_id_rows.append(tokenizer.convert_tokens_to_ids(tokenized_text))

        # Character offsets are converted to "non-space characters before the
        # mention", which is the unit find_word_index works in.
        span_rows.append([
            find_word_index(tokenized_text, count_char(text, offset), mention)
            for offset, mention in ((A_offset, A), (B_offset, B), (Pronoun_offset, Pronoun))
        ])
    print('max length of sentence:', max_len)

    # Build the frame once instead of assigning into a pre-allocated frame row by row.
    return pd.DataFrame({'indexed_token': token_id_rows, 'offset': span_rows},
                        index=dataframe.index, columns=['indexed_token', 'offset'])

In [5]:
# NOTE(review): the split names look swapped on purpose — gap-test.tsv is used for
# training, while gap-development.tsv is the set that gets predicted and scored
# later in the notebook; confirm against the competition setup.
train_df = pd.read_table('../input/gap-coreference/gap-test.tsv')
test_df  = pd.read_table('../input/gap-coreference/gap-development.tsv')
val_df   = pd.read_table('../input/gap-coreference/gap-validation.tsv')
# Tokenize every split and locate the A / B / pronoun token spans.
new_train_df = create_inputs(train_df)
new_test_df  = create_inputs(test_df)
new_val_df   = create_inputs(val_df)

100%|██████████| 2000/2000 [00:09<00:00, 204.62it/s]
  1%|          | 22/2000 [00:00<00:09, 211.51it/s]

max length of sentence: 357


100%|██████████| 2000/2000 [00:08<00:00, 224.29it/s]
 17%|█▋        | 76/454 [00:00<00:01, 251.27it/s]

max length of sentence: 353


100%|██████████| 454/454 [00:01<00:00, 249.52it/s]

max length of sentence: 237





In [6]:
def get_label(dataframe):
    """Map each row's coreference flags to a class id.

    Returns a list with one int per row, in row order:
      0 when 'A-coref' is truthy,
      1 when 'A-coref' is falsy and 'B-coref' is truthy,
      2 otherwise.

    Column values are iterated directly, so the frame may carry any index
    (label lookups with a positional counter would fail on a non-default one).
    """
    return [
        0 if a_coref else (1 if b_coref else 2)
        for a_coref, b_coref in zip(dataframe['A-coref'], dataframe['B-coref'])
    ]

# Attach the class labels to the tokenized features.
new_train_df['label'] = get_label(train_df)
new_val_df['label']   = get_label(val_df)
# Train and validation rows are merged into one training pool and renumbered 0..n-1.
new_df = pd.concat([new_train_df, new_val_df])
new_df = new_df.reset_index(drop=True)
# The list-valued columns are stored as text in the CSV; MyDataset parses them
# back with literal_eval.
new_df.to_csv('train.csv', index=False)
new_test_df['label'] = get_label(test_df)
new_test_df.to_csv('test.csv', index=False)

In [7]:
# Drop the in-memory feature frames; they were written to train.csv / test.csv
# above and are read back from disk before training.
del new_df
del new_val_df
del new_test_df
del new_train_df

In [8]:
import gc
gc.collect()

49

In [9]:
from torch.utils.data import Dataset
from torchvision import transforms
from ast import literal_eval

class MyDataset(Dataset):
    """Row-wise view over a feature DataFrame for use with a torch DataLoader.

    Expected columns:
      * 'indexed_token' - string form of a list of token ids;
      * 'offset'        - string form of a list of [start, end] pairs;
      * 'label'         - class id;
      * 'D_PA', 'D_PB'  - values returned unchanged next to the tokens.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows; they are addressed by position (0 .. len-1).
    transform : callable, optional
        When given, applied separately to the padded tokens, the offsets and
        the label.
    max_len : int, default 360
        Fixed length of the returned token array.
    """

    def __init__(self, dataframe, transform=None, max_len=360):
        self.df = dataframe
        self.transform = transform
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def _pad(self, token_ids):
        """Return ``token_ids`` as an int32 array of exactly ``max_len`` entries.

        Short sequences are zero-padded on the right. Longer ones keep their
        LAST ``max_len`` ids, which is what the previous
        ``pad_sequences(..., padding='post')`` call did by default.
        NOTE(review): dropping leading tokens shifts positions relative to the
        stored offsets, so ``max_len`` should cover the longest sequence.
        """
        padded = np.zeros(self.max_len, dtype='int32')
        overflow = len(token_ids) - self.max_len
        kept = token_ids[overflow:] if overflow > 0 else token_ids
        padded[:len(kept)] = kept
        return padded

    def __getitem__(self, idx):
        """Return ``((tokens, offsets, D_PA, D_PB), label)`` for row ``idx``."""
        def cell(column):
            # Positional lookup: DataLoader samplers hand out positions, not labels.
            return self.df[column].iloc[idx]

        index_token = self._pad(literal_eval(cell('indexed_token')))  # string -> list -> padded array
        offset = np.asarray(literal_eval(cell('offset')), dtype='int32')
        label = int(cell('label'))

        distP_A = cell('D_PA')
        distP_B = cell('D_PB')

        if self.transform:
            index_token = self.transform(index_token)
            offset = self.transform(offset)
            label = self.transform(label)

        return (index_token, offset, distP_A, distP_B), label

In [10]:
import torch.nn.functional as F

class score(torch.nn.Module):
    """Feed-forward scorer: two (Linear -> LayerNorm -> ReLU -> Dropout) stages
    followed by a Linear layer with a single output unit."""

    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        nn = torch.nn
        layers = []
        in_features = embed_dim
        # Two identical hidden stages; only the first one sees embed_dim inputs.
        for _ in range(2):
            layers += [
                nn.Linear(in_features, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.ReLU(inplace=True),
                nn.Dropout(0.6),
            ]
            in_features = hidden_dim
        layers.append(nn.Linear(hidden_dim, 1))
        self.score = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.score(x)
    
class mentionpair_score(torch.nn.Module):
    """Scores a pair of span representations together with a distance embedding."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.score = score(input_dim, hidden_dim)

    def forward(self, g1, g2, dist_embed):
        """Score the concatenation [g1, g2, g1 * g2, dist_embed] (joined on the last dim)."""
        features = [g1, g2, g1 * g2, dist_embed]
        return self.score(torch.cat(features, dim=-1))

class score_model(torch.nn.Module):
    """BERT encoder + endpoint span extractor + pairwise mention scorer.

    ``forward`` returns one row of three scores per example: pronoun-vs-A,
    pronoun-vs-B, and a constant zero for the third class.
    """

    def __init__(self):
        super().__init__()
        self.buckets = [1, 2, 3, 4, 5, 8, 16, 32, 64]
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # One 20-dimensional embedding row per bucket, plus one extra row.
        self.embedding = torch.nn.Embedding(len(self.buckets) + 1, 20)
        self.span_extractor = EndpointSpanExtractor(768, "x,y,x*y")
        # Scorer input: three 2304-wide vectors (g1, g2, g1*g2) plus the distance embedding.
        self.pair_score = mentionpair_score(2304 * 3 + 20, 100)

    def forward(self, sent, offsets, distP_A, distP_B):
        """Score A and B as antecedents of the pronoun.

        ``offsets`` holds the spans in the order A, B, pronoun; ``distP_A`` /
        ``distP_B`` are indices into the distance embedding table.
        """
        encoded, _ = self.bert(sent, output_all_encoded_layers=False)  # (batch_size, max_len, 768)

        # Distance embeddings
        embed_pa = self.embedding(distP_A)
        embed_pb = self.embedding(distP_B)

        # Span representations, split per mention: [A, B, Pronoun]
        spans = torch.unbind(self.span_extractor(encoded, offsets), dim=1)
        # L2-normalise every span vector (the original author notes: avoid overfitting).
        unit_spans = [F.normalize(span, p=2, dim=1) for span in spans]

        score_a = self.pair_score(unit_spans[2], unit_spans[0], embed_pa)
        score_b = self.pair_score(unit_spans[2], unit_spans[1], embed_pb)
        score_neither = torch.zeros_like(score_a)

        return torch.cat((score_a, score_b, score_neither), dim=1)

In [11]:
# The code in this cell is taken from https://www.kaggle.com/ceshine/pytorch-bert-endpointspanextractor-kfold

def children(m):
    """Return ``m`` itself when it is a list or tuple, otherwise ``list(m.children())``."""
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())

def set_trainable_attr(m, b):
    """Set ``m.trainable`` and the ``requires_grad`` flag of every parameter of ``m`` to ``b``."""
    setattr(m, "trainable", b)
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Call ``f`` on ``m`` (when it is a torch module) and then recurse into its children.

    ``m`` may also be a list or tuple, in which case only its elements are visited.
    """
    kids = children(m)
    if isinstance(m, torch.nn.Module):
        f(m)
    for kid in kids:
        apply_leaf(kid, f)

            
def set_trainable(l, b):
    """Switch ``requires_grad`` to ``b`` for ``l`` and every module nested below it."""
    def _toggle(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _toggle)

In [12]:
# Precomputed per-row features exported by the earlier kernel.
# NOTE(review): presumably these files supply the D_PA / D_PB columns that
# MyDataset reads — confirm against the CSV headers.
train_dist = pd.read_csv('../input/bert-score-layer-lb-0-475/train_dist_df.csv')
val_dist   = pd.read_csv('../input/bert-score-layer-lb-0-475/val_dist_df.csv')
test_dist  = pd.read_csv('../input/bert-score-layer-lb-0-475/test_dist_df.csv')

# Same train-then-validation stacking and renumbering as used for train.csv,
# so the rows line up when the frames are joined column-wise later.
train_dist = pd.concat([train_dist, val_dist])
train_dist = train_dist.reset_index(drop=True)


In [13]:
from sklearn.model_selection import StratifiedKFold
n_split = 5

# Token/offset/label features written earlier in the notebook.
train = pd.read_csv('../working/train.csv')
test  = pd.read_csv('../working/test.csv')

# The join below is purely positional; a length mismatch would otherwise
# surface only as silent NaN rows.
assert len(train) == len(train_dist), "train.csv and train_dist rows do not line up"
assert len(test) == len(test_dist), "test.csv and test_dist rows do not line up"
train = pd.concat([train, train_dist], axis=1)
test  = pd.concat([test, test_dist], axis=1)

# random_state only takes effect together with shuffle=True (and recent
# scikit-learn versions reject a seed without it), so shuffling is enabled
# explicitly. The folds are materialised into a list so the training cell can
# be re-run without silently iterating over an exhausted generator.
Kfold = list(
    StratifiedKFold(n_splits=n_split, shuffle=True, random_state=2019)
    .split(train, train['label'])
)

In [14]:
import time
from sklearn.model_selection import StratifiedKFold
#from torch.optim.lr_scheduler import 

def softmax(x):
    """Row-wise softmax of a 2-D array.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the output from becoming NaN) for large logits.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# K-fold training and test-time ensembling.
# For every fold: train the scorer on top of a frozen BERT, keep the weights
# of the epoch with the lowest validation loss, reload them, and add that
# model's test-set class probabilities into `output`.
TEST_BATCH_SIZE = 20
n_epochs = 30
PATH = "./best_model.hdf5"  # holds a torch state_dict; overwritten by every fold


def _to_cuda_long(inputs, label):
    """Cast one DataLoader batch to LongTensor and move it to the GPU.

    Returns ``(index_token, offset, distP_A, distP_B, label)``.
    """
    index_token, offset, distP_A, distP_B = inputs
    return tuple(t.type(torch.LongTensor).cuda()
                 for t in (index_token, offset, distP_A, distP_B, label))


testset = MyDataset(test)
test_loader = torch.utils.data.DataLoader(testset, batch_size=TEST_BATCH_SIZE)
# Running sum over folds of the predicted probabilities, one row per test example.
output = np.zeros((len(testset), 3))

for n_fold, (train_index, val_index) in enumerate(Kfold):
    # Start at +inf so the first epoch always writes a checkpoint; a finite
    # sentinel could leave the previous fold's file in place.
    min_val_loss = float('inf')

    # Fold indices are positions. The fold frames get their own names so the
    # raw train_df / val_df loaded at the top of the notebook are not overwritten.
    fold_train_df = train.iloc[train_index].reset_index(drop=True)
    fold_val_df   = train.iloc[val_index].reset_index(drop=True)

    trainset = MyDataset(fold_train_df)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=10, shuffle=True)
    valset = MyDataset(fold_val_df)
    # Validation only averages the loss, so no shuffling is needed.
    val_loader = torch.utils.data.DataLoader(valset, batch_size=20)

    model = score_model()
    set_trainable(model.bert, False)       # BERT stays frozen
    set_trainable(model.embedding, True)
    set_trainable(model.pair_score, True)
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    print('n_fold:', n_fold+1)
    for i in range(n_epochs):

        start_time = time.time()
        model.train()
        avg_loss = 0.
        for inputs, label in train_loader:
            index_token, offset, distP_A, distP_B, label = _to_cuda_long(inputs, label)

            optimizer.zero_grad()
            output_train = model(index_token, offset, distP_A, distP_B)
            loss = criterion(output_train, label)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        avg_val_loss = 0.
        model.eval()
        with torch.no_grad():
            for inputs, label in val_loader:
                index_token, offset, distP_A, distP_B, label = _to_cuda_long(inputs, label)

                output_val = model(index_token, offset, distP_A, distP_B)
                avg_val_loss += criterion(output_val, label).item() / len(val_loader)

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                i + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))

        # Checkpoint on improvement; training still runs for all epochs.
        if min_val_loss > avg_val_loss:
            min_val_loss = avg_val_loss
            torch.save(model.state_dict(), PATH)

    del model

    # Reload the best epoch of this fold and accumulate its test predictions.
    model = score_model()
    model.load_state_dict(torch.load(PATH))
    model.cuda()
    model.eval()
    row_start = 0
    with torch.no_grad():
        for inputs, label in test_loader:
            index_token, offset, distP_A, distP_B, _ = _to_cuda_long(inputs, label)

            y_pred = model(index_token, offset, distP_A, distP_B)
            y_pred = softmax(y_pred.cpu().numpy())
            # Advance by the real batch length rather than a hard-coded 20, so
            # the slice always matches the loader's batch size.
            row_end = row_start + len(y_pred)
            output[row_start:row_end, :] += y_pred
            row_start = row_end

100%|██████████| 407873900/407873900 [00:12<00:00, 32149456.41B/s]


n_fold: 1
Epoch 1/30 	 loss=0.9957 	 val_loss=0.8718 	 time=38.64s
Epoch 2/30 	 loss=0.8997 	 val_loss=0.7223 	 time=38.19s
Epoch 3/30 	 loss=0.7969 	 val_loss=0.6650 	 time=38.10s
Epoch 4/30 	 loss=0.7409 	 val_loss=0.6261 	 time=38.13s
Epoch 5/30 	 loss=0.7115 	 val_loss=0.5980 	 time=38.14s
Epoch 6/30 	 loss=0.6610 	 val_loss=0.5978 	 time=38.20s
Epoch 7/30 	 loss=0.6422 	 val_loss=0.5793 	 time=38.13s
Epoch 8/30 	 loss=0.6094 	 val_loss=0.5777 	 time=37.90s
Epoch 9/30 	 loss=0.6032 	 val_loss=0.5995 	 time=37.77s
Epoch 10/30 	 loss=0.5796 	 val_loss=0.5868 	 time=37.88s
Epoch 11/30 	 loss=0.5471 	 val_loss=0.5934 	 time=37.67s
Epoch 12/30 	 loss=0.5509 	 val_loss=0.5749 	 time=37.77s
Epoch 13/30 	 loss=0.5402 	 val_loss=0.5789 	 time=37.78s
Epoch 14/30 	 loss=0.5296 	 val_loss=0.5988 	 time=37.79s
Epoch 15/30 	 loss=0.4896 	 val_loss=0.5864 	 time=37.68s
Epoch 16/30 	 loss=0.4955 	 val_loss=0.5936 	 time=37.77s
Epoch 17/30 	 loss=0.4576 	 val_loss=0.6287 	 time=37.70s
Epoch 18/30 	

In [15]:
import os
# `output` holds the SUM of the per-fold probabilities; divide by the number of
# folds to get their mean.
# NOTE: this updates `output` in place, so run this cell exactly once per
# training run — re-running it divides again.
output /= n_split
sub_df_path = os.path.join('../input/gendered-pronoun-resolution/', 'sample_submission_stage_1.csv')
sub_df = pd.read_csv(sub_df_path)
# Plain array assignment raises on a row-count mismatch, where an index-aligned
# Series would silently leave NaN.
for col_pos, col_name in enumerate(['A', 'B', 'NEITHER']):
    sub_df[col_name] = output[:, col_pos]

sub_df.head(20)

Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.600046,0.357497,0.042457
1,development-2,0.990822,0.000411,0.008768
2,development-3,0.031095,0.931218,0.037688
3,development-4,0.045824,0.155976,0.798199
4,development-5,0.004394,0.989948,0.005657
5,development-6,0.866779,0.127916,0.005305
6,development-7,0.935494,0.020891,0.043615
7,development-8,0.329409,0.626504,0.044087
8,development-9,0.000182,0.994971,0.004847
9,development-10,0.721386,0.229466,0.049147


In [16]:
# Write the averaged fold predictions to disk without the row index.
sub_df.to_csv("submission.csv", index=False)

In [17]:
y_test = pd.read_csv('../working/test.csv')['label']

from sklearn.metrics import log_loss
# One-hot encode the true classes; the row count comes from the labels
# themselves instead of a hard-coded 2000.
y_one_hot = np.zeros((len(y_test), 3))
y_one_hot[np.arange(len(y_test)), y_test.values] = 1
log_loss(y_one_hot, output)


0.4793930958369195

In [18]:
# Accuracy: share of rows whose most probable class equals the true label.
# The mean is taken over the actual number of rows, not a hard-coded 2000.
_output = np.argmax(output, axis=1)
print('acc:', float(np.mean(_output == y_test.values)))

acc: 0.8015
