In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import dgl
from dgl import DGLGraph
from dgl.data import MiniGCDataset
import dgl.function as fn

from functools import partial

import re
import numpy as np
import pandas as pd

from pytorch_pretrained_bert import BertTokenizer

import spacy
import pickle
import collections

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt

import os
# Restrict this process to two GPUs and make visible device index 1 the default CUDA device.
# NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set before CUDA is initialised;
# torch is already imported above, so confirm nothing touches CUDA before this line.
# NOTE(review): the value contains a space after the comma -- confirm the CUDA runtime
# parses '0, 1' as two devices, otherwise set_device(1) will fail.
os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'
torch.cuda.set_device(1)

In [None]:
'''
Load results from BERT Embeddings Generatation with Punc.ipynb and BERT Embeddings Generatation without Punc.ipynb
'''


def _load_pickle(path):
    """Unpickle the file at ``path`` and close the handle afterwards.

    SECURITY: ``pickle.load`` can execute arbitrary code. Only point this at
    artefacts produced by the trusted embedding-generation notebooks.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Training artefacts.
token_lst = _load_pickle('token_lst_wto_padding.pkl')  # tokens of every sentence without padding
bert_outputs_lst = _load_pickle('bert_outputs.pkl')  # list of outputs of bert for every sentence
offsets_lst = _load_pickle('offsets_lst.pkl')

# Test artefacts (same layout as the training ones).
test_token_lst = _load_pickle('test_token_lst_wto_padding.pkl')  # tokens of every sentence without padding
test_bert_outputs_lst = _load_pickle('test_bert_outputs.pkl')  # list of outputs of bert for every sentence
test_offsets_lst = _load_pickle('test_offsets_lst.pkl')

# Embeddings loaded from the "others" pickles; the graph-building cells use them as the
# features of the target nodes and the dataset cells use them as the BERT input of the model.
others_bert_outputs = _load_pickle('others_bert_outputs.pkl')
test_others_bert_outputs = _load_pickle('test_others_bert_outputs.pkl')


# The model is trained on gap-test + gap-validation and evaluated on gap-development.
train_df = pd.concat([
    pd.read_csv("gap-test.tsv", delimiter="\t"),
    pd.read_csv("gap-validation.tsv", delimiter="\t")
], axis=0)

test_df = pd.read_csv("gap-development.tsv", delimiter="\t")

# Model Building

## RGCNLayer

In [None]:
class RGCNLayer(nn.Module):
    """One relational graph-convolution layer with optional per-edge gating.

    For every edge the source node feature ``h`` is multiplied by the weight
    matrix of the edge's relation type, scaled by the edge's ``norm`` value and,
    when ``gated`` is true, by a sigmoid gate computed from the same source
    feature. Incoming messages are summed per node and passed through
    ``activation``. The result overwrites the node feature ``'h'`` of the graph.

    Args:
        feat_size: width of the incoming node feature ``'h'``.
        num_rels: number of relation types (values stored in ``edata['rel_type']``).
        activation: callable applied to the aggregated feature, or ``None`` for identity.
        gated: whether to learn and apply the per-edge sigmoid gate.
        out_size: width of the produced node feature (defaults to the former fixed 256).
    """

    def __init__(self, feat_size, num_rels, activation=None, gated = True, out_size=256):
        super(RGCNLayer, self).__init__()
        self.feat_size = feat_size
        self.num_rels = num_rels
        self.activation = activation
        self.gated = gated
        self.out_size = out_size

        # One (feat_size x out_size) projection per relation type.
        self.weight = nn.Parameter(torch.empty(self.num_rels, self.feat_size, self.out_size))
        nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))

        if self.gated:
            # One (feat_size x 1) gate projection per relation type.
            self.gate_weight = nn.Parameter(torch.empty(self.num_rels, self.feat_size, 1))
            nn.init.xavier_uniform_(self.gate_weight, gain=nn.init.calculate_gain('sigmoid'))

    def forward(self, g):
        """Run one round of message passing on ``g`` in place; returns ``None``.

        ``g`` must provide ``ndata['h']``, ``edata['rel_type']`` and ``edata['norm']``.
        """
        weight = self.weight
        # The gate parameter only exists for gated layers; never touch it otherwise.
        gate_weight = self.gate_weight if self.gated else None
        activation = self.activation

        def message_func(edges):
            rel_type = edges.data['rel_type']
            src_h = edges.src['h'].unsqueeze(1)  # (E, 1, feat_size) as required by bmm
            # squeeze(1) removes only the bmm helper axis, so a single-edge batch keeps
            # its leading edge dimension.
            msg = torch.bmm(src_h, weight[rel_type]).squeeze(1)
            msg = msg * edges.data['norm']

            if gate_weight is not None:
                gate = torch.bmm(src_h, gate_weight[rel_type]).reshape(-1, 1)
                msg = msg * torch.sigmoid(gate)

            return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            if activation is not None:  # activation=None means identity
                h = activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)

## Define Full RGCN Model

In [None]:
class RGCNModel(nn.Module):
    """Stack of :class:`RGCNLayer` modules applied to a batched graph.

    Args:
        h_dim: width of the input node feature ``'h'``.
        num_rels: number of edge relation types.
        num_hidden_layers: how many R-GCN layers to stack.
        gated: forwarded to every layer.
    """

    def __init__(self, h_dim, num_rels, num_hidden_layers=1, gated = True):
        super(RGCNModel, self).__init__()

        self.h_dim = h_dim
        self.num_rels = num_rels
        self.num_hidden_layers = num_hidden_layers
        self.gated = gated

        # create rgcn layers
        self.build_model()

    def build_model(self):
        """Create ``self.layers``, chaining each layer's output width into the next layer."""
        self.layers = nn.ModuleList()
        in_dim = self.h_dim
        for _ in range(self.num_hidden_layers):
            rgcn_layer = RGCNLayer(in_dim, self.num_rels, activation=F.relu, gated=self.gated)
            self.layers.append(rgcn_layer)
            # A layer's weight is (num_rels, in, out); the next layer consumes `out`,
            # not h_dim, otherwise stacking more than one layer fails on shape.
            in_dim = rgcn_layer.weight.shape[-1]

    def forward(self, g):
        """Apply every layer to ``g`` in place and return one ``'h'`` tensor per graph of the batch."""
        for layer in self.layers:
            layer(g)

        return [sub_g.ndata['h'] for sub_g in dgl.unbatch(g)]

## Design the Main Model (R-GCN + FFNN)

In [None]:
class Head(nn.Module):
    """The MLP submodule: classifies the concatenated R-GCN and BERT-head features into 3 classes.

    Args:
        gcn_out_size: width of one R-GCN node feature.
        bert_out_size: per-mention width of the BERT-head output.

    The first layer expects ``3 * gcn_out_size + 3 * bert_out_size`` input features.
    """

    def __init__(self, gcn_out_size: int, bert_out_size: int):
        super().__init__()
        self.bert_out_size = bert_out_size
        self.gcn_out_size = gcn_out_size

        in_features = bert_out_size * 3 + gcn_out_size * 3
        self.fc = nn.Sequential(
            nn.BatchNorm1d(in_features),
            nn.Dropout(0.5),
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.5),
            nn.Linear(256, 3),
        )
        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                if getattr(module, "weight_v", None) is not None:
                    # Weight-normalised layer: initialise both factors.
                    nn.init.uniform_(module.weight_g, 0, 1)
                    nn.init.kaiming_normal_(module.weight_v)
                else:
                    nn.init.kaiming_normal_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, gcn_outputs, offsets_gcn, bert_embeddings):
        """Return the class logits, one row per sample.

        Args:
            gcn_outputs: sequence with one ``(num_nodes, gcn_out_size)`` tensor per sample.
            offsets_gcn: long tensor whose row ``i`` holds the node ids to pick from
                ``gcn_outputs[i]``.
            bert_embeddings: 2-D tensor concatenated after the picked node features.
        """
        # Pick the requested node rows per sample and flatten them. Stacking the flat
        # vectors keeps the batch dimension even for a batch of one sample.
        gcn_extracted_outputs = torch.stack(
            [node_h[node_ids].reshape(-1) for node_h, node_ids in zip(gcn_outputs, offsets_gcn)],
            dim=0,
        )

        embeddings = torch.cat((gcn_extracted_outputs, bert_embeddings), 1)

        return self.fc(embeddings)


class BERT_Head(nn.Module):
    """Projects the flattened BERT embeddings of one sample to ``3 * out_size`` features.

    Args:
        bert_hidden_size: width of one BERT vector; the layer expects three of them per sample.
        out_size: per-vector output width (defaults to the former fixed 512).
    """

    def __init__(self, bert_hidden_size: int, out_size: int = 512):
        super().__init__()
        self.fc = nn.Sequential(
            nn.BatchNorm1d(bert_hidden_size * 3),
            nn.Dropout(0.5),
            nn.Linear(bert_hidden_size * 3, out_size * 3),
            nn.ReLU(),
        )

        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                if getattr(module, "weight_v", None) is not None:
                    # Weight-normalised layer: initialise both factors.
                    nn.init.uniform_(module.weight_g, 0, 1)
                    nn.init.kaiming_normal_(module.weight_v)
                else:
                    nn.init.kaiming_normal_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, bert_embeddings):
        """Flatten everything after the batch dimension and apply the projection."""
        # reshape (rather than view) also accepts non-contiguous inputs.
        flat = bert_embeddings.reshape(bert_embeddings.shape[0], -1)
        return self.fc(flat)
        
        
    
    
    
class GPRModel(nn.Module):
    """The main model: an R-GCN branch and a BERT projection branch feeding one MLP head."""

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order as before so parameter
        # ordering and state-dict keys stay unchanged.
        self.RGCN = RGCNModel(h_dim=1024, num_rels=3, gated=True)
        self.BERThead = BERT_Head(1024)  # bert output size
        self.head = Head(256, 512)  # gcn output size, BERT-head output size

    def forward(self, offsets_bert, offsets_gcn, bert_embeddings, g):
        """Return the logits of the MLP head; ``offsets_bert`` is accepted but not used."""
        return self.head(
            self.RGCN(g),
            offsets_gcn,
            self.BERThead(bert_embeddings),
        )


# Data Input

## Generate All Syntactic Graphs with BERT embeddings

In [None]:
# spaCy pipeline used to obtain the dependency parse of every sentence.
parser = spacy.load('en_core_web_lg')

BERT_MODEL = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL,
    never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[THISISA]", "[THISISB]", "[THISISP]"),
)

# Register the three custom marker tokens in the vocabulary with the id -1.
for marker_token in ("[THISISA]", "[THISISB]", "[THISISP]"):
    tokenizer.vocab[marker_token] = -1

### Training Data

In [None]:
def is_target(i, target_offset_list):
    """Tell whether index ``i`` is one of the entries of ``target_offset_list``."""
    found = i in target_offset_list
    return found

def transfer_n_e(nodes, edges):
    """Translate edges through the ``nodes`` mapping.

    Returns ``(len(nodes), remapped_edges)`` where every ``[a, b]`` pair of
    ``edges`` becomes ``[nodes[a], nodes[b]]``.
    """
    remapped_edges = [[nodes[src], nodes[dst]] for src, dst in edges]
    return len(nodes), remapped_edges

# Build one DGL graph per training sentence from its spaCy dependency parse.
# all_graphs[i] is the graph of sentence i; gcn_offsets[i] holds the graph node id of
# every entry of offsets_lst[i].
# NOTE(review): the test-data cell repeats this loop with the test_* inputs; the two
# should be kept in sync (or factored into one function).
all_graphs = []
gcn_offsets = []
for i, sent_token in enumerate(token_lst):
    sent_token = token_lst[i]

    # Drop the first and last token id (presumably [CLS]/[SEP] -- confirm), strip '#'
    # characters from the word pieces and re-join with spaces for the spaCy parser.
    sent = ' '.join([re.sub("[#]","",token)   for token in tokenizer.convert_ids_to_tokens(sent_token[1:-1])])
    doc = parser(sent)
    parse_rst = doc.to_json()

    # Shift the stored offsets by -1 so they index the sequence without its first token.
    target_offset_list = [item - 1 for item in offsets_lst[i]]
    
    # nodes: parse token index -> compact graph node id (insertion order).
    # edge_type: 0 = self-loop, 1 = head -> dependent, 2 = dependent -> head.
    nodes = collections.OrderedDict()
    edges = []
    edge_type = []
    for i_word, word in enumerate(parse_rst['tokens']):
        # Keep only tokens that are a target themselves or whose head is a target.
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue

        # Every new node gets a self-loop.
        if i_word not in nodes:
            nodes[i_word] = len(nodes) 
            edges.append( [i_word, i_word] )
            edge_type.append(0)
        if word['head'] not in nodes:
            nodes[word['head']] = len(nodes) 
            edges.append( [word['head'], word['head']] )
            edge_type.append(0)

        # Connect head and dependent in both directions, each with its own relation type.
        if word['dep'] != 'ROOT':
                edges.append( [word['head'], word['id']] )
                edge_type.append(1)
                edges.append( [word['id'], word['head']] )
                edge_type.append(2)

    # Rewrite the edges in terms of compact node ids.
    num_nodes, tran_edges = transfer_n_e(nodes, edges)
    
    # NOTE(review): raises KeyError if a target offset never appears as a parse token
    # index (e.g. if spaCy's tokenisation does not line up with the BERT tokens) -- confirm.
    gcn_offset = [nodes[offset] for offset in target_offset_list]
    gcn_offsets.append(gcn_offset)
    
    G = dgl.DGLGraph()
    G.add_nodes(num_nodes)
    # First argument: all source ids; second argument: all destination ids.
    G.add_edges(list(zip(*tran_edges))[0],list(zip(*tran_edges))[1]) 

    # Node features 'h' (moved to the GPU): target nodes take the matching row of
    # others_bert_outputs; every other node takes the BERT output at i_word + 1
    # (the +1 undoes the -1 shift applied to the offsets above).
    for i_word, word in enumerate(parse_rst['tokens']):
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue
        if is_target(i_word, target_offset_list): 
            G.nodes[ [ nodes[i_word] ]].data['h'] = others_bert_outputs[i][0][target_offset_list.index(i_word)].unsqueeze(0).cuda()
        else:
            G.nodes[ [ nodes[i_word] ]].data['h'] = bert_outputs_lst[i][0][i_word + 1].unsqueeze(0).cuda()
        if is_target(word['head'], target_offset_list):
            G.nodes[ [ nodes[word['head']] ]].data['h'] = others_bert_outputs[i][0][target_offset_list.index(word['head'])].unsqueeze(0).cuda()
        else:   
            G.nodes[ [ nodes[word['head']] ]].data['h'] = bert_outputs_lst[i][0][word['head'] + 1].unsqueeze(0).cuda()

    # Edge normalisation: self-loops keep weight 1; every other edge is divided by the
    # in-degree of its destination minus one (the subtracted one is the node's self-loop).
    edge_norm = []
    for e1, e2 in tran_edges:
        if e1 == e2:
            edge_norm.append(1)
        else:
            edge_norm.append( 1 / (G.in_degree(e2) - 1 ) )


    # rel_type is left on the CPU while norm is moved to the GPU.
    edge_type = torch.from_numpy(np.array(edge_type))
    edge_norm = torch.from_numpy(np.array(edge_norm)).unsqueeze(1).float().cuda()

    G.edata.update({'rel_type': edge_type,})
    G.edata.update({'norm': edge_norm})
    all_graphs.append(G)

### Test Data

In [None]:
# Build one DGL graph per test sentence from its spaCy dependency parse.
# test_all_graphs[i] is the graph of sentence i; test_gcn_offsets[i] holds the graph node
# id of every entry of test_offsets_lst[i].
# NOTE(review): this loop mirrors the training-data cell statement for statement, only
# the test_* inputs differ; the two should be kept in sync (or factored into one function).
test_all_graphs = []
test_gcn_offsets = []
for i, sent_token in enumerate(test_token_lst):
    sent_token = test_token_lst[i]

    # Drop the first and last token id (presumably [CLS]/[SEP] -- confirm), strip '#'
    # characters from the word pieces and re-join with spaces for the spaCy parser.
    sent = ' '.join([re.sub("[#]","",token)   for token in tokenizer.convert_ids_to_tokens(sent_token[1:-1])])
    doc = parser(sent)
    parse_rst = doc.to_json()

    # Shift the stored offsets by -1 so they index the sequence without its first token.
    target_offset_list = [item - 1 for item in test_offsets_lst[i]]
    
    # nodes: parse token index -> compact graph node id (insertion order).
    # edge_type: 0 = self-loop, 1 = head -> dependent, 2 = dependent -> head.
    nodes = collections.OrderedDict()
    edges = []
    edge_type = []
    for i_word, word in enumerate(parse_rst['tokens']):
        # Keep only tokens that are a target themselves or whose head is a target.
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue

        # Every new node gets a self-loop.
        if i_word not in nodes:
            nodes[i_word] = len(nodes) 
            edges.append( [i_word, i_word] )
            edge_type.append(0)
        if word['head'] not in nodes:
            nodes[word['head']] = len(nodes) 
            edges.append( [word['head'], word['head']] )
            edge_type.append(0)

        # Connect head and dependent in both directions, each with its own relation type.
        if word['dep'] != 'ROOT':
                edges.append( [word['head'], word['id']] )
                edge_type.append(1)
                edges.append( [word['id'], word['head']] )
                edge_type.append(2)

    # Rewrite the edges in terms of compact node ids.
    num_nodes, tran_edges = transfer_n_e(nodes, edges)
    
    # NOTE(review): raises KeyError if a target offset never appears as a parse token
    # index (e.g. if spaCy's tokenisation does not line up with the BERT tokens) -- confirm.
    test_gcn_offset = [nodes[offset] for offset in target_offset_list]
    test_gcn_offsets.append(test_gcn_offset)
    
    G = dgl.DGLGraph()
    G.add_nodes(num_nodes)
    # First argument: all source ids; second argument: all destination ids.
    G.add_edges(list(zip(*tran_edges))[0],list(zip(*tran_edges))[1]) 

    # Node features 'h' (moved to the GPU): target nodes take the matching row of
    # test_others_bert_outputs; every other node takes the BERT output at i_word + 1
    # (the +1 undoes the -1 shift applied to the offsets above).
    for i_word, word in enumerate(parse_rst['tokens']):
        if not (is_target(i_word, target_offset_list) or is_target(word['head'], target_offset_list)):
            continue
        
        if is_target(i_word, target_offset_list): 
            G.nodes[ [ nodes[i_word] ]].data['h'] = test_others_bert_outputs[i][0][target_offset_list.index(i_word)].unsqueeze(0).cuda()
        else:
            G.nodes[ [ nodes[i_word] ]].data['h'] = test_bert_outputs_lst[i][0][i_word + 1].unsqueeze(0).cuda()
        if is_target(word['head'], target_offset_list):
            G.nodes[ [ nodes[word['head']] ]].data['h'] = test_others_bert_outputs[i][0][target_offset_list.index(word['head'])].unsqueeze(0).cuda()
        else:   
            G.nodes[ [ nodes[word['head']] ]].data['h'] = test_bert_outputs_lst[i][0][word['head'] + 1].unsqueeze(0).cuda()

    # Edge normalisation: self-loops keep weight 1; every other edge is divided by the
    # in-degree of its destination minus one (the subtracted one is the node's self-loop).
    edge_norm = []
    for e1, e2 in tran_edges:
        if e1 == e2:
            edge_norm.append(1)
        else:
            edge_norm.append( 1 / (G.in_degree(e2) - 1 ) )


    # rel_type is left on the CPU while norm is moved to the GPU.
    edge_type = torch.from_numpy(np.array(edge_type))
    edge_norm = torch.from_numpy(np.array(edge_norm)).unsqueeze(1).float().cuda()

    G.edata.update({'rel_type': edge_type,})
    G.edata.update({'norm': edge_norm})
    test_all_graphs.append(G)

## Design Dataloader and Dataset

In [None]:
class GPRDataset(Dataset):
    """Index-aligned bundle of graphs, offsets, BERT embeddings and one-hot labels.

    The label matrix ``y`` has three boolean columns: ``A-coref``, ``B-coref``
    and ``Neither`` (true when neither of the first two is set).
    """

    def __init__(self, original_df, graphs, bert_offsets, gcn_offsets, bert_embeddings):
        a_or_b = original_df["A-coref"] | original_df["B-coref"]
        label_frame = original_df[["A-coref", "B-coref"]].copy()
        label_frame["Neither"] = ~a_or_b
        self.y = label_frame.values.astype("bool")

        self.graphs = graphs
        self.gcn_offsets = gcn_offsets
        self.bert_offsets = bert_offsets  # already shifted by +1
        self.bert_embeddings = bert_embeddings  # includes [CLS]

    def __len__(self):
        """Number of samples, taken from the graph list."""
        return len(self.graphs)

    def __getitem__(self, idx):
        """Return ``(graph, bert_offsets, gcn_offsets, bert_embeddings, y)`` for sample ``idx``."""
        sample = (
            self.graphs[idx],
            self.bert_offsets[idx],
            self.gcn_offsets[idx],
            self.bert_embeddings[idx],
            self.y[idx],
        )
        return sample

In [None]:
def collate(samples):
    """Merge a list of dataset samples into one batch.

    Args:
        samples: list of ``(graph, bert_offsets, gcn_offsets, bert_embeddings, one_hot_label)``
            tuples as returned by ``GPRDataset.__getitem__``.

    Returns:
        ``(batched_graph, offsets_bert, offsets_gcn, bert_embeddings, labels)`` where
        ``labels`` holds the index of the largest entry of each one-hot label row and
        ``bert_embeddings`` always keeps the batch as its first dimension.
    """
    graphs, bert_offsets, gcn_offsets, bert_embeddings, labels = map(list, zip(*samples))

    batched_graph = dgl.batch(graphs)
    offsets_bert = torch.stack([torch.LongTensor(x) for x in bert_offsets], dim=0)
    offsets_gcn = torch.stack([torch.LongTensor(x) for x in gcn_offsets], dim=0)

    one_hot_labels = torch.stack([torch.from_numpy(x.astype("uint8")) for x in labels], dim=0)
    _, labels = one_hot_labels.max(dim=1)

    stacked = torch.stack(bert_embeddings, dim=0)
    # Drop singleton axes of the per-sample embedding but never the batch axis: a bare
    # squeeze() would also remove the batch dimension when the batch holds one sample.
    kept_shape = [stacked.shape[0]] + [size for size in stacked.shape[1:] if size != 1]
    bert_embeddings = stacked.reshape(kept_shape)

    return batched_graph, offsets_bert, offsets_gcn, bert_embeddings, labels

## Test DataLoader

In [None]:
# Dataset over the held-out test frame. The training and validation datasets are
# created per fold inside the 5-fold cell, so no global train_dataset is built here.
test_dataset = GPRDataset(
    original_df=test_df,
    graphs=test_all_graphs,
    bert_offsets=test_offsets_lst,
    gcn_offsets=test_gcn_offsets,
    bert_embeddings=test_others_bert_outputs,
)

In [None]:
# Test batches are served in dataset order (no shuffling). The shuffled training
# loader is created per fold inside the 5-fold cell.
test_dataloarder = DataLoader(test_dataset, batch_size=4, collate_fn=collate)

# Training Part

In [None]:
def send_graph_to_cpu(g):
    """Replace every node and edge feature of ``g`` with its CPU copy and return ``g``."""
    # Nodes are handled first, then edges; each feature is popped and stored back
    # under the same key after calling .cpu() on it.
    for data_attr, list_schemes in (("ndata", g.node_attr_schemes), ("edata", g.edge_attr_schemes)):
        for feature_name in list_schemes().keys():
            on_cpu = getattr(g, data_attr).pop(feature_name).cpu()
            getattr(g, data_attr)[feature_name] = on_cpu
    return g

In [None]:
lr_value = 0.0001
total_epoch = 100
def adjust_learning_rate(optimizers, epoch):
    """Set and return the learning rate used for ``epoch``.

    Schedule:
      * ``epoch < 10``: constant warm-up rate of 1e-5;
      * ``10 <= epoch <= 36``: ``lr_value * (1 - epoch / total_epoch) ** 0.9``;
      * ``epoch > 36``: ``1.5e-5 * (1 - epoch / total_epoch) ** 0.9``.

    Args:
        optimizers: iterable of optimizers whose ``param_groups`` are updated in place.
        epoch: zero-based epoch index.

    Returns:
        The learning rate written to every parameter group.
    """
    # The decay horizon follows total_epoch instead of a second hard-coded 100, and the
    # base is clamped at 0: a negative base raised to 0.9 would yield a complex number.
    decay = pow(max(0.0, 1 - 1.0 * epoch / total_epoch), 0.9)

    if epoch > 36:
        lr_tmp = 0.000015 * decay
    elif epoch < 10:
        # warm up
        lr_tmp = 0.00001
    else:
        lr_tmp = lr_value * decay

    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_tmp

    return lr_tmp

In [None]:
# generate labels: class index over the columns (A-coref, B-coref, Neither),
# used to stratify the folds
coref_flags = train_df[["A-coref", "B-coref"]].copy()
coref_flags["Neither"] = ~(train_df["A-coref"] | train_df["B-coref"])
train_y = np.argmax(coref_flags.values.astype("bool"), axis=1)

## 5 Fold

In [None]:
from operator import itemgetter  # no longer needed by this cell; kept so the name stays defined

# Weights of the best epoch of the fold currently being trained.
CHECKPOINT_PATH = 'best_model.pth'


def _select(items, indices):
    """Return the elements of ``items`` at ``indices`` as a list (also correct for one index)."""
    return [items[j] for j in indices]


def _make_fold_dataset(indices):
    """Build a GPRDataset over the rows of the training data selected by ``indices``."""
    return GPRDataset(original_df=train_df.iloc[indices],
                      graphs=_select(all_graphs, indices),
                      bert_offsets=_select(offsets_lst, indices),
                      gcn_offsets=_select(gcn_offsets, indices),
                      bert_embeddings=_select(others_bert_outputs, indices))


kfold = StratifiedKFold(n_splits = 5)
test_predict_lst = [] # the test output (logits) for every fold
for fold, (train_index, test_index) in enumerate(kfold.split(train_df, train_y), start=1):
    print("=" * 20)
    print(f"Fold {fold}")
    print("=" * 20)

    val_dataset = _make_fold_dataset(test_index)
    train_dataset = _make_fold_dataset(train_index)

    train_dataloarder = DataLoader(
        train_dataset,
        collate_fn = collate,
        batch_size = 4,
        shuffle=True,
    )

    val_dataloarder = DataLoader(
        val_dataset,
        collate_fn = collate,
        batch_size = 4,
    )

    model = GPRModel().cuda()
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr_value)
    reg_lambda = 0.035  # weight of the summed per-parameter L2 norms

    print('Dataloader Success---------------------')

    best_val_loss = float('inf')
    checkpoint_saved = False
    for epoch in range(total_epoch):

        if epoch % 5 == 0:
            print('|',">" * epoch," "*(80-epoch),'|')

        adjust_learning_rate([optimizer], epoch)

        # ---- training ----
        model.train()
        for batched_graph, offsets_bert, offsets_gcn, bert_embeddings, labels in train_dataloarder:
            bert_embeddings = bert_embeddings.cuda()
            labels = labels.cuda()
            offsets_gcn = offsets_gcn.cuda()

            prediction = model(offsets_bert, offsets_gcn, bert_embeddings, batched_graph)

            # Sum of the 2-norms of all parameters (the RGCN, head and BERThead
            # sub-modules together hold every parameter of the model). Summing
            # directly avoids testing the truthiness of a tensor.
            l2_reg = sum(w.norm(2) for w in model.parameters())
            loss = loss_func(prediction, labels) + l2_reg * reg_lambda

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ---- validation ----
        val_loss = 0.0
        num_val_batches = 0
        model.eval()
        with torch.no_grad():
            for batched_graph, offsets_bert, offsets_gcn, bert_embeddings, labels in val_dataloarder:
                offsets_gcn = offsets_gcn.cuda()
                bert_embeddings = bert_embeddings.cuda()
                labels = labels.cuda()
                prediction = model(offsets_bert, offsets_gcn, bert_embeddings, batched_graph)
                val_loss += loss_func(prediction, labels).item()
                num_val_batches += 1
        val_loss /= max(num_val_batches, 1)

        if epoch%20 == 0:
            print('Epoch {}, val_loss {:.4f}'.format(epoch, val_loss))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Save on every improvement so the checkpoint always matches best_val_loss.
            # Saving only late epochs could leave a previous fold's weights on disk.
            torch.save(model.state_dict(), CHECKPOINT_PATH)
            checkpoint_saved = True
            if epoch > 36: print('Best val loss found: ', best_val_loss)

    print('This fold, the best val loss is: ', best_val_loss)

    if not checkpoint_saved:
        # Happens only if the validation loss never improved (e.g. it was NaN).
        raise RuntimeError(f'Fold {fold}: no checkpoint was written, refusing to evaluate stale weights')

    # ---- test evaluation with the best checkpoint of this fold, on the CPU ----
    model = GPRModel()
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=torch.device('cpu')))
    model.eval()

    test_loss = 0.0
    num_test_batches = 0
    fold_predictions = []
    with torch.no_grad():
        for batched_graph, offsets_bert, offsets_gcn, bert_embeddings, labels in test_dataloarder:
            bert_embeddings = bert_embeddings.cpu()
            # The graphs were built with CUDA features; move the batch copy to the CPU.
            batched_graph = send_graph_to_cpu(batched_graph)

            prediction = model(offsets_bert, offsets_gcn, bert_embeddings, batched_graph)
            fold_predictions.append(prediction)
            test_loss += loss_func(prediction, labels).item()
            num_test_batches += 1

    test_loss /= max(num_test_batches, 1)
    print('This fold, the test loss is: ', test_loss)
    test_predict_lst.append(torch.cat(fold_predictions, 0) if fold_predictions else None)

# Test Part

In [None]:
# Turn the logits of every fold into class probabilities, clipped away from 0 and 1.
test_predict_arr = []
for fold_logits in test_predict_lst:
    fold_probs = torch.softmax(fold_logits, -1).clamp(1e-4, 1-1e-4)
    test_predict_arr.append(fold_probs.numpy())

In [None]:
# Average the per-fold probabilities element-wise.
final_test_preds = np.asarray(test_predict_arr).mean(axis=0)

In [None]:
from sklearn.metrics import log_loss

def extract_target(df):
    """Add ``Neither`` and ``target`` columns to ``df`` in place and return ``df``.

    ``Neither`` is 1 where neither ``A-coref`` nor ``B-coref`` is set, else 0.
    ``target`` is 0 by default, 1 where ``B-coref`` is set and 2 where ``Neither`` is 1.
    """
    neither_mask = ~(df['A-coref'] | df['B-coref'])
    df["Neither"] = 0
    df.loc[neither_mask, "Neither"] = 1

    df["target"] = 0
    # Order matters: the Neither rows are written last.
    for column, class_id in (('B-coref', 1), ("Neither", 2)):
        df.loc[df[column] == 1, "target"] = class_id
    return df
# Attach the class-index column to the test frame, then display the multi-class
# log loss of the fold-averaged predictions.
test_df = extract_target(test_df)
log_loss(test_df["target"], final_test_preds)

In [None]:
# df_sub = pd.DataFrame(final_test_preds, columns=["A", "B", "NEITHER"])
# df_sub["ID"] = test_df.ID
# df_sub = df_sub[['ID',"A", "B", "NEITHER"]]
# df_sub.to_csv("submission_415_copy3.csv", index=False)
# df_sub.head()

In [None]:
#df_sub.to_csv("submission_414_lg.csv", index=False)