In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt

In [10]:
!pip install dgl

Collecting dgl
[?25l  Downloading https://files.pythonhosted.org/packages/71/c4/ce24841375cf4393787dbf9a645e271c19a03d2d9a0e5770b08ba76bcfde/dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4MB)
[K     |████████████████████████████████| 4.4MB 6.5MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1


In [11]:
import dgl
from dgl import DGLGraph
from dgl.data import MiniGCDataset
import dgl.function as fn

DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [13]:
!pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 11.7MB/s eta 0:00:01[K     |█████▎                          | 20kB 16.2MB/s eta 0:00:01[K     |████████                        | 30kB 10.5MB/s eta 0:00:01[K     |██████████▋                     | 40kB 8.3MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 5.3MB/s eta 0:00:01[K     |███████████████▉                | 61kB 5.2MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 5.7MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 6.0MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 6.4MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 6.6MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 6.6MB/s eta 0:00:01[K     |████████████████████

In [14]:
from functools import partial

import re
import numpy as np
import pandas as pd

from pytorch_pretrained_bert import BertTokenizer

import spacy
import pickle
import collections

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import os


In [15]:
!wget gap-test.tsv https://raw.githubusercontent.com/ianycxu/RGCN-with-BERT/master/Final%20Codes/gap-test.tsv

--2021-05-12 00:15:19--  http://gap-test.tsv/
Resolving gap-test.tsv (gap-test.tsv)... failed: Name or service not known.
wget: unable to resolve host address ‘gap-test.tsv’
--2021-05-12 00:15:20--  https://raw.githubusercontent.com/ianycxu/RGCN-with-BERT/master/Final%20Codes/gap-test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1075889 (1.0M) [text/plain]
Saving to: ‘gap-test.tsv’


2021-05-12 00:15:20 (16.8 MB/s) - ‘gap-test.tsv’ saved [1075889/1075889]

FINISHED --2021-05-12 00:15:20--
Total wall clock time: 0.5s
Downloaded: 1 files, 1.0M in 0.06s (16.8 MB/s)


In [17]:
!wget gap-validation.tsv https://raw.githubusercontent.com/ianycxu/RGCN-with-BERT/master/Final%20Codes/gap-validation.tsv

--2021-05-12 00:16:01--  http://gap-validation.tsv/
Resolving gap-validation.tsv (gap-validation.tsv)... failed: Name or service not known.
wget: unable to resolve host address ‘gap-validation.tsv’
--2021-05-12 00:16:02--  https://raw.githubusercontent.com/ianycxu/RGCN-with-BERT/master/Final%20Codes/gap-validation.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 245089 (239K) [text/plain]
Saving to: ‘gap-validation.tsv’


2021-05-12 00:16:02 (6.98 MB/s) - ‘gap-validation.tsv’ saved [245089/245089]

FINISHED --2021-05-12 00:16:02--
Total wall clock time: 0.5s
Downloaded: 1 files, 239K in 0.03s (6.98 MB/s)


In [None]:
# Colab-only manual file upload. This cell has no execution count, and
# `uploaded` is not referenced anywhere else in the visible cells; the TSV
# files read below are the ones fetched by the wget cells.
from google.colab import files 
uploaded = files.upload()

In [18]:
# Stack the two GAP splits row-wise into a single frame. Each file keeps its
# own row labels, so index values repeat between the two halves.
train_df = pd.concat(
    [
        pd.read_csv(tsv_path, delimiter="\t")
        for tsv_path in ("gap-test.tsv", "gap-validation.tsv")
    ],
    axis=0,
)

# **RGCN**

In [19]:
class RGCNLayer(nn.Module):
    """One relational graph-convolution layer with optional per-edge gating.

    For every edge, the source node feature ``h`` is multiplied by the weight
    matrix selected by the edge's ``rel_type``, scaled by the edge's ``norm``
    value and, when ``gated`` is true, by a sigmoid gate computed from the same
    source feature. Messages are summed into the destination node's ``h`` and
    passed through ``activation``.

    Args:
        feat_size: Size of the input node feature ``h``.
        num_rels: Number of relation types (first dimension of the weights).
        activation: Callable applied to the aggregated features, or ``None``
            to leave them unchanged.
        gated: Whether to learn and apply the per-relation gate.
        out_feat: Size of the output node feature. Defaults to 256, the value
            that was previously hard-coded.
    """

    def __init__(self, feat_size, num_rels, activation=None, gated = True, out_feat=256):
        super(RGCNLayer, self).__init__()
        self.feat_size = feat_size
        self.num_rels = num_rels
        self.activation = activation
        self.gated = gated
        self.out_feat = out_feat

        # One (feat_size x out_feat) projection per relation type.
        self.weight = nn.Parameter(torch.empty(self.num_rels, self.feat_size, self.out_feat))
        nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))

        if self.gated:
            # One (feat_size x 1) gate projection per relation type.
            self.gate_weight = nn.Parameter(torch.empty(self.num_rels, self.feat_size, 1))
            nn.init.xavier_uniform_(self.gate_weight, gain=nn.init.calculate_gain('sigmoid'))

    def forward(self, g):
        """Run one round of message passing on ``g``.

        Reads ``h`` from source nodes and ``rel_type`` / ``norm`` from edges,
        and writes the aggregated result back to the nodes under ``h``.
        Returns ``None``; the graph is updated in place.
        """
        weight = self.weight
        # Only touch gate_weight when it exists; it is not created for gated=False.
        gate_weight = self.gate_weight if self.gated else None
        activation = self.activation

        def message_func(edges):
            src_h = edges.src['h'].unsqueeze(1)                 # (E, 1, feat_size)
            w = weight[edges.data['rel_type']]                  # (E, feat_size, out_feat)
            # squeeze(1) removes only the singleton matmul axis, so a batch
            # holding a single edge keeps its leading edge dimension.
            msg = torch.bmm(src_h, w).squeeze(1)
            msg = msg * edges.data['norm']

            if gate_weight is not None:
                gate_w = gate_weight[edges.data['rel_type']]    # (E, feat_size, 1)
                gate = torch.sigmoid(torch.bmm(src_h, gate_w).reshape(-1, 1))
                msg = msg * gate

            return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            # activation=None is the constructor default; treat it as identity.
            if activation is not None:
                h = activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)

In [20]:
class RGCNModel(nn.Module):
    """A stack of ``RGCNLayer`` modules applied to a batched DGL graph.

    Every layer is built with input size ``h_dim``, ``num_rels`` relation
    types, ReLU activation and the shared ``gated`` flag.
    """

    def __init__(self, h_dim, num_rels, num_hidden_layers=1, gated = True):
        super(RGCNModel, self).__init__()

        self.h_dim = h_dim
        self.num_rels = num_rels
        self.num_hidden_layers = num_hidden_layers
        self.gated = gated

        self.build_model()

    def build_model(self):
        """Create ``num_hidden_layers`` R-GCN layers and register them in ``self.layers``."""
        stacked = [
            RGCNLayer(self.h_dim, self.num_rels, activation=F.relu, gated = self.gated)
            for _ in range(self.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(stacked)

    def forward(self, g):
        """Run every layer on ``g`` and return the node features ``h`` of each sub-graph.

        The result is a list with one entry per graph in the batch, in the
        order produced by ``dgl.unbatch``.
        """
        for rgcn in self.layers:
            rgcn(g)

        return [member.ndata['h'] for member in dgl.unbatch(g)]

In [21]:
class Head(nn.Module):
    """The MLP submodule.

    Concatenates three gathered GCN node vectors per example with the BERT
    head features and maps them to 3 output logits.

    Args:
        gcn_out_size: Width of one GCN node vector.
        bert_out_size: One third of the width of the BERT feature vector
            (the input is expected to be ``bert_out_size * 3`` wide).
    """
    def __init__(self, gcn_out_size: int, bert_out_size: int):
        super().__init__()
        self.bert_out_size = bert_out_size
        self.gcn_out_size = gcn_out_size

        in_features = bert_out_size * 3 + gcn_out_size * 3
        self.fc = nn.Sequential(
            nn.BatchNorm1d(in_features),
            nn.Dropout(0.5),
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.5),
            nn.Linear(256, 3),
        )
        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                if getattr(module, "weight_v", None) is not None:
                    # Weight-normalised Linear: initialise magnitude and direction.
                    # (The previous assert here referenced an undefined name `model`.)
                    nn.init.uniform_(module.weight_g, 0, 1)
                    nn.init.kaiming_normal_(module.weight_v)
                else:
                    nn.init.kaiming_normal_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, gcn_outputs, offsets_gcn, bert_embeddings):
        """Classify each example from its GCN and BERT features.

        Args:
            gcn_outputs: Sequence of per-graph node-feature matrices, one
                ``(num_nodes, gcn_out_size)`` tensor per example.
            offsets_gcn: Per-example 1-D index tensors selecting the node rows
                to keep (indexed as ``offsets_gcn[i]``).
            bert_embeddings: ``(batch, bert_out_size * 3)`` tensor.

        Returns:
            ``(batch, 3)`` tensor produced by the MLP.
        """
        # Select the requested node rows of each graph and flatten them to one
        # vector per example. Stacking 1-D vectors yields (batch, features)
        # directly, so a batch of one keeps its batch dimension.
        gathered = [
            node_feats.index_select(0, offsets_gcn[i]).reshape(-1)
            for i, node_feats in enumerate(gcn_outputs)
        ]
        gcn_extracted_outputs = torch.stack(gathered, dim=0)

        embeddings = torch.cat((gcn_extracted_outputs, bert_embeddings), 1)

        return self.fc(embeddings)


class BERT_Head(nn.Module):
    """Projects the BERT embeddings of one example down to ``512 * 3`` features.

    The input is flattened per example to ``bert_hidden_size * 3`` values and
    passed through BatchNorm -> Dropout -> Linear -> ReLU.

    Args:
        bert_hidden_size: Width of a single BERT embedding vector.
    """
    def __init__(self, bert_hidden_size: int):
        super().__init__()
        self.fc = nn.Sequential(
            nn.BatchNorm1d(bert_hidden_size * 3),
            nn.Dropout(0.5),
            nn.Linear(bert_hidden_size * 3, 512 * 3),
            nn.ReLU(),
        )

        for module in self.fc:
            if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                if getattr(module, "weight_v", None) is not None:
                    # Weight-normalised Linear: initialise magnitude and direction.
                    # (The previous assert here referenced an undefined name `model`.)
                    nn.init.uniform_(module.weight_g, 0, 1)
                    nn.init.kaiming_normal_(module.weight_v)
                else:
                    nn.init.kaiming_normal_(module.weight)
                nn.init.constant_(module.bias, 0)

    def forward(self, bert_embeddings):
        """Flatten each example's embeddings and apply the projection.

        Args:
            bert_embeddings: Tensor whose first dimension is the batch and
                whose remaining dimensions hold ``bert_hidden_size * 3`` values.

        Returns:
            ``(batch, 512 * 3)`` tensor of non-negative activations.
        """
        # reshape (rather than view) also accepts non-contiguous inputs.
        flat = bert_embeddings.reshape(bert_embeddings.shape[0], -1)
        return self.fc(flat)
        
        
    
    
    
class GPRModel(nn.Module):
    """The main model.

    Combines three sub-modules: an ``RGCNModel`` over the graph (1024-wide
    input features, 3 relation types, gated), a ``BERT_Head`` that compresses
    the 1024-wide BERT embeddings, and a ``Head`` MLP that classifies the
    concatenation of both.
    """

    def __init__(self):
        super().__init__()
        self.RGCN = RGCNModel(h_dim=1024, num_rels=3, gated=True)
        # Argument is the BERT output size.
        self.BERThead = BERT_Head(1024)
        # Arguments are the GCN output size and the per-span BERT_Head output size.
        self.head = Head(256, 512)

    def forward(self, offsets_bert, offsets_gcn, bert_embeddings, g):
        """Return the classifier output for a batch.

        ``offsets_bert`` is accepted for interface compatibility but is not
        used by this method.
        """
        # Arguments are evaluated left to right: R-GCN first, then the BERT head.
        return self.head(
            self.RGCN(g),
            offsets_gcn,
            self.BERThead(bert_embeddings),
        )

In [24]:
# Download spaCy's `en_core_web_sm` pipeline so it can be loaded with spacy.load.
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
# Build the BERT tokenizer and register the three position markers as
# never-split tokens. The same setup is repeated in a later cell.
BERT_MODEL = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[THISISA]", "[THISISB]", "[THISISP]"))

# Give the markers a vocabulary entry with placeholder id -1. tokenize() below
# removes them from the token list before any ids are looked up.
tokenizer.vocab["[THISISA]"] = -1
tokenizer.vocab["[THISISB]"] = -1
tokenizer.vocab["[THISISP]"] = -1

In [25]:
def is_target(i, target_offset_list):
    """Return True when ``i`` is a member of ``target_offset_list``."""
    return i in target_offset_list

def transfer_n_e(nodes, edges):
    """Translate edge endpoints through the ``nodes`` lookup.

    Args:
        nodes: Mapping (or sequence) indexed by the endpoint values in ``edges``.
        edges: Iterable of ``(e1, e2)`` pairs.

    Returns:
        ``(len(nodes), [[nodes[e1], nodes[e2]], ...])``.
    """
    node_count = len(nodes)
    remapped = [[nodes[src], nodes[dst]] for src, dst in edges]
    return node_count, remapped

# Module-level accumulators; nothing in the visible cells fills them.
# NOTE(review): presumably one graph and one set of target-node offsets per
# example — confirm against the graph-building code.
all_graphs = []
gcn_offsets = []

# **BERT Embedding**

In [32]:
import gc
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader

from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

In [33]:
def insert_tag(row):
    """
    Mark the positions of A, B and the pronoun inside the raw text.

    The markers `` THISISA ``, `` THISISB `` and `` THISISP `` are spliced in
    at the character offsets stored in the row so the three positions can be
    recovered after tokenization. Returns the tagged text.
    """
    insertions = [
        (row["A-offset"], " THISISA "),
        (row["B-offset"], " THISISB "),
        (row["Pronoun-offset"], " THISISP "),
    ]
    # Insert from the largest offset down to the smallest so that an insertion
    # never shifts the character positions of the ones still pending.
    insertions.sort(key=lambda pair: pair[0], reverse=True)

    tagged = row["Text"]
    for position, marker in insertions:
        tagged = "".join((tagged[:position], marker, tagged[position:]))
    return tagged

def clean_and_replace_target_name(row):
    """
    Reduce the tagged text to letters and substitute both candidate names.

    Every non-alphabetic character in ``row['TextClean']``, ``row['A']`` and
    ``row['B']`` becomes a space. Each cleaned name is then replaced in the
    text by the first token the module-level ``tokenizer`` yields for it, the
    bare position markers are wrapped in square brackets, and runs of spaces
    are collapsed to a single space.
    """
    def letters_only(value):
        return re.sub("[^a-zA-Z]", " ", value)

    cleaned = letters_only(row['TextClean'])
    candidate_names = (letters_only(row['A']), letters_only(row['B']))

    # Replace A first, then B, each with its first word piece.
    for name in candidate_names:
        cleaned = re.sub(str(name), tokenizer.tokenize(name)[0], cleaned)

    # Turn the plain markers into the bracketed never-split tokens.
    marker_pairs = (
        (r"THISISA", r"[THISISA]"),
        (r"THISISB", r"[THISISB]"),
        (r"THISISP", r"[THISISP]"),
    )
    for bare, bracketed in marker_pairs:
        cleaned = re.sub(bare, bracketed, cleaned)

    return re.sub(' +', ' ', cleaned)

def generate_text(row):
    """Tag the row's text, store it on the row as ``TextClean`` and return the cleaned text."""
    tagged = insert_tag(row)
    row.loc['TextClean'] = tagged
    return clean_and_replace_target_name(row)

In [35]:
# Build the BERT tokenizer with the three position markers registered as
# never-split tokens. This repeats the setup of an earlier cell.
BERT_MODEL = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[THISISA]", "[THISISB]", "[THISISP]"))
# Placeholder id -1: tokenize() below drops the markers from the token list
# before any ids are looked up.
tokenizer.vocab["[THISISA]"] = -1
tokenizer.vocab["[THISISB]"] = -1
tokenizer.vocab["[THISISP]"] = -1

def tokenize(text, tokenizer):
    """
    Split ``text`` into tokens and locate A, B and the pronoun.

    The three bracketed markers are removed from the token stream. For each
    marker the recorded position is the number of tokens kept so far plus one.

    Returns ``(tokens, (pos_A, pos_B, pos_P))``; raises ``KeyError`` if a
    marker never appears.
    """
    markers = ("[THISISA]", "[THISISB]", "[THISISP]")
    positions = {}
    kept = []
    for piece in tokenizer.tokenize(text):
        if piece not in markers:
            kept.append(piece)
        else:
            # +1 relative to the kept-token count (the caller prepends [CLS]).
            positions[piece] = 1 + len(kept)
    return kept, tuple(positions[marker] for marker in markers)

In [36]:
# Per-example results, filled by the loop over train_df below:
# offsets_lst holds the (A, B, pronoun) token positions, tokens_lst the id sequences.
offsets_lst = []
tokens_lst = []

In [38]:
train_df

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers
...,...,...,...,...,...,...,...,...,...,...,...
449,validation-450,"He then agrees to name the gargoyle Goldie, af...",He,305,Lucien,252,False,Abel,264,False,http://en.wikipedia.org/wiki/Goldie_(DC_Comics)
450,validation-451,"Disgusted with the family's ``mendacity'', Bri...",she,365,Maggie,242,False,Mae,257,False,http://en.wikipedia.org/wiki/Cat_on_a_Hot_Tin_...
451,validation-452,She manipulates Michael into giving her custod...,she,306,Scarlett,255,False,Alice,291,True,http://en.wikipedia.org/wiki/Michael_Moon_(Eas...
452,validation-453,"On April 4, 1986, Donal Henahan wrote in the N...",her,330,Aida,250,False,Miss Millo,294,True,http://en.wikipedia.org/wiki/Aprile_Millo


In [62]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every example to accumulators created in another cell.
offsets_lst = []
tokens_lst = []
for _, row in train_df.iterrows():
    text = generate_text(row)
    tokens, offsets = tokenize(text, tokenizer)
    offsets_lst.append(offsets)
    # Wrap the word pieces in [CLS] ... [SEP] before mapping them to vocabulary ids.
    tokens_lst.append(tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"]))
    

In [66]:
# NOTE(review): this displays every entry; a slice such as offsets_lst[:5] would keep the output readable.
offsets_lst

[(70, 72, 75),
 (58, 64, 68),
 (47, 54, 57),
 (73, 102, 100),
 (81, 106, 79),
 (52, 57, 63),
 (33, 42, 54),
 (51, 56, 61),
 (23, 31, 41),
 (15, 26, 37),
 (58, 67, 63),
 (36, 38, 52),
 (38, 41, 59),
 (37, 45, 52),
 (1, 13, 22),
 (61, 74, 54),
 (33, 49, 51),
 (30, 41, 52),
 (37, 43, 48),
 (1, 36, 22),
 (52, 54, 59),
 (47, 51, 57),
 (39, 49, 57),
 (52, 89, 90),
 (62, 73, 77),
 (43, 58, 56),
 (1, 5, 7),
 (51, 62, 48),
 (36, 41, 47),
 (39, 54, 52),
 (44, 45, 73),
 (47, 54, 56),
 (66, 68, 71),
 (58, 71, 52),
 (47, 49, 56),
 (43, 52, 54),
 (16, 19, 40),
 (18, 24, 41),
 (80, 86, 91),
 (24, 31, 34),
 (53, 54, 50),
 (91, 97, 85),
 (37, 42, 46),
 (60, 68, 77),
 (31, 36, 38),
 (48, 50, 66),
 (52, 60, 67),
 (2, 23, 43),
 (34, 44, 52),
 (48, 65, 77),
 (14, 27, 53),
 (41, 56, 64),
 (62, 71, 67),
 (32, 71, 26),
 (34, 47, 48),
 (34, 60, 62),
 (51, 53, 60),
 (36, 45, 47),
 (56, 57, 60),
 (50, 61, 64),
 (59, 90, 87),
 (1, 14, 34),
 (36, 42, 62),
 (33, 53, 56),
 (1, 34, 9),
 (39, 48, 50),
 (43, 45, 47),
 

In [67]:
# NOTE(review): this displays every id of every example; a slice such as tokens_lst[0][:10] would keep the output readable.
tokens_lst

[[101,
  2588,
  2037,
  9920,
  2046,
  1996,
  12849,
  26730,
  15758,
  3873,
  2223,
  2139,
  2187,
  6435,
  2000,
  3696,
  1037,
  3206,
  1999,
  2762,
  2007,
  15501,
  2278,
  1049,
  13316,
  10222,
  1997,
  1996,
  3972,
  2006,
  2238,
  2044,
  11847,
  1996,
  2446,
  2528,
  2007,
  1996,
  1049,
  13316,
  10222,
  2136,
  1999,
  2002,
  2187,
  1996,
  2252,
  1998,
  2001,
  3856,
  2039,
  2011,
  3507,
  3972,
  2217,
  15501,
  2278,
  4702,
  9695,
  1999,
  2251,
  2280,
  7097,
  2121,
  5639,
  10514,
  3334,
  1998,
  4386,
  12968,
  3960,
  2024,
  2139,
  1055,
  27328,
  2010,
  5542,
  2003,
  5135,
  3748,
  1055,
  6585,
  2952,
  4575,
  10514,
  3334,
  102],
 [101,
  2090,
  1996,
  2086,
  2314,
  2180,
  2176,
  2334,
  4486,
  1998,
  2150,
  2028,
  1997,
  1996,
  2087,
  6450,
  2780,
  1999,
  1996,
  2088,
  2007,
  1037,
  2034,
  2136,
  17649,
  11320,
  4226,
  2652,
  1999,
  2223,
  2399,
  1998,
  2019,
  8053,
  8919,
  2117,
  

In [63]:
# Length of the longest id sequence (including [CLS] and [SEP]).
max(map(len, tokens_lst))

269

In [64]:
# Pad with 0, or truncate, every id sequence to exactly max_len entries.
# Truncation cuts from the end, so a sequence longer than max_len loses its
# trailing [SEP].
max_len = 269
tokens = np.zeros((len(tokens_lst), max_len), dtype=np.int64)
for i, row in enumerate(tokens_lst):
    # Slice with max_len (not a repeated literal) so changing max_len above
    # cannot leave a row longer than the destination buffer.
    row = np.array(row[:max_len], dtype=np.int64)
    tokens[i, :len(row)] = row

# All sentences
token_tensor = torch.from_numpy(tokens)

In [65]:
# Load the pretrained BERT encoder named by BERT_MODEL (the recorded run shows
# a ~1.25 GB download).
# NOTE(review): the pytorch_pretrained_bert README calls model.eval() after
# from_pretrained to disable dropout — confirm it is called before inference.
bert = BertModel.from_pretrained(BERT_MODEL)

100%|██████████| 1248501532/1248501532 [00:28<00:00, 44397477.17B/s]


In [None]:
# from_pretrained in pytorch_pretrained_bert does not switch the model to eval
# mode; do it here so dropout is off and the extracted embeddings are
# deterministic.
bert.eval()

bert_outputs = []
with torch.no_grad():
    for i in range(len(token_tensor)):
        if i % 40 == 0:
            print(i)
        # One example at a time, as a batch of size 1.
        input_ids = token_tensor[i].unsqueeze(0)
        bert_output, _ =  bert(
                    input_ids,
                    # Padding positions hold id 0 and are masked out.
                    attention_mask=(input_ids > 0).long(),
                    token_type_ids=None,
                    output_all_encoded_layers=False)

        bert_outputs.append(bert_output)