In [1]:
import pandas as pd
import numpy as np
import os
import nltk
import spacy
from spacy.lang.en import English
from nltk import Tree
from tqdm import tqdm
from keras.layers import *
from keras.models import Model, Sequential
import keras.backend as K
print(os.listdir('../input'))
print(os.listdir('../input/gap-coreference'))

Using TensorFlow backend.


['gap-coreference', 'gendered-pronoun-resolution']
['gap-development.tsv', 'gap-test.tsv', 'gap-validation.tsv']


In [2]:
test_df = pd.read_table('../input/gap-coreference/gap-development.tsv')
train_df  = pd.read_table('../input/gap-coreference/gap-test.tsv')
val_df   = pd.read_table('../input/gap-coreference/gap-validation.tsv')
nlp = spacy.load('en_core_web_lg')
train_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers


In [3]:
def bs(lens, target):

    low, high = 0, len(lens) -1

    while low < high:
        mid = low + int((high - low) / 2)

        if target > lens[mid]:
            low = mid+1
        elif target < lens[mid]:
            high = mid
        else:
            return mid+1
        
    return low

class Embeding_features():

    def __init__(self):

        self.nlp = nlp

    def create(self, charoffset, text):

        doc = self.nlp(text)

        lens = [token.idx for token in doc] #The Charactor offset the token within the parent
        mention_offset = bs(lens, charoffset) - 1 # The target in which index of tokens
        mention = doc[mention_offset] #mention

        dependency_parent = mention.head #The syntactic parent, or "governor", of this token.
        nbor = mention.nbor()

        sent_lens = [len(sent) for sent in doc.sents] #the sentence length
        acc_lens  = sent_lens
        pre_lens  = 0
        for i in range(0, len(sent_lens)):
            pre_lens += acc_lens[i]
            acc_lens[i] = pre_lens
        sent_index = bs(acc_lens, mention_offset) #to Find out the charoffset in which sentence
        current_sent = list(doc.sents)[sent_index]
        current_sent = [token for token in current_sent]

        preceding3 = self.n_preceding_words(3, doc, mention_offset)
        following3 = self.n_following_words(3, doc, mention_offset)

        proceeding = []
        if sent_index - 1 >= 0:
            proceeding = [token for token in list(doc.sents)[sent_index-1]]
            
        if sent_index+1 < len(list(doc.sents)):
            succeeding = list(doc.sents)[sent_index+1]
            succeeding = [token for token in succeeding]
        else:
            succeeding = []
        
        return mention, dependency_parent, nbor, preceding3, following3, proceeding, current_sent, succeeding
    
    def n_preceding_words(self, n, tokens, offset):

        start = offset - n
        precedings = [None] * max(0, 0 - start)
        start = max(0, start)
        precedings += tokens[start: offset]

        return precedings

    def n_following_words(self, n, tokens, offset):

        end = offset + n
        followings = [None] * max(0, end - len(tokens))
        end = min(end, len(tokens))
        followings += tokens[offset: end]
        
        return followings    

In [4]:
def extract_embedding_features(df, text_column, offset_column, max_len=100, embed_dim = 300):
    text_offset_list = df[[text_column, offset_column]].values.tolist()
    extractor        = Embeding_features()

    feature_map1 = np.zeros(shape=(len(text_offset_list), 3, embed_dim))
    feature_map2 = np.zeros(shape=(len(text_offset_list), 6, embed_dim))
    feature_map3 = np.zeros(shape=(len(text_offset_list), max_len, embed_dim))
    
    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        mention, parent, nbor, preceding3, following3, proceeding, current_sent, succeeding = extractor.create(text_offset[1], text_offset[0])

        #Feature Map1
        feature_map1[text_offset_index, 0, :] = parent.vector
        feature_map1[text_offset_index, 1, :] = mention.vector
        feature_map1[text_offset_index, 2, :] = nbor.vector
        
        #Feature Map2
        feature_map2[text_offset_index, 0:3, :] = np.asarray([token.vector if token is not None else np.zeros((embed_dim,)) for token in preceding3])
        feature_map2[text_offset_index, 3:6, :] = np.asarray([token.vector if token is not None else np.zeros((embed_dim,)) for token in following3])
        
        #Feature Map3
        index = 0
        if len(proceeding) > 0: 
            if len(proceeding) > 50:
                feature_map3[text_offset_index, index: index+50, :]  = np.asarray([token.vector if token is not None else np.zeros((embed_dim,)) for token in proceeding[len(proceeding)-50:len(proceeding)]])
                index += 50
            else:
                feature_map3[text_offset_index, index: index+len(proceeding), :]  = np.asarray([token.vector if token is not None else np.zeros((embed_dim,)) for token in proceeding])
                index += len(proceeding)
        
        end = min(max_len, index+len(current_sent))
        feature_map3[text_offset_index, index:end, :]  = np.asarray([token.vector if token is not None else np.zeros((embed_dim,)) for token in current_sent[:end-index]])
        index += len(current_sent)        
        if end == max_len: continue
        if len(succeeding) > 0:
            end = min(max_len, index+len(succeeding)) 
            feature_map3[text_offset_index, index:end, :] = np.asarray([token.vector if token is not None else np.zeros((embed_dim,)) for token in succeeding[:end-index]])
        
    return feature_map1, feature_map2, feature_map3

In [5]:
p_emb_1, p_emb_2, p_emb_3 = extract_embedding_features(train_df, 'Text', 'Pronoun-offset')
p_emb_dev_1, p_emb_dev_2, p_emb_dev_3 = extract_embedding_features(val_df, 'Text', 'Pronoun-offset')
a_emb_1, a_emb_2, a_emb_3 = extract_embedding_features(train_df, 'Text', 'A-offset')
a_emb_dev_1, a_emb_dev_2, a_emb_dev_3 = extract_embedding_features(val_df, 'Text', 'A-offset')
b_emb_1, b_emb_2, b_emb_3  = extract_embedding_features(train_df, 'Text', 'B-offset')
b_emb_dev_1, b_emb_dev_2, b_emb_dev_3 = extract_embedding_features(val_df, 'Text', 'B-offset')

In [6]:
class Distance_features():

    def __init__(self):

        self.nlp = spacy.load('en_core_web_lg')
        self.buckets = [1, 2, 3, 4, 5, 8, 16, 32, 64]

    def create(self, char_offsetA, char_offsetB, text):

        doc = self.nlp(text)

        lens = [token.idx for token in doc]
        mention_offsetA = bs(lens, char_offsetA) - 1
        mention_offsetB = bs(lens, char_offsetB) - 1

        mention_dist = mention_offsetA - mention_offsetB
        mention_dist = self._bs(self.buckets, mention_dist)

        sent_lens = [len(sent) for sent in doc.sents] #the sentence length
        acc_lens = sent_lens
        pre_lens = 0
        for i in range(0, len(sent_lens)):
            pre_lens += acc_lens[i]
            acc_lens[i] = pre_lens

        sentA_index = bs(acc_lens, mention_offsetA)
        sentB_index = bs(acc_lens, mention_offsetB)
        
        sent_dist =  sentA_index - sentB_index
        sent_dist =  self._bs(self.buckets, sent_dist)

        sentA = list(doc.sents)[sentA_index]
        sentB = list(doc.sents)[sentB_index]

        posA = mention_offsetA + 1
        if sentA_index > 0:
            posA = mention_offsetA - acc_lens[sentA_index-1] #The Distance from first word to mention
        posA = self._bs(self.buckets, posA)
        posA_end = len(sentA) - posA #The Distance from last word to mention In sentence
        posA_end = self._bs(self.buckets, posA_end)

        posB = mention_offsetB + 1
        if sentB_index > 0:
            posB = mention_offsetB - acc_lens[sentB_index-1]
        posB = self._bs(self.buckets, posB)
        posB_end = len(sentB) - posB #The Distance from last word to mention
        posB_end = self._bs(self.buckets, posB_end)

        return [mention_dist, sent_dist, posA, posB, posA_end, posB_end]

    def _bs(self, lens, dist):

        low, high = 0, len(lens)

        while low < high:
            mid = low + int((high-low) / 2)
            if dist > lens[mid]:
                low = mid + 1
            elif dist < lens[mid]:
                high = mid
            else:
                return mid
        return low

def extract_dist_features(df, text_column, pronoun_offset_column, name_offset_column, num_features=45):
    text_offset_list = df[[text_column, pronoun_offset_column, name_offset_column]].values.tolist()
    extractor = Distance_features()
    
    dist_feas = []

    pos_feature_matrix = np.zeros(shape=(len(text_offset_list), num_features))
    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        dist_fea = extractor.create(text_offset[1], text_offset[2], text_offset[0])
        dist_feas.append(dist_fea)

    return np.asarray(dist_feas)

In [7]:
pa_pos_tra = extract_dist_features(train_df, 'Text', 'Pronoun-offset', 'A-offset')
pa_pos_dev = extract_dist_features(val_df, 'Text', 'Pronoun-offset', 'A-offset')

pb_pos_tra = extract_dist_features(train_df, 'Text', 'Pronoun-offset', 'B-offset')
pb_pos_dev = extract_dist_features(val_df, 'Text', 'Pronoun-offset', 'B-offset')

In [8]:
pa_pos_tra.shape

(2000, 6)

In [9]:
from keras.layers import *
from keras.models import *
import keras.backend as K

class MentionPairEmbeding(object):

    def __init__(self, filters=60, embed_size=300):
        self.filters = filters
        self.embed_size = embed_size
        self.buckets = [1, 2, 3, 4, 5, 8, 16, 32, 64]
        
    def build(self):

        M1 = Input(shape=(3, self.embed_size))  # Embedding of Parents, Mention, and Suceeding Word: String Features
        M2 = Input(shape=(6, self.embed_size))  # Embeddings of 3 proceedings words, 3 succedings words of m
        M3 = Input(shape=(100, self.embed_size))  # Average Embedding of 2 proceeding sentence, 1 succeding sentence, and current sentence
        A1 = Input(shape=(3, self.embed_size))
        A2 = Input(shape=(6, self.embed_size))
        A3 = Input(shape=(100, self.embed_size))
        B1 = Input(shape=(3, self.embed_size))
        B2 = Input(shape=(6, self.embed_size))
        B3 = Input(shape=(100, self.embed_size))
        
        inpM_A = Input(shape=(6,)) #Mention and Antecedent A
        inpM_B = Input(shape=(6,)) #Mention and Antecedent B

        Mention_embedding = self.mention_embed(M1, M2, M3)
        A_embedding = self.mention_embed(A1, A2, A3)
        B_embedding = self.mention_embed(B1, B2, B3)        
        
        
        Dist_Embed = Embedding(len(self.buckets)+1, 20) #Dist Features can be learned from model 
        inpM_A_embed = Dist_Embed(inpM_B)
        inpM_B_embed = Dist_Embed(inpM_A)
        inpM_A_embed = Flatten()(inpM_A_embed)
        inpM_B_embed = Flatten()(inpM_B_embed)

        Mention_Pair1 = self.mentionpair_embed(Mention_embedding, A_embedding)
        Mention_Pair2 = self.mentionpair_embed(Mention_embedding, B_embedding)
        Mention_Pair1 = Concatenate()([Mention_Pair1, inpM_A_embed])
        Mention_Pair2 = Concatenate()([Mention_Pair2, inpM_B_embed])
        
        Cluster_Embedding = self.CONVs(Mention_embedding)
        M_Cluster_Embedding1 = self.CONVp(Mention_Pair1)
        M_Cluster_Embedding2 = self.CONVp(Mention_Pair2)
        Mention_embedding = Flatten()(Mention_embedding)
        
        output = Concatenate()([M_Cluster_Embedding1, M_Cluster_Embedding2, Cluster_Embedding, Mention_embedding])
        output = Dense(self.filters, use_bias=True, activation='relu')(output)
        output = Dense(self.filters, use_bias=True, activation='relu')(output)
        output = Dense(3, use_bias=True, activation='softmax')(output)
        
        model = Model([M1, M2, M3, A1, A2, A3, B1, B2, B3, inpM_A, inpM_B], output)

        return model

    def mention_embed(self, inp1, inp2, inp3):

        Conv1_fea1 = self.Conv1k(inp1, [1, 2, 3])
        Conv1_fea2 = self.Conv1k(inp2, [1, 2, 3])
        Conv1_fea3 = self.Conv1k(inp3, [1, 2, 3])

        self.Expand_dim = Lambda(lambda x: K.expand_dims(x, axis=1))
        Conv1_fea1 = self.Expand_dim(Conv1_fea1)
        Conv1_fea2 = self.Expand_dim(Conv1_fea2)
        Conv1_fea3 = self.Expand_dim(Conv1_fea3)
        Conv2_Input = Concatenate(axis=1)([Conv1_fea1, Conv1_fea2, Conv1_fea3])

        x = Conv2D(self.filters, kernel_size=(3, 3), activation='tanh')(Conv2_Input)
        x = MaxPool2D(pool_size=(1,1))(x)
        x = Lambda(lambda x: K.squeeze(x, axis=1))(x)

        return x

    def mentionpair_embed(self, M1, M2):
        
        x = Concatenate(axis=1)([M1, M2])
        x = Conv1D(self.filters, kernel_size=2, activation='tanh')(x)
        x = MaxPool1D(pool_size=1)(x)
        x = Dropout(0.8)(x)
        x = Flatten()(x)
        x = Dense(self.filters, use_bias=True, activation='relu')(x)

        return x

    def Conv1k(self, x, kernels):
        
        assert len(kernels) != 0
        convs = []
        shape = x.get_shape().as_list()
        for kernel in kernels:
            conv = Conv1D(self.filters, kernel, activation='tanh')(x)
            pool = MaxPool1D(pool_size=int(shape[1] - kernel + 1))(conv)
            pool = Dropout(0.8)(pool)
            convs.append(pool)

        convs = Concatenate(axis=1)(convs)
        
        return convs
    
    def CONVs(self, x):
        
        x1 = GlobalAvgPool1D()(x)
        x2 = GlobalMaxPool1D()(x)
        x1 = self.Expand_dim(x1)
        x2 = self.Expand_dim(x2)
        x = Concatenate(axis=1)([x1, x2])
        x = Conv1D(self.filters, kernel_size=2)(x)
        x = MaxPool1D(pool_size=1)(x)
        x = Flatten()(x)
        
        return x
    
    def CONVp(self, x):
        
        x  = self.Expand_dim(x)
        x1 = GlobalAvgPool1D()(x)
        x2 = GlobalMaxPool1D()(x)
        x1 = self.Expand_dim(x1)
        x2 = self.Expand_dim(x2)
        x  = Concatenate(axis=1)([x1, x2])
        x  = Conv1D(self.filters, kernel_size=2)(x)
        x  = MaxPool1D(pool_size=1)(x)
        x  = Flatten()(x)
        
        return x

In [10]:
model = MentionPairEmbeding().build()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6, 300)       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 100, 300)     0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_5 (I

In [11]:
X_train = [p_emb_1, p_emb_2, p_emb_3, a_emb_1, a_emb_2, a_emb_3, b_emb_1, b_emb_2, b_emb_3, pa_pos_tra, pb_pos_tra]
X_dev = [p_emb_dev_1, p_emb_dev_2, p_emb_dev_3, a_emb_dev_1, a_emb_dev_2, a_emb_dev_3, b_emb_dev_1, b_emb_dev_2, b_emb_dev_3, pa_pos_dev, pb_pos_dev]

In [12]:
def _row_to_y(row):
    if row.loc['A-coref']:
        return 0
    if row.loc['B-coref']:
        return 1
    return 2

y_tra = train_df.apply(_row_to_y, axis=1)
y_dev = val_df.apply(_row_to_y, axis=1)
y_test = test_df.apply(_row_to_y, axis=1)

In [13]:
from keras import callbacks

model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"])
file_path = "best_mlp_model.hdf5"
check_point = callbacks.ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
early_stop = callbacks.EarlyStopping(monitor = "val_loss", mode = "min", patience=3)
history = model.fit(X_train, y_tra, batch_size=20, epochs=20, validation_data=(X_dev, y_dev), shuffle=True, callbacks = [check_point, early_stop])

Train on 2000 samples, validate on 454 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.98116, saving model to best_mlp_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.98116 to 0.94452, saving model to best_mlp_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.94452 to 0.92229, saving model to best_mlp_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.92229 to 0.91583, saving model to best_mlp_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.91583 to 0.83085, saving model to best_mlp_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.83085 to 0.82091, saving model to best_mlp_model.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 0.82091 to 0.81058, saving model to best_mlp_model.hdf5
Epoch 8/20

Epoch 00008: val_loss improved from 0.81058 to 0.77072, saving model to best_mlp_model.hdf5
Epoch 9/20

Epoch 00009: val_loss improved from 0.77072 to 0.75326, saving model to best_mlp_model.hdf5
Epoch 10/20

Epoch 00

In [14]:
p_emb_test1, p_embed_test2, p_embed_test3 = extract_embedding_features(test_df, 'Text', 'Pronoun-offset')
a_emb_test1, a_embed_test2, a_embed_test3 = extract_embedding_features(test_df, 'Text', 'A-offset')
b_emb_test1, b_embed_test2, b_embed_test3 = extract_embedding_features(test_df, 'Text', 'B-offset')
pa_pos_test = extract_dist_features(test_df, 'Text', 'Pronoun-offset', 'A-offset')
pb_pos_test = extract_dist_features(test_df, 'Text', 'Pronoun-offset', 'B-offset')


In [15]:
X_test = [p_emb_test1, p_embed_test2, p_embed_test3, a_emb_test1, a_embed_test2, a_embed_test3, b_emb_test1, b_embed_test2, b_embed_test3, pa_pos_test, pb_pos_test]

In [16]:
del model

In [17]:
model = MentionPairEmbeding().build()
model.load_weights("./best_mlp_model.hdf5")

y_preds = model.predict(X_test, batch_size = 1024, verbose = 1)

sub_df_path = os.path.join('../input/gendered-pronoun-resolution/', 'sample_submission_stage_1.csv')
sub_df = pd.read_csv(sub_df_path)
sub_df.loc[:, 'A'] = pd.Series(y_preds[:, 0])
sub_df.loc[:, 'B'] = pd.Series(y_preds[:, 1])
sub_df.loc[:, 'NEITHER'] = pd.Series(y_preds[:, 2])

sub_df.head()



Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.091052,0.866027,0.042922
1,development-2,0.971875,0.027439,0.000687
2,development-3,0.437214,0.451358,0.111428
3,development-4,0.214174,0.515242,0.270584
4,development-5,0.127052,0.615245,0.257704


In [18]:
sub_df.to_csv("submission.csv", index=False)