In [1]:
import pandas as pd
import numpy as np
import os
import spacy
from spacy.lang.en import English
from tqdm import tqdm
from keras.layers import *
from keras.models import Model, Sequential
import keras.backend as K
from keras import callbacks
from keras.engine.topology import Layer

print(os.listdir('../input'))
print(os.listdir('../input/gap-coreference'))

Using TensorFlow backend.


['gendered-pronoun-resolution', 'gap-coreference']
['gap-development.tsv', 'gap-test.tsv', 'gap-validation.tsv']


In [2]:
test_df  = pd.read_table('../input/gap-coreference/gap-development.tsv')
train_df = pd.read_table('../input/gap-coreference/gap-test.tsv')
val_df   = pd.read_table('../input/gap-coreference/gap-validation.tsv')
nlp      = spacy.load('en_core_web_lg')
train_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers


In [3]:
def bs(lens, target):
    low, high = 0, len(lens) - 1

    while low < high:
        mid = low + int((high - low) / 2)

        if target > lens[mid]:
            low = mid + 1
        elif target < lens[mid]:
            high = mid
        else:
            return mid + 1

    return low

class Mention_Features():

    def __init__(self):

        self.nlp = nlp

    def create(self, charoffset, text):

        doc = self.nlp(text)

        lens = [token.idx for token in doc]  # The Charactor offset the token within the parent
        mention_offset = bs(lens, charoffset) - 1  # The target in which index of tokens
        mention = doc[mention_offset]  # mention

        dependency_parent = mention.head  # The syntactic parent, or "governor", of this token.
        nbor = mention.nbor()  # The following word of nbor

        sent_lens = [len(sent) for sent in doc.sents]  # the sentence length
        acc_lens = sent_lens
        pre_lens = 0
        for i in range(0, len(sent_lens)):
            pre_lens += acc_lens[i]
            acc_lens[i] = pre_lens
        sent_index = bs(acc_lens, mention_offset)  # to Find out the charoffset which sentence
        current_sent = list(doc.sents)[sent_index]
        current_sent = [token for token in current_sent]

        preceding3 = self.n_preceding_words(3, doc, mention_offset)
        following3 = self.n_following_words(3, doc, mention_offset)

        proceed_sents = [] # 3 proceeding sentence
        for i in range(sent_index - 3, sent_index):
            if i < 0: continue
            proceeding = [token for token in list(doc.sents)[sent_index - 1]]
            proceed_sents.extend(proceeding)

        if sent_index + 1 < len(list(doc.sents)): #1 succeeding sentence
            succeeding = list(doc.sents)[sent_index + 1]
            succeed_sent = [token for token in succeeding]
        else:
            succeed_sent = []

        return mention, dependency_parent, nbor, preceding3, following3, proceed_sents, current_sent, succeed_sent

    def n_preceding_words(self, n, tokens, offset):

        start = offset - n
        precedings = [None] * max(0, 0 - start)
        start = max(0, start)
        precedings += tokens[start: offset]

        return precedings

    def n_following_words(self, n, tokens, offset):

        end = offset + n
        followings = [None] * max(0, end - len(tokens))
        end = min(end, len(tokens))
        followings += tokens[offset: end]

        return followings

class Distance_Features():

    def __init__(self):

        self.nlp = nlp

    def create(self, char_offsetA, char_offsetB, text):

        doc = self.nlp(text)

        lens = [token.idx for token in doc]
        mention_offsetA = bs(lens, char_offsetA) - 1
        mention_offsetB = bs(lens, char_offsetB) - 1

        mention_dist = mention_offsetA - mention_offsetB
        #dist_oh = self.one_hot(self.buckets, dist)

        sent_lens = [len(sent) for sent in doc.sents] #the sentence length
        acc_lens = sent_lens
        pre_lens = 0
        for i in range(0, len(sent_lens)):
            pre_lens += acc_lens[i]
            acc_lens[i] = pre_lens

        sentA_index = bs(acc_lens, mention_offsetA)
        sentB_index = bs(acc_lens, mention_offsetB)

        sent_dist = sentA_index - sentB_index

        return [mention_dist, sent_dist]

def extract_embedding_features(df, text_column, offset_column,  embed_dim=300):
    text_offset_list = df[[text_column, offset_column]].values.tolist()
    extractor = Mention_Features()

    feature_map1 = np.zeros(shape=(len(text_offset_list), 3, embed_dim))
    feature_map2 = np.zeros(shape=(len(text_offset_list), 6, embed_dim))
    feature_map3 = np.zeros(shape=(len(text_offset_list), 3, embed_dim))

    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        mention, dependency_parent, nbor, preceding3, following3, proceed_sents, current_sent, succeed_sent = extractor.create( text_offset[1], text_offset[0])

        # Feature Map1
        feature_map1[text_offset_index, 0, :] = dependency_parent.vector
        feature_map1[text_offset_index, 1, :] = mention.vector
        feature_map1[text_offset_index, 2, :] = nbor.vector

        # Feature Map2
        feature_map2[text_offset_index, 0:3, :] = np.asarray(
            [token.vector if token is not None else np.zeros((embed_dim,)) for token in preceding3])
        feature_map2[text_offset_index, 3:6, :] = np.asarray(
            [token.vector if token is not None else np.zeros((embed_dim,)) for token in following3])

        # Feature Map3
        feature_map3[text_offset_index, 0, :] = np.mean(np.asarray([token.vector for token in proceed_sents]),
                                                        axis=0) if len(proceed_sents) > 0 else np.zeros(embed_dim)
        feature_map3[text_offset_index, 1, :] =  np.mean(np.asarray([token.vector for token in current_sent]),
                                                         axis=0) if len(current_sent) > 0 else np.zeros(embed_dim)
        feature_map3[text_offset_index, 2, :] =  np.mean(np.asarray([token.vector for token in succeed_sent]),
                                                         axis=0) if len(succeed_sent) > 0 else np.zeros(embed_dim)

    return feature_map1, feature_map2, feature_map3

def extract_dist_features(df, text_column, pronoun_offset_column, name_offset_column):
    text_offset_list = df[[text_column, pronoun_offset_column, name_offset_column]].values.tolist()
    extractor = Distance_Features()
    dist_feas = []

    for text_offset_index in range(len(text_offset_list)):
        text_offset = text_offset_list[text_offset_index]
        dist_fea = extractor.create(text_offset[1], text_offset[2], text_offset[0])
        dist_feas.append(dist_fea)

    return np.asarray(dist_feas)

In [4]:
p_emb_1, p_emb_2, p_emb_3 = extract_embedding_features(train_df, 'Text', 'Pronoun-offset')
p_emb_dev_1, p_emb_dev_2, p_emb_dev_3 = extract_embedding_features(val_df, 'Text', 'Pronoun-offset')
a_emb_1, a_emb_2, a_emb_3 = extract_embedding_features(train_df, 'Text', 'A-offset')
a_emb_dev_1, a_emb_dev_2, a_emb_dev_3 = extract_embedding_features(val_df, 'Text', 'A-offset')
b_emb_1, b_emb_2, b_emb_3  = extract_embedding_features(train_df, 'Text', 'B-offset')
b_emb_dev_1, b_emb_dev_2, b_emb_dev_3 = extract_embedding_features(val_df, 'Text', 'B-offset')

In [5]:
pa_pos_tra = extract_dist_features(train_df, 'Text', 'Pronoun-offset', 'A-offset')
pa_pos_dev = extract_dist_features(val_df, 'Text', 'Pronoun-offset', 'A-offset')
pb_pos_tra = extract_dist_features(train_df, 'Text', 'Pronoun-offset', 'B-offset')
pb_pos_dev = extract_dist_features(val_df, 'Text', 'Pronoun-offset', 'B-offset')

In [6]:
class Mention_Embedding(object):

    def __init__(self, filters=120, embed_size=300):
        
        self.filters = filters
        self.embed_size = embed_size

    def build(self):
        
        P_Fea1 = Input(shape=(3, self.embed_size))  # Embedding of Parents, Mention, and Suceeding Word: String Features
        P_Fea2 = Input(shape=(6, self.embed_size))  # Embeddings of 3 proceedings words, 3 succedings words of m
        P_Fea3 = Input(shape=(3, self.embed_size))
        Antecedent_Fea1 = Input(shape=(3, self.embed_size))
        Antecedent_Fea2 = Input(shape=(6, self.embed_size))
        Antecedent_Fea3 = Input(shape=(3, self.embed_size))
        Dist_Fea = Input(shape=(2,))
        
        Dist_Embed = Dense(self.filters, use_bias=True)(Dist_Fea)

        Mention_Represent1 = self.mention_embed(P_Fea1, P_Fea2, P_Fea3, 'Mention')
        Mention_Represent2 = self.mention_embed(Antecedent_Fea1, Antecedent_Fea2, Antecedent_Fea3, 'Antecedent')

        x = self.mentionpair_embed(Mention_Represent1, Mention_Represent2)
        x = Concatenate(name='Mention_Pair_Embedding')([x, Dist_Embed])

        model = Model([P_Fea1, P_Fea2, P_Fea3, Antecedent_Fea1, Antecedent_Fea2, Antecedent_Fea3, Dist_Fea], x)

        return model

    def mention_embed(self, inp1, inp2, inp3, target):
        
        Conv1_fea1 = self.Conv1k(inp1, [1, 2, 3]) # n_gram
        Conv1_fea2 = self.Conv1k(inp2, [1, 2, 3]) # n_gram
        Conv1_fea3 = self.Conv1k(inp3, [1, 2, 3])

        self.Expand_dim = Lambda(lambda x: K.expand_dims(x, axis=1))
        Conv1_fea1 = self.Expand_dim(Conv1_fea1)
        Conv1_fea2 = self.Expand_dim(Conv1_fea2)
        Conv1_fea3 = self.Expand_dim(Conv1_fea3)
        Conv2_Input = Concatenate(axis=1)([Conv1_fea1, Conv1_fea2, Conv1_fea3])

        x = Conv2D(self.filters, kernel_size=(3, 3), activation='tanh')(Conv2_Input)
        x = MaxPool2D(pool_size=(1, 1))(x)
        x = Lambda(lambda x: K.squeeze(x, axis=1), name="{}_Embed".format(target))(x)

        return x

    def Conv1k(self, x, kernels):
        
        assert len(kernels) != 0
        convs = []
        shape = x.get_shape().as_list()
        for kernel in kernels:
            conv = Conv1D(self.filters, kernel, activation='tanh')(x)
            pool = MaxPool1D(pool_size=int(shape[1] - kernel + 1))(conv)
            pool = Dropout(0.8)(pool)
            convs.append(pool)

        convs = Concatenate(axis=1)(convs)

        return convs

    def mentionpair_embed(self, M1, M2):
        
        x = Concatenate(axis=1)([M1, M2])
        x = Conv1D(self.filters, kernel_size=2, activation='tanh')(x)
        x = MaxPool1D(pool_size=1)(x)
        x = Dropout(0.8)(x)
        x = Flatten()(x)

        return x

In [7]:
class Coreference_Classifier(object):

    def __init__(self, Mention_Pair, Mention_Embedding, filters=120, embed_size=300):
        
        self.filters = filters
        self.embed_size = embed_size
        self.Mention_Pair = Mention_Pair
        self.Mention_Embed = Mention_Embedding
        
    def build(self):

        M1 = Input(shape=(3, self.embed_size))  # Embedding of Parents, Mention, and Suceeding Word: String Features
        M2 = Input(shape=(6, self.embed_size))  # Embeddings of 3 proceedings words, 3 succedings words of m
        M3 = Input(shape=(3, self.embed_size))  # Average Embedding of 3 proceeding sentence, 1 succeding sentence, and current sentence
        A1 = Input(shape=(3, self.embed_size))
        A2 = Input(shape=(6, self.embed_size))
        A3 = Input(shape=(3, self.embed_size))
        B1 = Input(shape=(3, self.embed_size))
        B2 = Input(shape=(6, self.embed_size))
        B3 = Input(shape=(3, self.embed_size))
        Dist_M_A = Input(shape=(2,)) #Mention and Antecedent A
        Dist_M_B = Input(shape=(2,)) #Mention and Antecedent B
        
        # Define layer 
        self.Expand_dim = Lambda(lambda x: K.expand_dims(x, axis=1))
        
        Mention_Pair1 = self.Mention_Pair([M1, M2, M3, A1, A2, A3, Dist_M_A])
        Mention_Pair2 = self.Mention_Pair([M1, M2, M3, B1, B2, B3, Dist_M_B])
        Mention_embedding= self.Mention_Embed([M1, M2, M3])
        Mention_Embedding = Flatten()(Mention_embedding)
        
        output1 = Concatenate()([Mention_Pair1, Mention_Pair2, Mention_Embedding]) 
        output1 = BatchNormalization()(output1)
        output1 = Dense(self.filters, use_bias=True, activation='relu')(output1)
        output1 = Dense(self.filters, use_bias=True, activation='relu')(output1)
        output1 = Dense(3, use_bias=True, activation='softmax', name='cluster_output')(output1)
        
        pair1 = self.cluster_classifier(Mention_Pair1, "pair1")
        pair2 = self.cluster_classifier(Mention_Pair2, "pair2")
        output2 = Add(name='singleton_output')([pair1, pair2]) # if output2 great and equal 1, there is a ancedent to represent pronoun
                                                               # if ouptut2 less than 1, there 
        
        model = Model([M1, M2, M3, A1, A2, A3, B1, B2, B3, Dist_M_A, Dist_M_B], [output1, output2])

        return model
    
    def cluster_classifier(self, x, _name):
        
        x = Dense(self.filters, use_bias=True, activation='relu')(x)
        x = Dense(1, use_bias=True, activation='sigmoid', name='{}_output'.format(_name))(x)
        
        return x

In [8]:
Embedding_model = Mention_Embedding().build()
Embedding_model.summary()
layer_name = 'Mention_Embed'
Mention_Pair = Model(Embedding_model.inputs, Embedding_model.output)
Mention_Embedding = Model([Embedding_model.inputs[0], Embedding_model.inputs[1], Embedding_model.inputs[2]],
Embedding_model.get_layer(layer_name).output)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6, 300)       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_5 (I

In [9]:
model = Coreference_Classifier(Mention_Pair, Mention_Embedding).build()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 6, 300)       0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 3, 300)       0                                            
__________________________________________________________________________________________________
input_12 (

In [10]:
X_train = [p_emb_1, p_emb_2, p_emb_3, a_emb_1, a_emb_2, a_emb_3, b_emb_1, b_emb_2, b_emb_3, pa_pos_tra, pb_pos_tra]
X_dev = [p_emb_dev_1, p_emb_dev_2, p_emb_dev_3, a_emb_dev_1, a_emb_dev_2, a_emb_dev_3, b_emb_dev_1, b_emb_dev_2, b_emb_dev_3, pa_pos_dev, pb_pos_dev]

In [11]:
def _row_to_y(row):
    if row.loc['A-coref']:
        return 0
    if row.loc['B-coref']:
        return 1
    return 2

y_tra = train_df.apply(_row_to_y, axis=1)
y_dev = val_df.apply(_row_to_y, axis=1)
y_test = test_df.apply(_row_to_y, axis=1)

def _row_to_y_AB(row):
    if row.loc['B-coref'] or row.loc['A-coref']:
        return 1
    return 0

y_tra_AB = train_df.apply(_row_to_y_AB, axis=1)
y_dev_AB = val_df.apply(_row_to_y_AB, axis=1)

In [12]:
def custom_mse(y_true, y_pred):
    y_pred = K.clip(y_pred, 0, 1)
    return  K.mean(K.square(y_pred - y_true), axis=-1)

In [13]:
model.compile(optimizer='adam', 
              loss={'cluster_output':'sparse_categorical_crossentropy', 'singleton_output':custom_mse},
              loss_weights={'cluster_output': 1.0, 'singleton_output': 3.0},
              metrics={'cluster_output':"sparse_categorical_accuracy"})
file_path = "best_model.hdf5"
check_point = callbacks.ModelCheckpoint(file_path, monitor = "val_cluster_output_loss", verbose = 1, save_best_only = True, mode = "min")
early_stop = callbacks.EarlyStopping(monitor = "val_cluster_output_loss", mode = "min", patience=3)
history = model.fit(X_train,{'cluster_output': y_tra, 'singleton_output':y_tra_AB} , 
                    batch_size=20, epochs=20, 
                    validation_data=(X_dev, {'cluster_output': y_dev, 'singleton_output':y_dev_AB}), 
                    shuffle=True, callbacks = [check_point, early_stop])

Train on 2000 samples, validate on 454 samples
Epoch 1/20

Epoch 00001: val_cluster_output_loss improved from inf to 1.01221, saving model to best_model.hdf5
Epoch 2/20

Epoch 00002: val_cluster_output_loss improved from 1.01221 to 0.95898, saving model to best_model.hdf5
Epoch 3/20

Epoch 00003: val_cluster_output_loss improved from 0.95898 to 0.91786, saving model to best_model.hdf5
Epoch 4/20

Epoch 00004: val_cluster_output_loss improved from 0.91786 to 0.87049, saving model to best_model.hdf5
Epoch 5/20

Epoch 00005: val_cluster_output_loss improved from 0.87049 to 0.77097, saving model to best_model.hdf5
Epoch 6/20

Epoch 00006: val_cluster_output_loss improved from 0.77097 to 0.76751, saving model to best_model.hdf5
Epoch 7/20

Epoch 00007: val_cluster_output_loss improved from 0.76751 to 0.74200, saving model to best_model.hdf5
Epoch 8/20

Epoch 00008: val_cluster_output_loss did not improve from 0.74200
Epoch 9/20

Epoch 00009: val_cluster_output_loss improved from 0.74200 to 

In [14]:
p_emb_test1, p_embed_test2, p_embed_test3 = extract_embedding_features(test_df, 'Text', 'Pronoun-offset')
a_emb_test1, a_embed_test2, a_embed_test3 = extract_embedding_features(test_df, 'Text', 'A-offset')
b_emb_test1, b_embed_test2, b_embed_test3 = extract_embedding_features(test_df, 'Text', 'B-offset')
pa_pos_test = extract_dist_features(test_df, 'Text', 'Pronoun-offset', 'A-offset')
pb_pos_test = extract_dist_features(test_df, 'Text', 'Pronoun-offset', 'B-offset')

In [15]:
X_test = [p_emb_test1, p_embed_test2, p_embed_test3, a_emb_test1, a_embed_test2, a_embed_test3, b_emb_test1, b_embed_test2, b_embed_test3, pa_pos_test, pb_pos_test]

In [16]:
model.load_weights('./best_model.hdf5')
layer_name = "cluster_output"
predict_model = Model(model.inputs, model.get_layer(layer_name).output)

y_preds = predict_model.predict(X_test, batch_size = 1024, verbose = 1)

sub_df_path = os.path.join('../input/gendered-pronoun-resolution/', 'sample_submission_stage_1.csv')
sub_df = pd.read_csv(sub_df_path)
sub_df.loc[:, 'A'] = pd.Series(y_preds[:, 0])
sub_df.loc[:, 'B'] = pd.Series(y_preds[:, 1])
sub_df.loc[:, 'NEITHER'] = pd.Series(y_preds[:, 2])

sub_df.head(20)



Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.113092,0.790919,0.095988
1,development-2,0.987404,0.006889,0.005707
2,development-3,0.157487,0.681982,0.160531
3,development-4,0.102882,0.395021,0.502098
4,development-5,0.035743,0.945575,0.018682
5,development-6,0.879141,0.101694,0.019165
6,development-7,0.544976,0.138216,0.316808
7,development-8,0.03094,0.942208,0.026851
8,development-9,0.086205,0.829222,0.084573
9,development-10,0.316897,0.415479,0.267624


In [17]:
sub_df.to_csv("submission.csv", index=False)