In [1]:
#!git clone https://github.com/google-research-datasets/gap-coreference.git

In [2]:
!dir gap-coreference

constants.py	 gap-development.tsv  gap-test.tsv	  LICENSE
CONTRIBUTING.md  gap_scorer.py	      gap-validation.tsv  README.md


In [5]:
train_file = 'gap-coreference/gap-development.tsv'
val_file = 'gap-coreference/gap-validation.tsv'
test_file = 'gap-coreference/gap-test.tsv'

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
train_df = pd.read_csv(train_file,sep = '\t')
val_df = pd.read_csv(train_file,sep = '\t')

train_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,False,De la Sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,False,Rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera


## Prepare training data

Note down token positions of P, A and B

In [13]:
import torch
from torch.utils.data import (Dataset,DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

from pytorch_transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [67]:
def get_tokens_array(row):
        p_offset = row['Pronoun-offset']
        a_offset = row['A-offset']
        b_offset = row['B-offset']
        offsets = {'p':p_offset,'a':a_offset,'b':b_offset}
        
        keys = sorted(offsets,key=lambda z : offsets[z])
        positions = sorted(offsets.values())
        lengths = [positions[i] - positions[i-1] if i >0 else pos for i,pos in enumerate(positions)]
        
        text = row['Text']
        positions.append(len(text))
        
        all_tokens = []
        prev_pos = 0
        for pos in positions:
            subtext = text[prev_pos:pos]
            prev_pos = pos
            tokens = tokenizer.tokenize(subtext)
            all_tokens.append(tokens)
        
        return all_tokens, keys



def get_token_pos(all_tokens, keys):
    token_positions = {}
    prev = 0
    for i,tokens in enumerate(all_tokens[:-1]):
        position = prev+len(tokens)
        prev = position
        token_positions[keys[i]] = position
    return token_positions


def get_tokens_with_positions(row):
    '''
    Returns 
    final_tokens : Text onverted to tokens
    token_positions : dictionary containing token position for a,b and p
    
    Example:
    index = 5
    print(train_df.iloc[index])
    final_tokens, token_positions = get_tokens_with_positions(train_df.iloc[index])
    print(final_tokens, token_positions)
    [final_tokens[v] for v in token_positions.values()]
    '''
    tokens_arr, keys = get_tokens_array(row)
    
    final_tokens = []
    for tokens in tokens_arr:
        final_tokens = final_tokens + tokens 
    
    token_positions = get_token_pos(tokens_arr, keys)
    return final_tokens, token_positions



In [79]:
index = 5
row = train_df.iloc[index]
print(row)
final_tokens, token_positions = get_tokens_with_positions(row)
print(final_tokens, token_positions)


assert(row['Pronoun'].lower() == final_tokens[token_positions['p']])

print([final_tokens[v] for v in token_positions.values()])
[token_positions[key] for key in 'pab']

ID                                                    development-6
Text              Sandra Collins is an American DJ. She got her ...
Pronoun                                                         She
Pronoun-offset                                                  411
A                                                           Collins
A-offset                                                        236
A-coref                                                        True
B                                                                DJ
B-offset                                                        347
B-coref                                                       False
URL                     http://en.wikipedia.org/wiki/Sandra_Collins
Name: 5, dtype: object
['sandra', 'collins', 'is', 'an', 'american', 'dj', '.', 'she', 'got', 'her', 'start', 'on', 'the', 'west', 'coast', 'of', 'the', 'u', '.', 's', '.', 'in', 'phoenix', ',', 'arizona', 'and', 'into', 'reside', '##ncies', 'in', 'los

[84, 50, 73]

In [74]:
if not train_df.iloc[0]['A-coref']:
    print ('yes')

Output Format:
    
text_tokens	Pindex,Aindex,Bindex	label(0,1,2)

labels:
0= Neither
1 = A
2 = B

Without knowing P

Softmax over [CLS],A,B index, loss with labels

With knowing P
Softmax over [CLS] * P ,A * P ,B * P index, loss with labels

In [119]:
import numpy as np
def create_features(df):
    processed_df = pd.DataFrame()
    
    for i in range(len(df)):
        #print(i)
        row = df.iloc[i]
        final_tokens, token_positions = get_tokens_with_positions(row)
        
        
        assert(final_tokens[token_positions['p']] in row['Pronoun'].lower())
        assert(final_tokens[token_positions['a']] in row['A'].lower()), print(row)
        assert(final_tokens[token_positions['b']] in row['B'].lower())
        
        ids = tokenizer.convert_tokens_to_ids(final_tokens)
        pab_position = [token_positions[key] for key in 'pab']
        label = 1 if row['A-coref'] else ( 2 if row['B-coref'] else 0)
        
        processed_df = processed_df.append({'input':np.array(ids), 'pab_pos':np.array(pab_position), 'label':label}, ignore_index=True)
    return processed_df

# process validation data

In [126]:
processed_df = create_features(val_df)
print(processed_df.head())

#for reading purpose
processed_df.to_csv('val_processed.tsv', sep='\t')

#saving to TSV will not store the data types of ndarray. It converts them to str
processed_df.to_pickle('val_processed.pkl')
df = pd.read_pickle('val_processed.pkl')
df.head()

                                               input  label        pab_pos
0  [11199, 10093, 3877, 1011, 1011, 2209, 1996, 2...    1.0   [61, 41, 44]
1  [2002, 3473, 2039, 1999, 6473, 2669, 1010, 430...    1.0   [60, 50, 53]
2  [2002, 2018, 2042, 20847, 2000, 3519, 1010, 20...    2.0   [62, 40, 56]
3  [1996, 2783, 2372, 1997, 4126, 2031, 2036, 286...    2.0   [66, 35, 70]
4  [2014, 4203, 10768, 3850, 2834, 1999, 2384, 20...    2.0  [104, 53, 71]


Unnamed: 0,input,label,pab_pos
0,"[11199, 10093, 3877, 1011, 1011, 2209, 1996, 2...",1.0,"[61, 41, 44]"
1,"[2002, 3473, 2039, 1999, 6473, 2669, 1010, 430...",1.0,"[60, 50, 53]"
2,"[2002, 2018, 2042, 20847, 2000, 3519, 1010, 20...",2.0,"[62, 40, 56]"
3,"[1996, 2783, 2372, 1997, 4126, 2031, 2036, 286...",2.0,"[66, 35, 70]"
4,"[2014, 4203, 10768, 3850, 2834, 1999, 2384, 20...",2.0,"[104, 53, 71]"


# Process Train data

In [127]:
processed_df = create_features(train_df)
print(processed_df.head())

processed_df.to_csv('train_processed.tsv', sep='\t')

processed_df.to_pickle('train_processed.pkl')
df = pd.read_pickle('train_processed.pkl')

df.head()

                                               input  label        pab_pos
0  [11199, 10093, 3877, 1011, 1011, 2209, 1996, 2...    1.0   [61, 41, 44]
1  [2002, 3473, 2039, 1999, 6473, 2669, 1010, 430...    1.0   [60, 50, 53]
2  [2002, 2018, 2042, 20847, 2000, 3519, 1010, 20...    2.0   [62, 40, 56]
3  [1996, 2783, 2372, 1997, 4126, 2031, 2036, 286...    2.0   [66, 35, 70]
4  [2014, 4203, 10768, 3850, 2834, 1999, 2384, 20...    2.0  [104, 53, 71]


Unnamed: 0,input,label,pab_pos
0,"[11199, 10093, 3877, 1011, 1011, 2209, 1996, 2...",1.0,"[61, 41, 44]"
1,"[2002, 3473, 2039, 1999, 6473, 2669, 1010, 430...",1.0,"[60, 50, 53]"
2,"[2002, 2018, 2042, 20847, 2000, 3519, 1010, 20...",2.0,"[62, 40, 56]"
3,"[1996, 2783, 2372, 1997, 4126, 2031, 2036, 286...",2.0,"[66, 35, 70]"
4,"[2014, 4203, 10768, 3850, 2834, 1999, 2384, 20...",2.0,"[104, 53, 71]"
