# Sentence Boundary detection with NER features

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import torch
import pandas as pd
import numpy as np
import transformers
from tqdm import tqdm
import csv

In [3]:
train_file = './data/test.tsv'
test_file = './data/test.tsv'
#test_file = './data_v1/europarl-sbd-eval.tsv'

In [4]:
train_df = pd.read_csv(train_file, delimiter='\t', engine='python', encoding='UTF-8', error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
train_df.head()
#train_df = pd.read_excel(train_file, delimiter='\t', encoding='UTF-8', error_bad_lines=False, header=None)
#train_df.head()

Unnamed: 0,0,1,2,3
0,La,DET:ART,O,B-SENT
1,cérémonie,NOM,O,O
2,aura,VER:futu,O,O
3,lieu,NOM,O,O
4,le,DET:ART,O,O


In [5]:
test_df = pd.read_csv(test_file, delimiter='\t', engine='python', encoding='UTF-8', error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
test_df.head()

Unnamed: 0,0,1,2,3
0,La,DET:ART,O,B-SENT
1,cérémonie,NOM,O,O
2,aura,VER:futu,O,O
3,lieu,NOM,O,O
4,le,DET:ART,O,O


In [6]:
def set_sentence_num(df): 

    sent_num = 0
    df['sent_num'] = sent_num
    for idx in range(len(df)):
        df['sent_num'][idx] = sent_num
        if df[0][idx]=='.' and df[1][idx]=="SENT":
            sent_num +=1
    df.head()
    print(sent_num)
    
    return df

In [7]:
train_df = set_sentence_num(train_df)
test_df = set_sentence_num(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


3500
3500


In [8]:
train_df = train_df[train_df['sent_num'] < 100]
test_df = test_df[test_df['sent_num'] < 100]

In [9]:
sentence = train_df[train_df['sent_num']==2]
token_list =  ' '.join([token for token in sentence[0]])
print(token_list)

La famille remercie toutes les personnes qui s' associeront à sa peine .


In [38]:
class FSBDataset():
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.idxtob = {'B-SENT': 1}
        self.idxtoPOS = {'DET:ART': 0,'NAM': 1,'KON': 2,'PUN': 3,'DET:POS': 4,'NOM': 5,'VER:pres': 6,'PRP': 7,'PRO:PER': 8,'VER:infi': 9,'PRP:det': 10,'VER:simp': 11,'VER:pper': 12,'NUM': 13,'SENT': 14,'ABR': 15,'VER:futu': 16,'PRO:DEM': 17,'ADJ': 18,
 'PRO:REL': 19,'PRO:IND': 20,'ADV': 21,'SYM': 22,'PUN:cit': 23,'VER:impf': 24,'VER:subp': 25,'VER:subi': 26,'VER:ppre': 27,'VER:cond': 28,'PRO:POS': 29,'VER:impe': 30}
        self.idxtoNER = {'I-LOC': 0, 'I-PER': 1, 'O': 2, 'I-ORG': 3}
        
        
    def __len__(self):
        return self.data['sent_num'].max()
    
    def __getitem__(self, item):
        
        sentence = train_df[train_df['sent_num']==item]
        token_list =  [token for token in sentence[0]]
        target_list =  [target for target in sentence[3]]
        target_ids_list =  [1 if token=="B-SENT" else 0 for token in sentence[3]]
        pos_ids = [self.idxtoPOS.get(pos) for pos in sentence[1]]
        ner_ids = [self.idxtoNER.get(ner) for ner in sentence[2]]
        

        
        encoded = self.tokenizer.encode_plus(' '.join(token_list),
                                            None,
                                            add_special_tokens=True,
                                            max_length=self.max_length,
                                            truncation=True,
                                            pad_to_max_length=True)
        
        ids = encoded['input_ids']
        mask = encoded['attention_mask']
        
        bpe_head_mask = [0]; upos_ids = [-1] # --> CLS token
        
        for word, target in zip(token_list, target_list):
            bpe_len = len(self.tokenizer.tokenize(word))
            head_mask = [1] + [0]*(bpe_len-1)
            bpe_head_mask.extend(head_mask)
            upos_mask = [self.idxtob.get(target,0)] + [-1]*(bpe_len-1)
            upos_ids.extend(upos_mask)
            #print("head_mask", head_mask)
        
        bpe_head_mask.append(0); upos_ids.append(-1) # --> END token
        bpe_head_mask.extend([0] * (self.max_length - len(bpe_head_mask))); upos_ids.extend([-1] * (self.max_length - len(upos_ids))) ## --> padding by max_len

        
        print(target_ids_list)
        print(pos_ids)
        print(ner_ids)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            #'target': torch.tensor(target_list, dtype=torch.long),
            'bpe_head_mask': torch.tensor(bpe_head_mask, dtype=torch.long),
            'target_ids': torch.tensor(upos_ids, dtype=torch.long),
            'pos_ids': torch.tensor(pos_ids, dtype=torch.long)
            #'ner_ids': torch.tensor(ner_ids, dtype=torch.long)
            
        }
        
        
        
        

In [12]:
class CharModel(torch.nn.Module):

    def __init__(self, num_char, dim_char, hidden_size=200, layer_size=1, drop_out=0.3, mlp_size=100, num_out=100):
        super(CharModel, self).__init__()

        self.dim_char = dim_char 
        self.hidden_size = hidden_size
        self.layer_size = layer_size
        self.dropout = 0.3
        self.directions = 2
        self.mlp_use = True
        self.char_out = num_out if self.mlp_use else (self.hidden_size * self.directions * 2)

        self.char_emb = nn.Embedding(num_char, dim_char, padding_idx=0)
        self.char_LSTM = nn.LSTM(dim_char, self.hidden_size, self.layer_size, dropout=self.dropout, bidirectional=True)
        self.char_mlp = MLP(self.hidden_size * self.directions * 2, self.char_out, 0.3)
        self.drop_out = nn.Dropout(p=self.dropout)
 #       self.att_score = nn.Softmax(dim=1)
 #       self.char_linear = nn.Linear(self.mlp_size * self.hidden_size * self.directions, num_out, bias=True)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
    def forward(self, char_seq, char_len):
        
        pad_seqs = char_seq
        char_vecs = self.char_emb(pad_seqs)
        char_len_list = np.array([len(sentence) for sentence in char_vecs])   #  np.array(char_len)
        
#        self.char_LSTM.flatten_parameters() # To apply multi-gpu excution, https://discuss.pytorch.org/t/rnn-module-weights-are-not-part-of-single-contiguous-chunk-of-memory/6011/14
#        total_length = input_emb.size(1) # To apply multi-gpu excution, https://pytorch.org/docs/stable/notes/faq.html

        h0 = self.init_hidden(char_vecs.size(0)) # initial state of LSTM
        global_indices = np.argsort(-np.array(char_len_list)).astype(np.int64) #sorting based on seq_len
        char_len_list_ordered = char_len_list[global_indices] #Sorting "char_len_list" based on the indices
        char_vecs_ordered = torch.stack([char_vecs[order] for order in global_indices]) #Stacking batches as an embedding based on dec order                
        char_sentence_packed = nn.utils.rnn.pack_padded_sequence(char_vecs_ordered, char_len_list_ordered, batch_first=True)
        char_lstm_batch_out = self.char_LSTM(char_sentence_packed, h0)[0] # (Batch * seq_len x  2*LSTM_hidden)
        char_lstm_batch_out = nn.utils.rnn.pad_packed_sequence(char_lstm_batch_out, batch_first=True)[0] # (Batch x seq_len x 2*LSTM_hidden)
        char_lstm_batch_out = char_lstm_batch_out.index_select(dim=0, index=_model_var(self, torch.from_numpy(np.argsort(global_indices).astype(np.int64)))) #make it back unorder

        batch_size = char_vecs.size(0)
        word_len = [len(sentence) for sentence in char_len]
        word_max = max(word_len) 
        char_max = max(char_len_list)

        char_feature = torch.zeros(batch_size, word_max, self.hidden_size * self.directions *2 ).to(self.device)
        mask_s = torch.zeros(batch_size, char_max, dtype=torch.uint8).to(self.device)
        mask_e = torch.zeros(batch_size, char_max, dtype=torch.uint8).to(self.device)
        for batch_idx in range(batch_size):
            ch_start_idx, ch_end_idx = 0,0
            for token_idx, length in enumerate(char_len[batch_idx]):
                token_lenth = length
                ch_end_idx = ch_start_idx + token_lenth 
                mask_s[batch_idx, ch_start_idx] = 1 #char_global_lstm_batch_out[0, 0:6, :] ==> *root*
                mask_e[batch_idx, ch_end_idx] = 1 #char_global_lstm_batch_out[0, 0:6, :] ==> *root*
                ch_start_idx = ch_end_idx + 1 # +1 == white space KKL

        char_s_masked = char_lstm_batch_out[mask_s,:]
        char_e_masked = char_lstm_batch_out[mask_e,:]
        char_masked = torch.cat((char_s_masked, char_e_masked),-1)
        
        w_idx = 0
        for batch, seq in zip(range(batch_size), word_len):
            char_feature[batch,:seq,:] = char_masked[w_idx:w_idx+seq,:]
            w_idx += seq
            
        char_feature = self.char_mlp(char_feature) if self.mlp_use else char_feature

        return char_feature
    
    
    def init_hidden(self, batch_size):
        '''Create initial hidden state of zeros: 2-tuple of num_layers x batch size x hidden dim'''
        num_layers = self.layer_size * self.directions
        init = torch.zeros(self.layer_size, batch_size, self.hidden_size).to(self.device)
        init = torch.nn.init.xavier_normal_(init)
        h0 = (init, init.clone())



class XLMRobertaBaseline(torch.nn.Module):
    def __init__(self):
        super(XLMRobertaBaseline, self).__init__()
        
        self.bert = transformers.XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.dropout = torch.nn.Dropout(0.33)
        self.classfier = torch.nn.Linear(768, 2)
        
        num_pos = 31
        num_ner = 4
        pos_dim = 50
        ner_dim = 25
        
        self.classfier_pos = torch.nn.Linear(768, 31)
        self.classfier_ner = torch.nn.Linear(768, 4)
        
        self.pos_emb = torch.nn.Embedding(num_pos, pos_dim)
        self.ner_emb = torch.nn.Embedding(num_ner, ner_dim)
        
        
    def forward(self, ids, mask):
        
        o1, o2 = self.bert(ids, mask)
        out = self.dropout(o1)
        
        #Step1: predict POS tags for the entire toekns
        pos_logits = self.classfier_pos(out)
        ner_logits = self.classfier_ner(out)
        
        pos_idx = torch.argmax(pos_logits)
        ner_idx = torch.argmax(ner_logits)
        
        pos_emb = self.pos_emb(pos_idx)
        ner_emb = self.ner_emb(ner_idx)
        
        logits = self.classfier(out)
        
        return logits
        
       

In [39]:
MAX_LEN = 640
tokenizer = transformers.XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
train_dataset = FSBDataset(train_df, tokenizer, MAX_LEN)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, num_workers=1, batch_size=2)
test_dataset = FSBDataset(test_df, tokenizer, MAX_LEN)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, num_workers=1, batch_size=2)

In [14]:
model = XLMRobertaBaseline()
model = torch.nn.DataParallel(model)
model = model.cuda()

In [15]:
optimizer = transformers.AdamW(params=model.parameters(), lr=0.000005)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [16]:
def f1_score(total_pred, total_targ):
    
    p = 0 # (retrived SB and real SB) / retrived SB  # The percentage of (the number of correct predictions) / (the number of predction that system predicts as B-SENT)
    r = 0
    f1= 0
    
    #print("total_pred", total_pred, len(total_pred))
    #print("total_targ", total_targ, len(total_targ))
    
    np_total_pred = np.array(total_pred)
    np_total_tag = np.array(total_targ)
    
    #precision
    incidence_nopad = np.where(np_total_tag != -1) ## eliminate paddings
    #print("incidence_nopad", incidence_nopad)
    
    np_total_pred_nopad = np_total_pred[incidence_nopad]
    np_total_tag_nopad = np_total_tag[incidence_nopad]
    
    incidence_nopad_sb = np.where(np_total_pred_nopad == 1)
    np_total_pred_nopad_sb = np_total_pred_nopad[incidence_nopad_sb]
    np_total_tag_nopad_sb = np_total_tag_nopad[incidence_nopad_sb]
    
    count_active_tokens_p = len(np_total_pred_nopad_sb)
    count_correct_p = np.count_nonzero((np_total_pred_nopad_sb==np_total_tag_nopad_sb) == True)
    
    '''
    np_total_pred_incid = np_total_pred[incidence_p]
    print("np_total_pred_incid", np_total_pred_incid)
    ids_sb_pred_p = np.where(np_total_pred_incid==1)
    np_total_pred_p = np_total_pred_incid[ids_sb_pred_p]
    np_total_tag_p = np_total_tag[ids_sb_pred_p]
    
    print("ids_sb_pred_p", ids_sb_pred_p)
    print("np_total_pred_p", np_total_pred_p)
    print("np_total_tag_p", np_total_tag_p)
    
    count_active_tokens_p = len(np_total_pred_p)
    count_correct_p = np.count_nonzero((np_total_pred_p==np_total_tag_p) == True)
    '''
    
    print("count_correct_p", count_correct_p)
    print("count_active_tokens_p", count_active_tokens_p)
    
    p = count_correct_p/count_active_tokens_p
    print("precision:", p)

    
    #recall
    ids_sb_pred_r = np.where(np_total_tag==1)
    np_total_pred_r = np_total_pred[ids_sb_pred_r]
    np_total_tag_r = np_total_tag[ids_sb_pred_r]
    
    #print("ids_sb_pred_r", ids_sb_pred_r)
    #print("np_total_pred_r", np_total_pred_r)
    #print("np_total_tag_r", np_total_tag_r)
    
    count_active_tokens_r = len(np_total_pred_r)
    count_correct_r = np.count_nonzero((np_total_pred_r==np_total_tag_r) == True)
    
    print("count_active_tokens_r", count_active_tokens_r)
    print("count_correct_r", count_correct_r)
    
    r = count_correct_r/count_active_tokens_r
    print("recall:", r)
    
    
    #F1
    f1 = 2*(p*r) / (p+r)
    print("F1:", f1)
    
    #count_active_tokens_recall = np.count_nonzero(np.array(total_targ) > -1)
    #print("count_active_tokens_recall", count_active_tokens_recall)
    #count_active_tokens_precision = np.count_nonzero(np.array(total_targ) > -1)
    
    #count_correct = np.count_nonzero((np.array(total_pred)==np.array(total_targ)) == True)
    #print("count_correct",count_correct)
    #print("ACCURACY:", count_correct/count_active_tokens)
    

In [17]:
def train_loop_fn(train_loader, model, optimizer, DEVICE=None, scheduler=None):
    model.train()
    
    total_pred = []
    total_targ = []
    total_loss = []
    
    for idx, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
        optimizer.zero_grad()
        #print(batch['ids'], len(batch['ids']), batch['ids'].size() )
        #print(batch['mask'], len(batch['mask']))
        #print(batch['bpe_head_mask'], len(batch['bpe_head_mask']))
        #print(batch['upos_ids'], len(batch['upos_ids']))

        logists = model(batch['ids'].cuda(), batch['mask'].cuda())
        #print(logists, logists.size())
        #print(batch['upos_ids'], batch['upos_ids'].size())
        #print(logists.view(45,9), logists.view(45,9).size())
        #print(batch['upos_ids'].view(45), batch['upos_ids'].view(45).size())
        b,s,l = logists.size()
        loss = loss_fn(logists.view(b*s,l), batch['target_ids'].cuda().view(b*s))
        total_loss.append(loss.item())
        total_pred.extend(torch.argmax(logists.view(b*s,l), 1).cpu().tolist())
        total_targ.extend(batch['target_ids'].cuda().view(b*s).cpu().tolist())
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
            
        #print("batch",batch)
        #break
    #print(total_pred, len(total_pred))
    #print(total_targ, len(total_targ))
    count_active_tokens = np.count_nonzero(np.array(total_targ) > -1)
    count_correct = np.count_nonzero((np.array(total_pred)==np.array(total_targ)) == True)
    #print("TRAINING ACCURACY:", count_correct/count_active_tokens)
    f1_score(total_pred, total_targ)
    #f1_score(total_pred[2:], total_targ[2:])
    #print(count_active_tokens)
    #print(count_correct)

    
def dev_loop_fn(dev_loader, model, optimizer, DEVICE=None, scheduler=None):
    model.eval()
    
    total_pred = []
    total_targ = []
    total_loss = []
    total_middle_pred = []
    total_middle_targ = []

    with torch.no_grad():
        for idx, batch in tqdm(enumerate(dev_loader), total=len(dev_loader)):

            logists = model(batch['ids'].cuda(), batch['mask'].cuda())
            b,s,l = logists.size()
            #print(b,s,l)
            loss = loss_fn(logists.view(b*s,l), batch['target_ids'].cuda().view(b*s))
            total_loss.append(loss.item())
            total_pred.extend(torch.argmax(logists.view(b*s,l), 1).cpu().tolist())
            total_targ.extend(batch['target_ids'].cuda().view(b*s).cpu().tolist())
            

            logists2 = logists[:,2:,]
            b,s,l = logists2.size()
            #print(b,s,l)
            total_middle_pred.extend(torch.argmax(logists2.contiguous().view(b*s,l), 1).cpu().tolist())
            total_middle_targ.extend(batch['target_ids'][:,2:].cuda().contiguous().view(b*s).cpu().tolist())

    f1_score(total_pred, total_targ)
    f1_score(total_middle_pred, total_middle_targ)

    
    
    #count_active_tokens = np.count_nonzero(np.array(total_targ) > -1)
    #count_correct = np.count_nonzero((np.array(total_pred)==np.array(total_targ)) == True)
    #print("TESTING ACC:", count_correct/count_active_tokens)
    

In [40]:
for idx in range(100):
    train_loop_fn(train_loader, model, optimizer)
    break
    dev_loop_fn(test_loader, model, optimizer)







  0%|          | 0/50 [00:00<?, ?it/s][A[A[A[A[A

[1, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 5, 6, 12, 7, 4, 5, 12, 14]
[2, 2, 2, 2, 2, 2, 2, 2, 2]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 5, 7, 5, 16, 12, 0, 5, 13, 5, 13, 3, 7, 5, 3, 7, 0, 5, 7, 1, 14]
[2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[4, 5, 16, 5, 5, 13, 5, 3, 7, 5, 3, 7, 0, 5, 7, 5, 3, 12, 7, 0, 5, 10, 5, 14]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2]


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/work/.local/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/work/.local/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/work/.local/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/home/work/.local/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/home/work/.local/lib/python3.6/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [9] at entry 0 and [21] at entry 1


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[5, 1, 6, 7, 0, 5, 18, 7, 0, 5, 7, 1, 14]
[2, 1, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[5, 1, 6, 10, 5, 1, 3, 13, 3, 5, 10, 1, 7, 1, 3, 21, 10, 5, 3, 14]
[2, 1, 2, 2, 2, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 5, 6, 7, 5, 20, 0, 5, 19, 16, 5, 7, 4, 5, 14]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [None]:
'''
for batch in train_loader:
    
    model.train()
    
    print(batch)
    logits = model(batch['ids'], batch['mask'])
    print(logits)
    
    break
    
    '''

In [None]:
tmp = train_df.groupby('sent_num').max()
print(tmp)

In [None]:
tt = train_df[train_df['sent_num'] == 12025]

In [None]:
tt

In [None]:
tt.head(60)

In [None]:
tt.tail(60)

In [None]:
def build_voca(self, file):
    
    with open(file, encoding='UTF-8') as f:
        f.readline()
    
    return voca

In [None]:
file = test_file

In [None]:
with open(file, encoding='UTF-8') as f:
    print(f.readline())

In [None]:
word_dist = set(list(test_df[0]))

In [None]:
from collections import defaultdict
from collections import Counter
charsCount = Counter()

In [None]:
pos_list = list(train_df[2].unique())

In [None]:
pos_dic = {pos:idx for idx, pos in enumerate(pos_list)}

In [None]:
pos_dic

In [None]:
for word in word_dist:
    charsCount.update(list(word))


In [None]:
charsCount