Fine-tune a NER model to predict player names, times and teams for the score list, card list and substitution list

### Load training data

In [37]:
import json
# Load the training set: one JSON document per line.
# Iterate over the file rather than calling str.splitlines(): splitlines()
# also breaks on characters such as U+2028/U+2029/U+0085, which may appear
# unescaped inside a JSON string and would cut a record in half.
# Blank lines are skipped instead of raising a JSON decode error.
with open('train/train.jsonl',encoding="utf-8") as f:
    train = [json.loads(jline) for jline in f if jline.strip()]
len(train)

701

In [32]:
def item_to_string(item):
    """Return the article body as one string, each paragraph followed by a newline."""
    paragraphs = item['original_doc']['_source']['body']
    return ''.join(paragraph['text'] + '\n' for paragraph in paragraphs)

In [2]:
# Longest paragraph, counted in whitespace-separated words, among the first
# 30 training items (0 when there is no paragraph at all).
word_counts = (
    len(paragraph['text'].split())
    for record in train[:30]
    for paragraph in record['original_doc']['_source']['body']
)
max_length = max(word_counts, default=0)
print(max_length)

270


## Tokenize and generate NER training data

### Load VnCoreNLP

In [146]:
from vncorenlp import VnCoreNLP
# Word segmenter shared by the rest of the notebook (used by `tokenize`).
# Only the "wseg" annotator is requested; the JVM heap is capped via -Xmx500m.
rdrsegmenter = VnCoreNLP("VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

# Sample input used as a quick sanity check of the segmenter.
text = "Diego Costa"

# Perform word (and sentence) segmentation: the result is a list of
# sentences, each a list of tokens (see the printed output below).
sentences = rdrsegmenter.tokenize(text) 
print(sentences)

[['Diego_Costa']]


In [111]:
def match_arr(sub, arr, idx):
    """Return True if ``sub`` occurs in ``arr`` starting at index ``idx``.

    An empty ``sub`` matches at any in-range position.  A window that does not
    fit inside ``arr`` (negative ``idx`` or running past the end) is reported
    as a non-match instead of raising ``IndexError`` or silently wrapping
    around through negative indexing.
    """
    if idx < 0 or idx + len(sub) > len(arr):
        return False
    return all(sub[i] == arr[idx + i] for i in range(len(sub)))

def find_array(sub,arr):
    """Return the index of the first occurrence of ``sub`` inside ``arr``.

    Returns -1 when either sequence is empty or when ``sub`` does not occur.
    """
    if not len(sub) or not len(arr):
        return -1
    candidate_starts = range(len(arr) - len(sub) + 1)
    return next(
        (start for start in candidate_starts if match_arr(sub, arr, start)),
        -1,
    )

def find_all_array(sub,arr):
    """Return every start index at which ``sub`` occurs inside ``arr``.

    Occurrences may overlap.  An empty ``sub`` yields no matches (consistent
    with ``find_array`` returning -1 for empty input); previously it matched
    at every position, including one past the end of ``arr``.
    """
    width = len(sub)
    if width == 0:
        return []
    return [
        start
        for start in range(len(arr) - width + 1)
        if all(sub[k] == arr[start + k] for k in range(width))
    ]

def ref_id_to_info(ref_id,match_summary):
    ''' Collect the match_summary entries that refer to a given event.

        Input: ref_id - event id (compared as a string)
               match_summary - mapping of category name to either a list of
               entries or a single entry; each entry carries 'ref_event_ids'
        Output: list of (category, entry) pairs, restricted to the
               'score_list', 'card_list' and 'substitution_list' categories,
               in the order the categories appear in match_summary
    '''
    # Normalise once so that the list-valued and the single-entry branches
    # compare the id the same way (the single-entry branch used the raw id).
    wanted = str(ref_id)
    event_categories = {'score_list','card_list','substitution_list'}

    output=[]
    for category, entries in match_summary.items():
        if category not in event_categories:
            continue
        if not isinstance(entries,list):
            entries = [entries]
        output += [(category,entry) for entry in entries if wanted in entry['ref_event_ids']]
    return output


In [155]:
import itertools
from bs4 import BeautifulSoup

def tokenize(text):
    """Segment ``text`` with the shared ``rdrsegmenter`` and return one flat token list.

    The segmenter returns one token list per sentence; the sentence
    boundaries are discarded here.
    """
    flat_tokens = []
    for sentence_tokens in rdrsegmenter.tokenize(text):
        flat_tokens.extend(sentence_tokens)
    return flat_tokens

def generate_ner_annotation(item,idx_item):
    ''' Convert one annotated item into token-level NER tags.

        Input: item - a record with "html_annotation" (one HTML string per
               paragraph), 'original_doc' (paragraph texts) and
               'match_summary'
               idx_item - index of the item, used only in the diagnostic print
        Output: one list per paragraph of (token, tag) pairs, where tag is
               'O', 'B-<category>_<attr>' or 'I-<category>_<attr>'.
               Returns [] for the whole item as soon as one event span
               cannot be located in its paragraph.
    '''
    result=[]

    # html_annotation[idx] is paired with body[idx] by position.
    for idx,html in enumerate(item["html_annotation"]): 
        para_tokens = tokenize(item['original_doc']['_source']['body'][idx]['text'])
        tags = ['O']*len(para_tokens)

        # NOTE(review): no parser is passed to BeautifulSoup, so the parser
        # actually used depends on what is installed - consider pinning one.
        soup = BeautifulSoup(html)
        # Annotated events are <span class="tag" event_id="..."> elements.
        events = soup.find_all("span", {"class": "tag"})
        for e in events:
            event_tokens = tokenize(e.text)
            # find_array returns the FIRST occurrence only, so a span whose
            # text is repeated in the paragraph is anchored at its first match.
            event_pos = find_array(event_tokens,para_tokens)
            if event_pos==-1:
                # The span text could not be aligned with the paragraph
                # tokens: report it and give up on the whole item.
                print(event_tokens,para_tokens)
                print(idx_item, idx,'---------------------------')
                return []

            # All summary entries (score/card/substitution) that reference this event.
            event_info_list = ref_id_to_info(e['event_id'],item['match_summary'])
            for info in event_info_list:
                # info is (category, entry); every non-blank attribute of the
                # entry other than 'ref_event_ids' is searched inside the span.
                for attr in info[1]:
                    if attr!='ref_event_ids' and info[1][attr].strip()!='':
                        attr_tokens = tokenize(info[1][attr])
                        attr_pos_list = find_all_array(attr_tokens,event_tokens)
                        # Tag every occurrence inside the span: first token B-, the rest I-.
                        # Positions are relative to the span, hence the event_pos offset.
                        # A later attribute overwrites tags written by an earlier one.
                        for attr_pos in attr_pos_list:
                            tags[event_pos+attr_pos]='B'+'-'+info[0]+'_'+attr
                            for i in range(1,len(attr_tokens)):
                                tags[event_pos+attr_pos+i]='I'+'-'+info[0]+'_'+attr

        result.append(list(zip(para_tokens,tags)))
    return result


### Generate NER annotation file for train dataset

In [152]:
import codecs
# Write the training set in CoNLL style: one "token tag" pair per line and a
# blank line after every paragraph.
with codecs.open('train.txt', 'w', "utf-8") as f:
    for idx,item in enumerate(train):
        try:
            ner_ann = generate_ner_annotation(item,idx)
        except Exception as exc:
            # Skip the failing item. Falling through would either reuse the
            # previous item's annotation or hit an undefined name.
            print('***********',idx,exc)
            continue
        if len(ner_ann)==0:
            # generate_ner_annotation returns [] when an event span cannot be
            # aligned; skip only that item instead of aborting the export
            # (a `break` here truncated the file at the first such item).
            continue
        for para in ner_ann:
            for token in para:
                f.write('{} {}\n'.format(token[0],token[1]))
            f.write('\n')

24 9 ---------------------------


### Generate NER annotation file for test set (all tokens tagged 'O')

In [None]:
import json,codecs
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('/mnt/d/zaloAI2020/zac2020/data/release/private_test/private_test/private_test.jsonl',encoding="utf-8") as f:
    test = [json.loads(jline) for jline in f if jline.strip()]

cnt=0
write_cnt=0
# Write every test paragraph as "token O" lines followed by a blank line.
# Paragraphs that produce no tokens are not written at all.
with codecs.open('/home/hoanganh/zac2020-IE/ZaloAIChallenge-2020/examples/token-classification/zac/test_private.txt', 'w', "utf-8") as f:
    for idx, item in enumerate(test):
        for para in item['original_doc']['_source']['body']:
            text = para['text']
            try:
                tokens = tokenize(text)
            except Exception as exc:
                print('***********',idx,exc)
                # Fall back to whitespace tokens of THIS paragraph. Previously
                # the stale tokens of the preceding paragraph were written
                # again (or the name was undefined on the first paragraph).
                tokens = text.split()
            if len(tokens)>0:
                write_cnt+=1
                for token in tokens:
                    f.write('{} O\n'.format(token))
                f.write('\n')

## From NER predicted result --> generate match summary information

In [34]:
def ner_output_to_para(file_name,para_file_name='test_pred_para.txt'):
    ''' From a formatted NER output file, generate readable paragraphs with NER tags.

        Input: file_name - prediction file with one "token tag" pair per
               line and a blank line between paragraphs
               para_file_name - output path
        Output: nothing is returned; each paragraph is written as its plain
               tokens with every entity rendered as <type>(tokens), followed
               by a '------------' separator line. The number of paragraphs
               written is printed.

        An entity is closed when an 'O' token, a new 'B-' tag, the end of the
        paragraph or the end of the file is reached, so entities are neither
        dropped nor carried into the next paragraph. An 'I-' tag with no open
        entity starts a new one. A final paragraph without a trailing blank
        line is written as well.
    '''
    para_cnt=0
    para=''
    ner_token=''
    ner_type=''

    def flush_entity():
        # Append the pending entity (if any) to the current paragraph.
        nonlocal para, ner_token, ner_type
        if ner_type!='':
            para+=f'<{ner_type}>({ner_token})'
            ner_type=ner_token=''

    with open(para_file_name,'w',encoding='utf-8') as fw:
        with open(file_name,encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if stripped=='':
                    # Paragraph boundary.
                    flush_entity()
                    para_cnt+=1
                    fw.write(para + '\n------------\n')
                    para=''
                    continue

                token, tag = stripped.split()
                if tag[0]=='I' and ner_type!='':
                    ner_token+=' '+token
                elif tag!='O':
                    # New entity (B- tag, or an I- tag with nothing to continue).
                    flush_entity()
                    ner_type=tag[2:]
                    ner_token=token
                else:
                    flush_entity()
                    para+=f' {token}'

            # The file may end without a blank line after the last paragraph.
            flush_entity()
            if para!='':
                para_cnt+=1
                fw.write(para + '\n------------\n')

    print(para_cnt)

# Render this prediction file as readable paragraphs; the output goes to the
# default para_file_name ('test_pred_para.txt').
ner_output_to_para('/home/hoanganh/zac2020-IE/test_predictions (2).txt')          


### Only generate goal scorers

In [2]:
def para_id_to_item_id(dataset):
    """Return, for every item of ``dataset`` in order, the number of paragraphs in its body."""
    return [len(record['original_doc']['_source']['body']) for record in dataset]
    
def ner_pred_to_scorers(ner_file,para_cnt_list,tag='score_list_player_name'):
    """Collect, per item, the entity strings predicted with ``tag``.

    Input: ner_file - prediction file with one "token label" pair per line
           and a blank line after each paragraph
           para_cnt_list - number of paragraphs of each item, in file order;
           used to decide where one item ends and the next begins
           tag - entity type to extract (without the B-/I- prefix)
    Output: one list of entity strings per item.

    A pending entity is closed at a paragraph boundary, at a new B- tag and at
    the end of the file, so names are no longer lost or attributed to the
    following item. Tokens of a multi-token entity are joined with a space,
    matching ``merge_ner_tokens``. An item still open at the end of the file
    (no trailing blank line) is included in the result.
    """
    para_cnt=0
    item_id=0
    ner_token=''
    scorers_list=[]
    scorers=[]
    para_open=False
    begin_label='B-'+tag
    inside_label='I-'+tag

    with open(ner_file,encoding='utf-8') as f:
        for line in f:
            stripped=line.strip()
            if stripped=='':
                # Paragraph boundary: close any pending entity first.
                if ner_token!='':
                    scorers.append(ner_token)
                    ner_token=''
                para_open=False
                para_cnt+=1
                if para_cnt>=para_cnt_list[item_id]:
                    # Last paragraph of the current item.
                    para_cnt=0
                    item_id+=1
                    scorers_list.append(scorers)
                    scorers=[]
                continue

            para_open=True
            token,label=stripped.split()
            if label==begin_label:
                if ner_token!='':
                    scorers.append(ner_token)
                ner_token=token
            elif label==inside_label:
                ner_token=ner_token+' '+token if ner_token!='' else token
            elif ner_token!='':
                scorers.append(ner_token)
                ner_token=''

    # Flush whatever is still open when the file ends.
    if ner_token!='':
        scorers.append(ner_token)
    if para_open or para_cnt>0:
        scorers_list.append(scorers)
    return scorers_list

In [61]:
import json
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('public_test/public_test.jsonl',encoding="utf-8") as f:
    test = [json.loads(jline) for jline in f if jline.strip()]

# Paragraph count per item, needed to map prediction paragraphs back to items.
test_para_cnt=para_id_to_item_id(test)

scorers_list = ner_pred_to_scorers('/home/hoanganh/zac2020-IE/test_predictions (2).txt',test_para_cnt)

#### Append score list to the submit file with team names and scores

In [16]:
import json
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('submit_scorer.jsonl',encoding="utf-8") as f:
    submit = [json.loads(jline) for jline in f if jline.strip()]

In [84]:
# For every submission record that has predicted names, clear its card_list
# and append one entry per name (time and team left blank) to its score_list.
# NOTE(review): card_list is the list being cleared while score_list is the
# one being extended - confirm which list this cell is meant to rebuild.
for item_idx, item in enumerate(submit):
    names = scorers_list[item_idx]
    if not len(names):
        continue
    summary = item['match_summary']
    summary['card_list'] = []
    summary['score_list'].extend(
        {'player_name': scorer, 'time': '', 'team': ''} for scorer in names
    )

{'test_id': '21464024',
 'match_summary': {'players': {'team1': 'Chelsea', 'team2': 'Arsenal'},
  'score_board': {'score1': 3, 'score2': 1},
  'score_list': [{'player_name': 'Marcos_Alonso', 'time': '', 'team': ''},
   {'player_name': 'Hazard', 'time': '', 'team': ''},
   {'player_name': 'Cesc_Fabregas', 'time': '', 'team': ''},
   {'player_name': 'Olivier_Giroud', 'time': '', 'team': ''}],
  'card_list': [{'player_name': '', 'time': '', 'team': ''}],
  'substitution_list': [{'player_in': '', 'time': '', 'player_out': ''}]}}

In [20]:
# The segmenter joins the words of a name with '_' (e.g. 'Marcos_Alonso');
# restore plain spaces before submission. Both score_list and card_list are
# cleaned: the loop previously visited card_list only, leaving the scorer
# names with underscores.
for item in submit:
    for list_name in ('score_list','card_list'):
        for goal in item['match_summary'][list_name]:
            goal['player_name']=goal['player_name'].replace('_',' ')


In [21]:
import codecs
# Serialise the submission as JSON Lines: one record per line, non-ASCII
# characters written as-is.
with codecs.open('submit_scorer_card.jsonl', 'w', "utf-8") as f:
    for record in submit:
        serialized = json.dumps(record,ensure_ascii=False)
        f.write(serialized + '\n')

### Generate match summary for score list, card list and substitution list

In [1]:
def merge_ner_tokens(ner_file):
    """Merge B-/I- tagged tokens of a prediction file into whole entities.

    Input: ner_file - one "token label" pair per line, blank line after each
           paragraph
    Output: one list per paragraph of (token_id, ner_type, ner_text) tuples.
           token_id is a running position inside the paragraph that advances
           once per 'O' token and once per entity; ner_type is the label
           without its B-/I- prefix; '_' inside tokens is replaced by a space.

    An entity is closed by an 'O' token, by the next 'B-' tag, by the end of
    the paragraph or by the end of the file. Previously an entity directly
    followed by another 'B-' tag was dropped, and the last paragraph was lost
    when the file did not end with a blank line.
    """
    para_list=[]
    para=[]
    ner_token=''
    ner_type='O'
    token_id=0
    para_open=False  # True once the current paragraph has at least one line

    with open(ner_file,encoding='utf-8') as f:
        for line in f:
            stripped=line.strip()
            if stripped=='':
                # Paragraph boundary: flush the pending entity and reset state.
                if ner_type!='O':
                    para.append((token_id,ner_type,ner_token))
                para_list.append(para)
                para=[]
                ner_token=''
                ner_type='O'
                token_id=0
                para_open=False
                continue

            para_open=True
            token,label=stripped.split()
            if label!='O':
                if label[0]=='B':
                    # A new entity starts; keep the one that was still open.
                    if ner_type!='O':
                        para.append((token_id,ner_type,ner_token))
                    ner_type=label[2:]
                    ner_token=token.replace('_',' ')
                    token_id+=1
                else:
                    ner_token+=' '+ token.replace('_',' ')
            else:
                if ner_type!='O':
                    para.append((token_id,ner_type,ner_token))
                ner_type='O'
                token_id+=1

    # The file may end without a blank line after the last paragraph.
    if para_open:
        if ner_type!='O':
            para.append((token_id,ner_type,ner_token))
        para_list.append(para)

    return para_list

In [2]:
# Per-paragraph entity lists parsed from this prediction file.
ner_para_list = merge_ner_tokens('/home/hoanganh/zac2020-IE/test_predictions (2).txt')

In [3]:
def connect_ner_tokens(ner_para):
    """Group the entities of one paragraph into score/card/substitution records.

    Input: ner_para - list of (token_id, ner_type, text) tuples
    Output: dict with keys 'card_list', 'score_list', 'substitution_list';
           each value is a list of (token_id, record) pairs.

    Player entities create the records; time/team entities are then attached
    to the record of the matching list whose token_id is closest.
    """
    def nearest(ner, ner_list):
        # Index of the element closest to `ner` by token_id; the first one
        # wins on ties. -1 when nothing lies within a distance of 1000.
        best_idx, best_gap = -1, 1000
        for position, candidate in enumerate(ner_list):
            gap = abs(candidate[0] - ner[0])
            if gap < best_gap:
                best_idx, best_gap = position, gap
        return best_idx

    table = {'card_list': [], 'score_list': [], 'substitution_list': []}

    # Pass 1: one record per player entity. Substitution players wait in
    # `pending` until a player of the opposite role shows up to pair with.
    pending = []
    for ner in ner_para:
        pos, kind, text = ner[0], ner[1], ner[2]
        if kind == 'card_list_player_name':
            table['card_list'].append((pos, {"player_name": text, "time": "", "team": ""}))
        elif kind == 'score_list_player_name':
            table['score_list'].append((pos, {"player_name": text, "time": "", "team": ""}))
        elif 'substitution_list_player' in kind:
            if len(pending) == 0 or pending[0][1] == kind:
                pending.append(ner)
                continue
            partner = pending.pop(0)[2]
            if kind == 'substitution_list_player_in':
                record = {"player_in": text, "time": "", "player_out": partner}
            else:
                record = {"player_in": partner, "time": "", "player_out": text}
            table['substitution_list'].append((pos, record))

    # Unpaired substitution players become half-filled records.
    for ner in pending:
        if ner[1] == 'substitution_list_player_in':
            record = {"player_in": ner[2], "time": "", "player_out": ''}
        else:
            record = {"player_in": '', "time": "", "player_out": ner[2]}
        table['substitution_list'].append((ner[0], record))

    # Pass 2: attach time/team entities to the nearest record of their list.
    attribute_targets = {
        'card_list_time': ('card_list', 'time'),
        'card_list_team': ('card_list', 'team'),
        'score_list_time': ('score_list', 'time'),
        'score_list_team': ('score_list', 'team'),
        'substitution_list_time': ('substitution_list', 'time'),
    }
    for ner in ner_para:
        target = attribute_targets.get(ner[1])
        if target is None:
            continue
        list_name, field = target
        idx = nearest(ner, table[list_name])
        if idx != -1:
            table[list_name][idx][1][field] = ner[2]

    return table

In [80]:
import json
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('public_test/public_test.jsonl',encoding="utf-8") as f:
    test = [json.loads(jline) for jline in f if jline.strip()]

# Number of body paragraphs of each test item, in file order.
para_cnt = [len(item['original_doc']['_source']['body']) for item in test]

In [86]:
import json
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('/home/hoanganh/zac2020-IE/submit.jsonl',encoding="utf-8") as f:
    submit = [json.loads(jline) for jline in f if jline.strip()]

# Index into ner_para_list of the first paragraph of the current item.
# NOTE(review): this assumes ner_para_list holds exactly one entry per body
# paragraph, in order - confirm that no paragraph was left out of the
# prediction file (the test-file writer skips paragraphs with no tokens).
first_para_idx=0

for idx, item in enumerate(submit):

    # Start from empty lists, then add the records found in each paragraph.
    for key in ['card_list','score_list','substitution_list']:
        item['match_summary'][key]=[]

    for para_idx in range(first_para_idx,first_para_idx+para_cnt[idx]):
        table = connect_ner_tokens(ner_para_list[para_idx])
        for key in table:
            for ner in table[key]:
                # ner is (token_id, record); only the record is submitted.
                item['match_summary'][key].append(ner[1])
    first_para_idx+=para_cnt[idx]


(39, {'player_name': 'Diop', 'time': '', 'team': ''})
(14, {'player_name': 'Sergio Aguero', 'time': '', 'team': ''})
(23, {'player_name': 'Eric Bailly', 'time': '', 'team': ''})
(45, {'player_name': 'Koscielny', 'time': '', 'team': ''})
(38, {'player_name': 'Eden Hazard', 'time': '', 'team': ''})
(41, {'player_name': 'Facundo Sebastian Roncaglia', 'time': '', 'team': ''})
(3, {'player_name': 'Digne', 'time': '', 'team': ''})
(11, {'player_name': 'Loic Remy', 'time': '', 'team': ''})
(8, {'player_name': 'Marco Verratti', 'time': '', 'team': ''})
(18, {'player_name': 'Joao Cancelo', 'time': '', 'team': ''})


In [88]:
import codecs
# Serialise the submission as JSON Lines: one record per line, non-ASCII
# characters written as-is.
with codecs.open('/home/hoanganh/zac2020-IE/submit_rule_based_relation.jsonl', 'w', "utf-8") as f:
    for record in submit:
        serialized = json.dumps(record,ensure_ascii=False)
        f.write(serialized + '\n')

## Generate private test result from NER predicted files
Same as above, kept as a separate section so the code is easier to run

In [5]:
import json,codecs
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('/mnt/d/zaloAI2020/zac2020/data/release/private_test/private_test/private_test.jsonl',encoding="utf-8") as f:
    test = [json.loads(jline) for jline in f if jline.strip()]

In [7]:
# Number of body paragraphs of each test item, in file order.
para_cnt = [len(record['original_doc']['_source']['body']) for record in test]

In [9]:
def merge_ner_tokens(ner_file):
    """Merge B-/I- tagged tokens of a prediction file into whole entities.

    Input: ner_file - one "token label" pair per line, blank line after each
           paragraph
    Output: one list per paragraph of (token_id, ner_type, ner_text) tuples.
           token_id is a running position inside the paragraph that advances
           once per 'O' token and once per entity; ner_type is the label
           without its B-/I- prefix; '_' inside tokens is replaced by a space.

    An entity is closed by an 'O' token, by the next 'B-' tag, by the end of
    the paragraph or by the end of the file. Previously an entity directly
    followed by another 'B-' tag was dropped, and the last paragraph was lost
    when the file did not end with a blank line.
    """
    para_list=[]
    para=[]
    ner_token=''
    ner_type='O'
    token_id=0
    para_open=False  # True once the current paragraph has at least one line

    with open(ner_file,encoding='utf-8') as f:
        for line in f:
            stripped=line.strip()
            if stripped=='':
                # Paragraph boundary: flush the pending entity and reset state.
                if ner_type!='O':
                    para.append((token_id,ner_type,ner_token))
                para_list.append(para)
                para=[]
                ner_token=''
                ner_type='O'
                token_id=0
                para_open=False
                continue

            para_open=True
            token,label=stripped.split()
            if label!='O':
                if label[0]=='B':
                    # A new entity starts; keep the one that was still open.
                    if ner_type!='O':
                        para.append((token_id,ner_type,ner_token))
                    ner_type=label[2:]
                    ner_token=token.replace('_',' ')
                    token_id+=1
                else:
                    ner_token+=' '+ token.replace('_',' ')
            else:
                if ner_type!='O':
                    para.append((token_id,ner_type,ner_token))
                ner_type='O'
                token_id+=1

    # The file may end without a blank line after the last paragraph.
    if para_open:
        if ner_type!='O':
            para.append((token_id,ner_type,ner_token))
        para_list.append(para)

    return para_list

In [10]:
# Per-paragraph entity lists parsed from this prediction file.
ner_para_list = merge_ner_tokens('/home/hoanganh/zac2020-IE/test_predictions_private.txt')

In [8]:
def connect_ner_tokens(ner_para):
    """Group the entities of one paragraph into score/card/substitution records.

    Input: ner_para - list of (token_id, ner_type, text) tuples
    Output: dict with keys 'card_list', 'score_list', 'substitution_list';
           each value is a list of (token_id, record) pairs.

    Player entities create the records; time/team entities are then attached
    to the record of the matching list whose token_id is closest.
    """
    def nearest(ner, ner_list):
        # Index of the element closest to `ner` by token_id; the first one
        # wins on ties. -1 when nothing lies within a distance of 1000.
        best_idx, best_gap = -1, 1000
        for position, candidate in enumerate(ner_list):
            gap = abs(candidate[0] - ner[0])
            if gap < best_gap:
                best_idx, best_gap = position, gap
        return best_idx

    table = {'card_list': [], 'score_list': [], 'substitution_list': []}

    # Pass 1: one record per player entity. Substitution players wait in
    # `pending` until a player of the opposite role shows up to pair with.
    pending = []
    for ner in ner_para:
        pos, kind, text = ner[0], ner[1], ner[2]
        if kind == 'card_list_player_name':
            table['card_list'].append((pos, {"player_name": text, "time": "", "team": ""}))
        elif kind == 'score_list_player_name':
            table['score_list'].append((pos, {"player_name": text, "time": "", "team": ""}))
        elif 'substitution_list_player' in kind:
            if len(pending) == 0 or pending[0][1] == kind:
                pending.append(ner)
                continue
            partner = pending.pop(0)[2]
            if kind == 'substitution_list_player_in':
                record = {"player_in": text, "time": "", "player_out": partner}
            else:
                record = {"player_in": partner, "time": "", "player_out": text}
            table['substitution_list'].append((pos, record))

    # Unpaired substitution players become half-filled records.
    for ner in pending:
        if ner[1] == 'substitution_list_player_in':
            record = {"player_in": ner[2], "time": "", "player_out": ''}
        else:
            record = {"player_in": '', "time": "", "player_out": ner[2]}
        table['substitution_list'].append((ner[0], record))

    # Pass 2: attach time/team entities to the nearest record of their list.
    attribute_targets = {
        'card_list_time': ('card_list', 'time'),
        'card_list_team': ('card_list', 'team'),
        'score_list_time': ('score_list', 'time'),
        'score_list_team': ('score_list', 'team'),
        'substitution_list_time': ('substitution_list', 'time'),
    }
    for ner in ner_para:
        target = attribute_targets.get(ner[1])
        if target is None:
            continue
        list_name, field = target
        idx = nearest(ner, table[list_name])
        if idx != -1:
            table[list_name][idx][1][field] = ner[2]

    return table

In [13]:
import json
# Iterate over the file rather than calling str.splitlines(), which also
# breaks on U+2028/U+2029/U+0085 that may appear raw inside a JSON string.
with open('/home/hoanganh/zac2020-IE/submit_private.jsonl',encoding="utf-8") as f:
    submit = [json.loads(jline) for jline in f if jline.strip()]

# Index into ner_para_list of the first paragraph of the current item.
# NOTE(review): this assumes ner_para_list holds exactly one entry per body
# paragraph, in order - confirm that no paragraph was left out of the
# prediction file (the test-file writer skips paragraphs with no tokens).
first_para_idx=0

for idx, item in enumerate(submit):
    # Start from empty lists, then add the records found in each paragraph.
    for key in ['card_list','score_list','substitution_list']:
        item['match_summary'][key]=[]

    for para_idx in range(first_para_idx,first_para_idx+para_cnt[idx]):
        table = connect_ner_tokens(ner_para_list[para_idx])
        for key in table:
            for ner in table[key]:
                # Card records are echoed for a quick visual check.
                if key=='card_list':
                    print(ner)
                # ner is (token_id, record); only the record is submitted.
                item['match_summary'][key].append(ner[1])
    first_para_idx+=para_cnt[idx]

(12, {'player_name': 'Sầm Ngọc Đức', 'time': '', 'team': ''})
(39, {'player_name': 'Van Bakel', 'time': '', 'team': ''})
(21, {'player_name': 'Jara', 'time': '', 'team': ''})
(54, {'player_name': 'Jara', 'time': '', 'team': ''})
(16, {'player_name': 'Choum Pisa', 'time': '', 'team': ''})
(14, {'player_name': 'Holding', 'time': '', 'team': ''})
(19, {'player_name': 'Shahrul', 'time': '', 'team': ''})
(5, {'player_name': 'Modric', 'time': '', 'team': ''})
(25, {'player_name': 'Yevhen Khacheridi', 'time': '', 'team': ''})
(22, {'player_name': 'Joaquin Correa', 'time': '', 'team': ''})
(13, {'player_name': 'Sergi Roberto', 'time': '', 'team': ''})
(37, {'player_name': 'Cabral', 'time': '', 'team': ''})


In [16]:
import codecs
# Serialise the submission as JSON Lines: one record per line, non-ASCII
# characters written as-is.
with codecs.open('/home/hoanganh/zac2020-IE/submit_private_rule_based_relation.jsonl', 'w', "utf-8") as f:
    for record in submit:
        serialized = json.dumps(record,ensure_ascii=False)
        f.write(serialized + '\n')