# Prepare data for training a BERT model for Question Answering
- about confirmed cases by city or province


Create a list of dictionaries  
One dict per url  
For each url, the dict keys are 'url', 'context', 'qas'  
Each qas is a list of dicts whose keys are:  
'id', 'is_impossible', 'question', 'answers'  
The answers of each qas dict are a list of dicts with keys: 'text', 'answer_start'  

## import

In [9]:
########
# Import
# built-in libs
import os
import shutil
import re
import json 
import copy
import datetime
# third-party libs
import pandas as pd
import numpy as np


## Definitions

In [10]:
# Input file: KCDC update tables, looked up in the current working directory.
PATH_TABLES_KCDC_UPDATES = '/'.join([os.getcwd(), 'tables_kcdc_updates.json'])

# Output file: question/answer training data, also in the working directory.
PATH_QA_KCDC = '/'.join([os.getcwd(), 'train_data_qa_kcdc.json'])

# Areas for which a question is generated; each name is also used as a
# column label when reading the tables.
LIST_AREA = [
    "Seoul", "Busan", "Daegu", "Incheon", "Gwangju", "Daejeon",
    "Ulsan", "Sejong", "Gyeonggi", "Gangwon", "Chungbuk", "Chungnam",
    "Jeonbuk", "Jeonnam", "Gyeongbuk", "Gyeongnam", "Jeju",
]

# Areas whose name may be written in two parts (e.g. "Jeon-buk").
# Maps the full name to its [first part, second part].
DICT_AREA_SPECIAL_2 = {
    first + second: [first, second]
    for first, second in (
        ("Gyeong", "gi"),
        ("Gang", "won"),
        ("Chung", "buk"),
        ("Chung", "nam"),
        ("Jeon", "buk"),
        ("Jeon", "nam"),
        ("Gyeong", "buk"),
        ("Gyeong", "nam"),
        ("Dae", "jeon"),
    )
}

## Helper functions

In [11]:
def clean_file(path_file_name):
    '''
    Archive an already processed file by renaming it with a timestamp.

    The current local date/time, formatted as "_YYYYmmdd_HH_MM_SS", is
    inserted just before the trailing extension (a dot followed by word
    characters at the very end of the path). When the path has no such
    extension, the timestamp is appended at the end of the path instead.

    Parameters
    ----------
    path_file_name : str
        Path of the file to rename.

    Returns
    -------
    None
        A message is printed whether the file was moved or was missing.
    '''
    if not os.path.isfile(path_file_name):
        print('File {} does not exist!'.format(path_file_name))
        return

    str_date = datetime.datetime.now().strftime("_%Y%m%d_%H_%M_%S")

    # Raw string: '\.' and '\w' are regex escapes, not Python string escapes.
    res_re = re.search(r'\.\w+$', path_file_name)
    if res_re is None:
        # No extension to preserve: just append the timestamp.
        path_file_name_saved = path_file_name + str_date
    else:
        path_file_name_saved = (path_file_name[:res_re.start()]
                                + str_date + res_re.group(0))

    shutil.move(path_file_name, path_file_name_saved)
    print('File {} moved!'.format(path_file_name_saved))
    
def create_search_pattern(area_curr, dict_special=None,
                         input_type="notags"):
    '''
    Build the regular expression used to search an area name in a text.

    An area that is not a key of dict_special is returned unchanged.
    For a special area, the pattern is an alternation of several spellings,
    for example for "Jeonbuk" with input_type "notags":
    "Jeonbuk", "Jeon-buk", or the two parts separated by a CR LF line break
    with an optional hyphen and optional whitespace on each side.
    "Gyeonggi" gets one more alternative because it is sometimes split as
    "Gyeon" / "ggi" instead of "Gyeong" / "gi".
    For any other input_type, the third alternative matches the two parts
    separated by a "</span>...>" tag sequence and followed by "</span>".

    Parameters
    ----------
    area_curr : str
        Area name to search.
    dict_special : dict or None
        Maps a special area name to the list of its parts.
        None (default) means the module-level DICT_AREA_SPECIAL_2, looked up
        at call time.
    input_type : str
        "notags" for plain text; any other value selects the tag variant.

    Returns
    -------
    str
        Regular expression pattern.
    '''
    if dict_special is None:
        dict_special = DICT_AREA_SPECIAL_2

    if area_curr not in dict_special:
        return area_curr

    parts = dict_special[area_curr]
    hyphenated = '-'.join(parts)

    if input_type != "notags":
        return '({})|({})|({}</span>)'.format(
            area_curr, hyphenated, '-{0,1}</span>.+>-{0,1}'.join(parts))

    # '\\s' is the regex whitespace class; '\r\n' are literal CR LF
    # characters, exactly as in the patterns this function always produced.
    split_on_line_break = '-{0,1}\\s{0,1}\r\n\\s{0,1}-{0,1}'.join(parts)

    if area_curr == "Gyeonggi":
        # patch: Gyeonggi is sometimes not separated in the middle
        return '({})|({})|({})|({})'.format(
            area_curr, hyphenated, split_on_line_break,
            'Gyeon-{0,1}\\s{0,1}\\s{0,1}-{0,1}ggi')

    return '({})|({})|({})'.format(area_curr, hyphenated,
                                   split_on_line_break)
    
def find_start_num(context, area_curr, val_curr, index=None, 
                   input_type="notags"):
    '''
    Find the number string located closest after a mention of area_curr.

    All occurrences of the number (written with or without thousands
    separator, e.g. "1,234" or "1234") and all occurrences of the area name
    are located in context. The offset from every area occurrence to every
    number occurrence is computed; negative offsets (number starting before
    the area) are replaced by 1e9. The number occurrence with the smallest
    offset is selected; on ties, the last one in the text is kept.

    Parameters
    ----------
    context : str
        Text to search.
    area_curr : str
        Area name; special spellings come from create_search_pattern.
    val_curr : number
        Value to locate; converted with int().
    index : any, optional
        Only used in the error message.
    input_type : str
        "notags": the number must be preceded by a space (optionally
        followed by CR LF) and followed by a space or CR LF.
        Any other value: the number must be enclosed as ">number<".

    Returns
    -------
    delta_min : numpy integer
        Smallest offset found (1e9 if no number starts after an area).
    start_curr : int
        Position in context of the first character of the selected number.

    Raises
    ------
    ValueError
        If the number or the area is not found in context.
        Any exception is reported with print and re-raised.
    '''
    try:
        num_curr = int(val_curr)

        if input_type == "notags":
            # '\r\n' below are literal CR LF characters inside the pattern.
            str_num_search = \
                '((?<= )|(?<= \r\n))({:,d}|{})((?= )|(?=\r\n))'.format(
                    num_curr, num_curr)
            len_first_mark = 0 # look-behinds are zero-width
        else:
            str_num_search = r'>({:,d}|{})\s*<'.format(num_curr, num_curr)
            len_first_mark = 1 # len('>')

        # positions of every candidate number
        arr_num = np.array(
            [iter_find.start()
             for iter_find in re.finditer(str_num_search, context)])
        if arr_num.size == 0:
            raise ValueError(
                'number {} not found in context'.format(num_curr))

        # positions of every mention of the area (special spellings included)
        str_pat_area = create_search_pattern(area_curr, DICT_AREA_SPECIAL_2,
                                             input_type)
        arr_area = np.array(
            [iter_find.start()
             for iter_find in re.finditer(str_pat_area, context, re.DOTALL)])
        if arr_area.size == 0:
            raise ValueError(
                'area {} not found in context'.format(area_curr))

        # rows: numbers, columns: areas
        arr_delta = arr_num[:, np.newaxis] - arr_area
        # a number located before the area is not a candidate for that area
        arr_delta[arr_delta<0] = 1e9
        # argmin on the flipped row minima keeps the LAST best number on ties
        arr_row_min = np.amin(arr_delta, axis=1)
        i_min = arr_delta.shape[0] - np.argmin(np.flip(arr_row_min)) - 1
        delta_min = np.min(arr_delta)
        start_curr = int(arr_num[i_min] + len_first_mark)
        return delta_min, start_curr
    except Exception:
        print("Error :[{}]:{} : {} ".format(index, area_curr, val_curr))
        raise


def keep_diff_in_list(list_text):
    '''
    Keep only the elements that appear exactly once in list_text.

    Every element that is repeated is dropped (all of its occurrences).

    Returns
    -------
    (list, list)
        The elements kept, in their original order, and their positions
        in list_text.
    '''
    kept_positions = [
        position
        for position, element in enumerate(list_text)
        if list_text.count(element) == 1
    ]
    kept_elements = [list_text[position] for position in kept_positions]
    return kept_elements, kept_positions

def filter_prepare_data_model_qa(list_qa):
    '''
    Filter the QA database built for the Q/A BERT model.

    For each web page, keep only the questions whose answer text differs
    from the answer text of every other question of the same page.
    The reason is that the first character of the answer may be wrongly
    located when several areas have the same number.
    Pages left without any question are dropped.

    Parameters
    ----------
    list_qa : list of dict
        One dict per page with a 'qas' list; the first item of each
        question's 'answers' list provides the 'text' compared.

    Returns
    -------
    list of dict
        Filtered deep copies of the pages; list_qa itself is not modified.
    '''
    list_qa_ok = []
    # work on a deep copy so that the caller's pages keep all their questions
    for dict_curr in copy.deepcopy(list_qa):
        list_text = [q_curr["answers"][0]["text"]
                     for q_curr in dict_curr['qas']]
        # positions of the answers that are unique within this page
        _, list_index = keep_diff_in_list(list_text)
        if list_index:
            dict_curr['qas'] = [dict_curr['qas'][i] for i in list_index]
            list_qa_ok.append(dict_curr)

    return list_qa_ok

def prepare_data_model_qa(df_tables_kcdc_updates, path_json=PATH_QA_KCDC):
    """
    Prepare JSON file for Question/Answering BERT model
    
    - Create list of dictionaries
    - One dict by url
    - For each url, dict keys are 'index', 'url', 'context', 'qas'
    - Each qas is a list of dict with keys : 
        'id', 'is_impossible', 'question', 'answers'
    - Each answers of qas dicts are list of dict with keys : 
        'text', 'answer_start'

    Parameters
    ----------
    df_tables_kcdc_updates : pandas.DataFrame
        One row per web page, with columns "body" (page text), "url" and
        one column per area of LIST_AREA (NaN when there is no value).
    path_json : str
        Path of the JSON file written with the filtered list.

    Returns
    -------
    list_delta_min : list
        Offset returned by find_start_num for every question created
        (before filtering).
    list_qa_ok : list of dict
        Pages and questions kept by filter_prepare_data_model_qa.
    """
    list_qa = []
    list_delta_min = []
    K_id = 0
    # for each url, create a dictionary
    for index in df_tables_kcdc_updates.index:
        # text data from webpage
        context = df_tables_kcdc_updates.at[index, "body"]
        # for each area, create one question dictionary
        list_qas = []
        for area_curr in LIST_AREA:
            val_curr = df_tables_kcdc_updates.at[index, area_curr]
            if np.isnan(val_curr):
                continue
            
            delta_min, start_curr = find_start_num(context, area_curr, 
                                                   val_curr, index, 
                                                   input_type="notags")

            # The answer text must be exactly what is written at start_curr:
            # use the thousands-separated form only if the context really
            # starts with it there. (Looking for a comma in a window after
            # start_curr could pick up the comma of a following number.)
            text_with_comma = '{:,d}'.format(int(val_curr))
            if context.startswith(text_with_comma, start_curr):
                text_val_curr = text_with_comma
            else:
                text_val_curr = '{}'.format(int(val_curr))
            
            K_id += 1
            list_qas.append({
                'id': "{:05}".format(K_id),
                'is_impossible': False,
                "question": "How many confirmed cases are in " + \
                            area_curr + "?",
                'answers': [{'text': text_val_curr, 
                             'answer_start': start_curr}]})
            list_delta_min.append(delta_min)

        # skip pages without any question
        if not list_qas:
            continue
        list_qa.append({'index': index, 
                        'url': df_tables_kcdc_updates.at[index, "url"],
                        'context': context, 
                        'qas': list_qas})
    
    # filter questions with unique answer by webpages
    list_qa_ok = filter_prepare_data_model_qa(list_qa)

    # Save as a JSON file
    with open(path_json, 'w') as f:
        json.dump(list_qa_ok, f)
        
    return list_delta_min, list_qa_ok

def test_prepare_data_model_qa(list_qa):
    '''
    Test function for QA database for Q/A BERT model
    
    Check, for each question of each web page, that the answer text is
    found in the page context at the stored 'answer_start' position.
    Every mismatch is printed.

    Parameters
    ----------
    list_qa : list of dict
        Pages with keys 'index', 'url', 'context' and 'qas'; the first item
        of each question's 'answers' list is checked.

    Returns
    -------
    bool
        True if every answer of every page matches (also for an empty
        list), False as soon as one answer does not match.
    '''
    # initialised once: a failure on any page must survive the later pages
    flag_ok = True
    for dict_curr in list_qa:
        context = dict_curr["context"]
        for q_curr in dict_curr['qas']:
            answer_curr = q_curr["answers"][0]
            text_curr = answer_curr["text"]
            start_curr = answer_curr["answer_start"]
            text_found = context[start_curr:start_curr + len(text_curr)]
            if text_found != text_curr:
                flag_ok = False
                print("ERROR idx:{} Q:{}".format(dict_curr["index"], 
                                                 q_curr["question"]))
                print("answer : ", text_curr)
                print("found  : ", text_found)
                print(dict_curr["url"])

    return flag_ok


# This is a data checker with a required argument, not a unit test:
# keep pytest from collecting it because of its "test_" name.
test_prepare_data_model_qa.__test__ = False

## Load data

In [12]:
# Load the KCDC update tables and sort the articles by publication date.
df_tables_kcdc_updates = (
    pd.read_json(PATH_TABLES_KCDC_UPDATES)
    .sort_values(by=['date_published'])
)
print("Nb. South Korea articles : ", df_tables_kcdc_updates.shape[0])

Nb. South Korea articles :  506


## Process data

In [14]:
# Rename any previous output file with a timestamp, then build the QA data
# and write it to the same path.
clean_file(PATH_QA_KCDC)
list_delta_min, list_qa = prepare_data_model_qa(
    df_tables_kcdc_updates,
    path_json=PATH_QA_KCDC,
)

File /Users/gregory/Documents/CloudStationSinchon/Applications/python/CoronaVirus/code/coronavirusModel/train_data_qa_kcdc_20201113_10_35_28.json moved!


## Test data

In [15]:
# Check that every stored answer text is found in its page context at the
# stored 'answer_start' position.
# The call is left as a bare trailing expression so that the notebook
# displays the boolean it returns.
print("Test data processed : TEST OK ?")
test_prepare_data_model_qa(list_qa)

Test data processed : TEST OK ?


True