# Read data into Table with BERT 

In [2]:
# ----------------------------------------------------------------------
# Training configuration
# ----------------------------------------------------------------------

# True  : continue training from a previously saved model
# False : start from the pretrained model `mdl_name`
resume_train = True
# starting epoch of a resumed run (stored below as `epochs_start`)
EPOCH_RESUME = 300

# KAGGLE mode : name of the input dataset holding the previous outputs
DATA_SOURCE_KAGGLE = 'readtable-bert-resume-training-output-1000-epochs'
# LOCAL mode : nothing to set, the last run folder is found automatically

# Q/A data file used for training / evaluation
DATA_TRAINING = 'train_data_qa_kcdc_to_compare_2.json'

# Model to use (alternative kept for reference: 'bert' / 'bert-base-uncased')
mdl_type = 'distilbert'
mdl_name = 'distilbert-base-uncased-distilled-squad'

# Optimisation settings
learning_rate = 4e-5
epochs = 10000           # number of training epochs
max_seq_length = 512
doc_stride = 128
train_batch_size = 12

# value written in the "epochs_start" column of the score table
epochs_start = EPOCH_RESUME

# Arguments handed to the model for training / evaluation
ARGS_DEFAULT = dict(
    num_train_epochs=epochs,
    reprocess_input_data=True,
    overwrite_output_dir=True,
    use_cuda=True,
    fp16=False,
    save_steps=15000,
    save_model_every_epoch=False,
    learning_rate=learning_rate,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    n_best_size=8,
    max_answer_length=10,
    null_score_diff_threshold=100.0,
    train_batch_size=train_batch_size,
    silent=True,
    warmup_ratio=0,
)

## Import

In [3]:
%matplotlib inline

# for figure
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import seaborn as sns
sns.set(color_codes=True, font_scale=1.33)

# useful
import time
import datetime
import re

# data management
import pandas as pd
import numpy as np
import numpy.ma as ma
import json

import shutil
import os
import subprocess
import platform
import socket
import logging

# Machine learning
# for training
try:
    import simpletransformers
except:
    !pip install --upgrade transformers
    !pip install simpletransformers
    
from simpletransformers.question_answering import QuestionAnsweringModel

import tensorflow as tf
import tensorboard
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

# for cross validation
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

## Definitions

In [4]:
# ----------------------------------------------------------------------
# Paths and constants. Default values are for a LOCAL run; the block
# further down overrides them when the notebook runs on Google Colab or
# Kaggle (detected from the current working directory).
# ----------------------------------------------------------------------

# machine name : get_last_run_folder() uses it to select the run folders
# created on this machine
HOSTNAME = socket.gethostname()
# your data folder
PATH_FOLDER_SAVE = '../../data'
# folder used during training (big amount of data) : train_model_qa()
# changes directory to it by default
PATH_FOLDER_TRAIN = PATH_FOLDER_SAVE
# current working directory when this cell is executed
path_current = os.getcwd()
# your code folder LOCAL : helpers change directory to it before using
# the paths they are given
PATH_FOLDER_CODE = path_current
# folder where the saved model is stored
PATH_FOLDER_MODEL_SAVED = PATH_FOLDER_SAVE + '/model_saved'
# list of model files copied by save_model_folder()
LIST_FILES_TO_SAVE = ["config.json",
                      "nbest_predictions_test.json", 
                      "null_odds_test.json",
                      "predictions_test.json",
                      "pytorch_model.bin" ,
                      "special_tokens_map.json",
                      "tokenizer_config.json", 
                      "training_args.bin",
                      "vocab.txt",
                     "optimizer.pt",
                     "scheduler.pt"]
# path of DATA
#PATH_DF_FAKE_NEWS = PATH_FOLDER_SAVE + '/df_fake_news.pkl'
#PATH_DF_TRUE_NEWS = PATH_FOLDER_SAVE + '/df_true_news.pkl'

# fraction of the data used for training (the rest is used for test)
train_percent = 0.7
# maximum number of epochs limited by disk space (1 epoch = 1Gb of data)                    
#MAX_NB_EPOCHS = 33 
# random state for training and other stuff
RANDOM_STATE = 0

# save df news
#PATH_DF_NEWS_SAVE = PATH_FOLDER_SAVE + '/df_news.pkl' 
# save df results
#PATH_DF_RES_SAVE = PATH_FOLDER_SAVE + '/df_res.pkl'

# Detect the run environment from the working directory and set:
#   MODE_RUN          : 'GOOGLE', 'KAGGLE' or 'LOCAL'
#   PATH_QA_KCDC      : location of the Q/A data (file path or URL)
#   PATH_DF_SCORE     : score table read by add_score()
#   PATH_DF_SCORE_OUT : score table written by add_score()
# NOTE: PATH_FOLDER_MODEL_RESUME is only defined in the KAGGLE branch.
if re.match("^/content", os.getcwd()):
    # Google Colab : code and data live on the mounted Google Drive
    MODE_RUN = 'GOOGLE'
    print("GOOGLE COLAB MODE")
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    # your code folder GOOGLE
    PATH_FOLDER_CODE = \
      "/content/drive/My Drive/Coronavirus/code/coronavirusModel"
    # path of model  during training : big amount of data :    
    PATH_FOLDER_TRAIN = '/content/'
    PATH_FOLDER_SAVE = '/content/drive/My Drive/Coronavirus/data'
    PATH_FOLDER_MODEL_SAVED = '/content/drive/My Drive/outputs_bert_temp'
    os.chdir(PATH_FOLDER_CODE)
    # install libs for colab
    !pip install tokenizers==0.5.2
    !pip install simpletransformers
    PATH_QA_KCDC = PATH_FOLDER_CODE + '/' + DATA_TRAINING
    PATH_DF_SCORE = PATH_FOLDER_SAVE + '/' + 'df_read_table_score.csv'
    PATH_DF_SCORE_OUT = PATH_FOLDER_SAVE + '/' + 'df_read_table_score.csv'
elif re.match('/kaggle/working', os.getcwd()):
    # Kaggle : inputs are read from /kaggle/input/<dataset>, outputs are
    # written into /kaggle/working
    MODE_RUN = 'KAGGLE'
    print("KAGGLE MODE")
    PATH_FOLDER_MODEL_RESUME = '/kaggle/input/' + DATA_SOURCE_KAGGLE + \
        '/outputs'
    PATH_FOLDER_CODE = '/kaggle/working'
    PATH_FOLDER_TRAIN = '/kaggle/working'
    PATH_FOLDER_SAVE = '/kaggle/working'
    PATH_FOLDER_MODEL_SAVED = PATH_FOLDER_SAVE + '/model_saved'
    # the Q/A data is downloaded from the GitHub repository
    PATH_QA_KCDC = 'https://raw.githubusercontent.com/jeugregg/' + \
        'coronavirusModel/master' + '/' + DATA_TRAINING
    # if resume : the previous score table comes from the input dataset
    PATH_DF_SCORE = '/kaggle/input/' + DATA_SOURCE_KAGGLE + \
                    '/' + 'df_read_table_score.csv'
    PATH_DF_SCORE_OUT = PATH_FOLDER_SAVE + '/' + 'df_read_table_score.csv'
        
else: 
    # Local machine : the score table is read and written at the same place
    MODE_RUN = 'LOCAL'
    PATH_QA_KCDC = PATH_FOLDER_CODE + '/' + DATA_TRAINING
    PATH_DF_SCORE = PATH_FOLDER_SAVE + '/' + 'df_read_table_score.csv'
    PATH_DF_SCORE_OUT = PATH_DF_SCORE
    
# PATH_QA_KCDC (set in each branch above) is the path to the Q/A data for
# train/eval of the model : fixed version 2020-04-12


# base columns of the score table, see create_data_score()
LIST_COL_SCORE = ['date', 'mdl_type', 'mdl_name', 'learning_rate','epochs_start', 'epochs',
                 'acc_train', 'acc_test']

# your run folder : scanned by get_last_run_folder() and used as
# TensorBoard log directory
PATH_FOLDER_RUNS = PATH_FOLDER_SAVE + '/runs'

## Helper functions

In [5]:
def get_compute_time(foldername):
    '''
    Get number of seconds to compute training in a run subfolder.

    The start of the computation is the Unix timestamp embedded in the name
    of the events file (``events.out.tfevents.<timestamp>.<...>``); the end
    is the last modification time of that same file.

    Parameters
    ----------
    foldername : str
        Path of the run folder to scan.

    Returns
    -------
    int
        Whole number of seconds between start and last modification.
        Durations longer than one day are fully counted.

    Raises
    ------
    FileNotFoundError
        If no events file is found in ``foldername``.
    '''
    pattern_ts = re.compile(r'(?<=^events\.out\.tfevents\.)\d+(?=\.)')
    # scan the run folder to find an events file with a timestamp
    with os.scandir(foldername) as entries:
        for entry in entries:
            found = pattern_ts.search(entry.name)
            if found is None:
                continue
            # time of start of calculation (seconds since the epoch)
            ts_start = int(found.group(0))
            # time of end of calculation : last modification of the file,
            # also expressed in seconds since the epoch
            ts_end = os.path.getmtime(entry.path)
            # plain difference of the two timestamps : unlike
            # timedelta.seconds it does not drop the whole days
            return int(ts_end - ts_start)
    raise FileNotFoundError(
        'No events.out.tfevents file found in {}'.format(foldername))

# save before scraping
def clean_file(path_file_name):
    '''
    Archive a file already processed : rename it with the current date.

    ``name.ext`` becomes ``name_YYYYMMDD_HH_MM_SS.ext``; a file without
    extension simply gets the date suffix appended. This is a best-effort
    helper: it reports problems with a message and never raises.

    Parameters
    ----------
    path_file_name : str
        Path of the file to rename.

    Returns
    -------
    None
    '''
    if not os.path.exists(path_file_name):
        print('File {} does not exist!'.format(path_file_name))
        return

    str_date = datetime.datetime.now().strftime("_%Y%m%d_%H_%M_%S")
    # splitext also handles names without extension and folders whose
    # name contains a dot
    path_root, extension = os.path.splitext(path_file_name)
    path_file_name_saved = path_root + str_date + extension

    try:
        shutil.move(path_file_name, path_file_name_saved)
    except OSError as err:
        # the file exists but cannot be moved (rights, locked file...) :
        # say so instead of pretending that it does not exist
        print('File {} could not be moved: {}'.format(path_file_name, err))
    else:
        print('File {} moved!'.format(path_file_name_saved))
        
def get_last_run_folder(path=PATH_FOLDER_RUNS, hostname=HOSTNAME):
    '''
    Get name folder for last run done on this machine.

    Every subfolder found under ``path`` (recursively) whose path contains
    ``hostname`` is a candidate; the one with the most recent modification
    time is returned.

    Parameters
    ----------
    path : str
        Folder containing the run subfolders. The working directory is
        switched to PATH_FOLDER_CODE while it is scanned, so a relative
        path is resolved from there.
    hostname : str
        Text that a folder path must contain to be selected (plain
        substring, not a regular expression).

    Returns
    -------
    str or None
        Path of the most recent matching folder, as yielded by os.walk;
        None if no folder matches or if the folders cannot be accessed.
    '''
    path_current = os.getcwd()
    folder_max = None
    try:
        os.chdir(PATH_FOLDER_CODE)
        # os.walk yields `path` itself first : keep only its subfolders
        list_folder = [x[0] for x in os.walk(path)][1:]
        # keep date and folder together so that the most recent date always
        # designates the right folder, even if some folders are filtered out
        list_candidates = [(os.path.getmtime(folder_curr), folder_curr)
                           for folder_curr in list_folder
                           if hostname in folder_curr]
        if list_candidates:
            folder_max = max(list_candidates, key=lambda item: item[0])[1]
    except OSError:
        folder_max = None
    finally:
        # always go back to the initial working directory
        os.chdir(path_current)
    return folder_max

def get_acc(result):
    '''
    Accuracy of an evaluation : number of "correct" answers divided by the
    total number of answers ("correct" + "similar" + "incorrect").
    '''
    nb_total = result["correct"] + result["similar"] + result["incorrect"]
    return result["correct"] / nb_total
def create_data_score(model=None):
    '''
    Create a one-row DataFrame describing the last training.

    The row starts with the LIST_COL_SCORE columns, then receives the run
    description : source and output folders, hyper-parameters, accuracies,
    data file name and computation time of the last run folder.

    NOTE: the accuracies are computed from the module-level variables
    ``result_train`` and ``result_test``, which must exist when this
    function is called.

    Parameters
    ----------
    model : optional
        If given, model type, source path and hyper-parameters are read from
        ``model.args``; otherwise the notebook configuration variables
        (mdl_type, mdl_name, learning_rate, ...) are used.

    Returns
    -------
    pandas.DataFrame
        DataFrame with a single row (index 0).
    '''
    # date of the record
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # empty row with the base columns
    df_score = pd.DataFrame(columns=LIST_COL_SCORE, index=[0])
    # last run folder of this machine
    folder_run = get_last_run_folder()

    # values come from the notebook configuration when no model is given
    use_config = model is None
    row = {
        "date": date,
        "path_src": (mdl_name if use_config
                     else os.path.basename(model.args["model_name"])),
        "path_out": os.path.basename(folder_run),
        "mdl_type": mdl_type if use_config else model.args["model_type"],
        "mdl_name": mdl_name,
        "learning_rate": (learning_rate if use_config
                          else model.args["learning_rate"]),
        "max_seq_length": (max_seq_length if use_config
                           else model.args["max_seq_length"]),
        "doc_stride": doc_stride if use_config else model.args["doc_stride"],
        "train_batch_size": (train_batch_size if use_config
                             else model.args["train_batch_size"]),
        "epochs_start": epochs_start,
        "epochs": epochs if use_config else model.args["num_train_epochs"],
        "acc_train": get_acc(result_train),
        "acc_test": get_acc(result_test),
        "data": DATA_TRAINING,
        "timing_train": get_compute_time(folder_run),
    }
    # fill the columns one by one, in the order listed above
    for col_name, value in row.items():
        df_score[col_name] = value

    return df_score

def add_score(model=None, flag_save=True):
    '''
    Add last results of training into the score table and save it.

    The existing table is read from PATH_DF_SCORE (if the file exists) and
    the row built by create_data_score(model) is appended to it.

    Parameters
    ----------
    model : optional
        Passed to create_data_score().
    flag_save : bool
        If True, the table is written to PATH_DF_SCORE after the previous
        file has been renamed with clean_file(), and also to
        PATH_DF_SCORE_OUT when that path is different.

    Returns
    -------
    pandas.DataFrame
        The updated score table.
    '''
    # read data CSV
    if os.path.exists(PATH_DF_SCORE):
        df_score = pd.read_csv(PATH_DF_SCORE)
        df_add = create_data_score(model)
        # DataFrame.append has been removed in pandas 2.0 : pd.concat gives
        # the same result and also works with older versions
        df_score = pd.concat([df_score, df_add], ignore_index=True)
    else:
        df_score = create_data_score(model)

    # save
    if flag_save:
        try:
            # save old csv file
            clean_file(PATH_DF_SCORE)
            # save csv
            df_score.to_csv(PATH_DF_SCORE, index=False)
        except OSError:
            # e.g. read-only input folder : only file-system errors are
            # expected here, anything else must stay visible
            print("No rights to write here")
        # if we want to write in another output folder or filename,
        # it is possible (for kaggle for example, output is not the same
        # folder than input)
        if PATH_DF_SCORE_OUT != PATH_DF_SCORE:
            # save old csv file
            clean_file(PATH_DF_SCORE_OUT)
            # save csv
            df_score.to_csv(PATH_DF_SCORE_OUT, index=False)

    return df_score

def display_missing(df):
    '''
    Show the missing values of a DataFrame as a horizontal bar plot.

    One bar per column, labelled "<column> [<number of missing values>]",
    whose length is the percentage of missing lines (axis from 0 to 100).

    Returns the DataFrame used for the plot : one line per column of `df`
    (sorted by increasing number of missing values) with the columns
    "nb_missing" and "ratio" (in %).
    '''
    nb_lignes = df.shape[0]

    # number of missing values per column, as a one-column DataFrame
    df_missing = (
        df.isnull()
          .sum()
          .sort_values()
          .to_frame()
          .rename(index=str, columns={0: "nb_missing"})
    )
    df_missing["ratio"] = 100*df_missing["nb_missing"] / nb_lignes

    plt.figure(figsize=(7, 7))
    # bar labels : column name followed by its number of missing values
    liste_name_bars_num = [
        "{} [{}]".format(name, df_missing.loc[name, "nb_missing"])
        for name in df_missing.index.values
    ]
    ax = sns.barplot(y=liste_name_bars_num, x='ratio', data=df_missing)
    plt.title('Missing lines Ratio over {} lines'.format(nb_lignes))
    ax.set(xlabel='repartition [%]')
    ax.set_xlim([0, 100])
    return df_missing

def save_model_folder(path_folder_source, path_folder_dest):
    '''
    Save model folder to be reused.

    Copies every file of LIST_FILES_TO_SAVE from ``path_folder_source`` to
    ``path_folder_dest``. The working directory is switched to
    PATH_FOLDER_CODE during the copy (and restored afterwards), so relative
    paths are resolved from there. A file that cannot be copied is reported
    and the copy goes on with the next one.

    Parameters
    ----------
    path_folder_source : str
        Folder containing the model files.
    path_folder_dest : str
        Destination folder; created, with its missing parents, if needed.

    Returns
    -------
    None
    '''
    print("Saving model...")
    print("Source: ", path_folder_source)
    path_current = os.getcwd()
    try:
        os.chdir(PATH_FOLDER_CODE)
        # makedirs also creates the missing parent folders and accepts an
        # already existing destination
        os.makedirs(path_folder_dest, exist_ok=True)

        for file_curr in LIST_FILES_TO_SAVE:
            path_file_dest = path_folder_dest + '/' + file_curr
            try:
                shutil.copyfile(path_folder_source + '/' + file_curr,
                                path_file_dest)
            except OSError:
                # missing or unreadable file : expected for some of them.
                # A bare except would also swallow a KeyboardInterrupt
                # during the copy of a big file.
                print('{} NOT copied.'.format(file_curr))
            else:
                print('{} copied.'.format(path_file_dest))
    finally:
        # always go back to the initial working directory
        os.chdir(path_current)

def clean_model_folder(path_folder_train):
    '''
    Clean folder where model is trained :
    by deleting  "outputs", "runs", "cache_dir" folders.

    A folder that does not exist is skipped, so that the other ones are
    still deleted.

    Parameters
    ----------
    path_folder_train : str
        Folder containing the subfolders to delete.

    Returns
    -------
    None
    '''
    list_path = ["outputs", "runs", "cache_dir"]
    for path_curr in list_path:
        path_full = os.path.join(path_folder_train, path_curr)
        # nothing to clean if the folder has not been created yet
        if os.path.isdir(path_full):
            shutil.rmtree(path_full)


def count_good_res(result):
    '''Number of correct answers stored in an evaluation result.'''
    nb_correct = result["correct"]
    return nb_correct
    
def train_model_qa(model, train_data, eval_data, 
                   path_folder_train=PATH_FOLDER_TRAIN,
                   path_to_save=PATH_FOLDER_MODEL_SAVED, 
                   nb_retry=3, args=None):
    '''
    Train with BERT model for Question/Answering task.

    train_data is used to train model.
    eval_data is used to eval model and select the best model training.
    They are lists of dictionaries. More info here :
    https://github.com/ThilinaRajapakse/simpletransformers#question-answering

    The training is done nb_retry times in a row from the folder
    path_folder_train. After each run, the model is evaluated on train_data
    and eval_data. When the test accuracy is at least as good as for all
    the previous runs, the files of <path_folder_train>/outputs are copied
    into path_to_save with save_model_folder().

    NOTE: the same `model` object is given to every run; it is not
    re-created between two runs.

    Parameters
    ----------
    model : object
        Model providing train_model() and eval_model().
    train_data, eval_data : list of dict
        Training and evaluation samples.
    path_folder_train : str
        Working directory during training (created if missing). The initial
        working directory is always restored at the end.
    path_to_save : str
        Folder where the best model is saved. With 'auto', the folder
        returned by get_last_run_folder() after the first training is used.
    nb_retry : int
        Number of successive trainings.
    args : dict, optional
        Training arguments passed to model.train_model().

    Returns
    -------
    tuple
        (result_best_train, text_best_train, result_best_test,
        text_best_test) of the best run; four None if nb_retry is 0.

    Raises
    ------
    Any exception raised during training or evaluation is propagated to the
    caller (after the working directory has been restored).
    '''
    path_current = os.getcwd()
    path_outputs = path_folder_train + '/outputs'
    # results of the best run
    result_best_train = text_best_train = None
    result_best_test = text_best_test = None
    list_acc_train = []
    list_acc_test = []
    try:
        # change directory to store temporary model files during training
        os.makedirs(path_folder_train, exist_ok=True)
        os.chdir(path_folder_train)
        print("Training from folder : ", path_folder_train)
        for num_serie in range(0, nb_retry):
            print("Run #", num_serie)
            # Train the model
            if args is None:
                model.train_model(train_data)
            else:
                model.train_model(train_data, args=args)

            # the run folder only exists once the training is done
            if path_to_save == 'auto':
                path_to_save = get_last_run_folder()
            # eval train
            result_train, text_train = model.eval_model(train_data)
            print("Eval TRAIN : ")
            print(result_train)
            acc_train = get_acc(result_train)
            print("acc_train: ", acc_train)
            list_acc_train.append(acc_train)
            print('-------------------')
            # eval test
            result_test, text_test = model.eval_model(eval_data)
            print("Eval TEST : ")
            print(result_test)
            acc_test = get_acc(result_test)
            print("acc_test: ", acc_test)
            list_acc_test.append(acc_test)
            print('-------------------')
            # best test accuracy so far (ties included) : keep this model
            if (acc_test >= np.max(list_acc_test)):
                # Save model
                save_model_folder(path_outputs, path_to_save)
                result_best_train = result_train
                text_best_train = text_train
                result_best_test = result_test
                text_best_test = text_test

        print("---------------------")
        print("final results : TRAIN")
        print("list_acc_train: ", list_acc_train)   
        print("best result train: ", result_best_train)
        print("best acc_train: ",
              np.max(list_acc_train) if list_acc_train else None)
        print("---------------------")
        print("final results : TEST")
        print("list_acc_test: ", list_acc_test)
        print("best result test: ", result_best_test)
        print("best acc_test: ",
              np.max(list_acc_test) if list_acc_test else None)
    finally:
        # return to path before training. No `return` here : a return in a
        # finally clause would silently discard any exception in progress
        os.chdir(path_current)
    return \
        result_best_train, text_best_train, result_best_test, text_best_test

def process_exists(process_name):
    '''
    Check if process currently exists in OS System Tasklist.

    On Windows the output of ``TASKLIST`` filtered on the image name is
    searched; on the other systems the output of ``ps -A`` is searched.
    In both cases ``process_name`` is looked for as plain text (not as a
    regular expression) in each line of the output.

    Parameters
    ----------
    process_name : str
        Name of the process, e.g. 'tensorboard' or 'tensorboard.exe'.

    Returns
    -------
    bool
        True if at least one line of the process list contains
        ``process_name``.
    '''
    if platform.system() == "Windows":
        call = 'TASKLIST /FI "IMAGENAME eq ' + process_name + '"'
        run_obj = subprocess.run(call, capture_output=True)
    else:
        run_obj = subprocess.run(['ps', '-A'], stdout=subprocess.PIPE)
    out = run_obj.stdout.decode('utf-8', 'backslashreplace')
    # plain substring test : characters such as "." or "+" in the process
    # name must not be interpreted as a regular expression
    return any(process_name in line for line in out.splitlines())

## Test GPU

In [6]:
print(tf.__version__)

2.1.0


In [6]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [8]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [9]:
tf.test.is_built_with_cuda()

True

In [10]:
tf.test.is_built_with_gpu_support()

True

In [11]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 4015238993954903515,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 6657650197
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 12325313235300682946
 physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:00:03.0, compute capability: 6.1"]

## Test Q/A with BERT

### Data

In [10]:
PATH_QA_KCDC

'C:\\Users\\Shadow\\Documents\\CloudStation\\Applications\\python\\CoronaVirus\\code\\coronavirusModel/train_data_qa_kcdc_to_compare_2.json'

In [11]:
# load data : from a URL (KAGGLE mode) or from a local file
if re.match("http", PATH_QA_KCDC):
    # only needed for the download, json is already imported
    import urllib.request
    with urllib.request.urlopen(PATH_QA_KCDC) as url:
        qa_data = json.loads(url.read().decode())
else:
    with open(PATH_QA_KCDC, 'r') as f:
        qa_data = json.load(f)

# separate Train / Test 
print("len qa_data : ", len(qa_data))
# fixed seed : the same split is obtained at each execution
np.random.seed(0)
i_permut = np.random.permutation(len(qa_data))
# number of samples for training. The built-in int() replaces np.int,
# an alias of it which has been removed from NumPy (1.24)
nb_train = int(len(i_permut)*train_percent)

i_permut_train = i_permut[0:nb_train]
print("length i_permut_train : ", len(i_permut_train))
print('i_permut_train : [{} - {}]'.format(0, nb_train-1))
i_permut_test = i_permut[nb_train:]
print("length i_permut_test : ", len(i_permut_test))
print('i_permut_test : [{} - {}]'.format(nb_train, 
    nb_train + len(i_permut_test)-1))

train_data = [ qa_data[i] for i in i_permut_train]
print("len(train_data) :" , len(train_data))
test_data = [ qa_data[i] for i in i_permut_test]
print("len(test_data) :" , len(test_data))
#train_data = qa_data[i_permut_train]
#test_data = qa_data[i_permut_test]
#df_news['train'].iloc[indices_permut_train] = True
#df_news['train'].value_counts()

len qa_data :  109
length i_permut_train :  76
i_permut_train : [0 - 75]
length i_permut_test :  33
i_permut_test : [76 - 108]
len(train_data) : 76
len(test_data) : 33


In [12]:
list_len = []

for dict_curr in qa_data:
    len_curr = len(dict_curr["context"])
    list_len.append(len_curr)
print("max = ", max(list_len))
print("min = ", min(list_len))
print("mean = ", np.mean(list_len))    

max =  9491
min =  671
mean =  3587.605504587156


In [13]:
#clean_model_folder(PATH_FOLDER_TRAIN)

In [14]:
print(PATH_FOLDER_TRAIN)
print(PATH_FOLDER_MODEL_SAVED)
print(train_data[0])

../../data
../../data/model_saved
{'index': 51, 'url': 'https://www.cdc.go.kr/board/board.es?mid=a30402000000&bid=0030&act=view&list_no=367037&tag=&nPage=6', 'context': ' line Period (since 3 Jan) Total Tested positive Being tested Tested negative line line Confirmed Discharged from isolation Under isolation Deceased line line As of 0:00 28 April (Tues) 608,514 10,752 8,854 1,654 244 9,203 588,559 line line As of 0:00 29 April (Wed) 614,197 10,761 8,922 1,593 246 8,307 595,129 line line Difference (+)5,683 (+)9 (+)68 (-)61 (+)2 (-)896 (+)6,570 line Table: line Total Region/Country Where confirmed Nationality line line China Asia ex-China Europe Americas Africa Australia Point of Entry Community Korean Other line line New 5 1 1 2 1 0 0 3 2 4 1 line line Total 1,061 18 122 456 461 3 1 442 619 966 95 line line (1.7%) (11.5%) (43.0%) (43.4%) (0.3%) (0.1%) (41.7%) (58.3%) (91.0%) (9.0%) line Table: line Region Confirmed cases Other major clusters line line Total Imported cases Clusters Othe

### Train model

#### tensorboard

In [17]:
PATH_FOLDER_RUNS

'../../data/runs'

In [18]:
logs_base_dir = PATH_FOLDER_RUNS

if process_exists('tensorboard.exe'):
    pass
elif process_exists('tensorboard'):
    pass
else:
    print("launch tensorboard process...")
    popen_obj = subprocess.Popen(["tensorboard", "--logdir", logs_base_dir, 
                              "--port", "6006", "--bind_all"])
    # patch to wait process
    time.sleep(30)

# Load the TensorBoard notebook extension
%load_ext tensorboard
if os.path.isdir(PATH_FOLDER_RUNS) == False:
    os.mkdir(PATH_FOLDER_RUNS)
logs_base_dir = PATH_FOLDER_RUNS
print(logs_base_dir)
%tensorboard --logdir {logs_base_dir} --port 6006 --bind_all

launch tensorboard process...
../../data/runs


Reusing TensorBoard on port 6006 (pid 12216), started 0:00:26 ago. (Use '!kill 12216' to kill it.)

In [19]:

#from tensorboard import notebook
#notebook.list() # View open TensorBoard instances
#notebook.display(port=6006, height=1000) # Display tensorboard

#### Load from scratch

In [20]:
# define model
if resume_train == False:
    # load model
    model = QuestionAnsweringModel(mdl_type, mdl_name, ARGS_DEFAULT)

#### Training

In [21]:
# train model & save
if resume_train == False:
    with tf.device("GPU:0"):
        result_train, text_train, result_test, text_test =  train_model_qa(model, 
            train_data, test_data, path_folder_train=PATH_FOLDER_TRAIN,
            path_to_save='auto', nb_retry=1)

In [22]:
if resume_train == False:
    df_score = add_score(model)


In [23]:
if resume_train == False:
    # an expression nested in an `if` is not rendered by the notebook :
    # display() is needed to show the table
    display(df_score)

#### Resume training with last model

In [24]:
if resume_train:
    if MODE_RUN == 'KAGGLE':
        # model files come from the input dataset
        path_model = PATH_FOLDER_MODEL_RESUME
    else:
        # select last model : most recent run folder of this machine
        path_model = get_last_run_folder()

    print("path_model : ", path_model)
    if path_model is None:
        # get_last_run_folder() returns None when it cannot find a folder
        raise FileNotFoundError(
            "No run folder to resume from in {}".format(PATH_FOLDER_RUNS))
    # load model : use the configured model type (mdl_type) rather than a
    # hard-coded one, as done when the last model is tested
    model = QuestionAnsweringModel(mdl_type, path_model, args=ARGS_DEFAULT)

path_model :  ../../data/runs\May26_10-32-51_SHADOW-VAHVOMTP


In [25]:
get_last_run_folder()

'../../data/runs\\May26_10-32-51_SHADOW-VAHVOMTP'

In [None]:
if resume_train == True:
    # train model & save
    with tf.device("GPU:0"):
        result_train, text_train, result_test, text_test =  train_model_qa(model, 
            train_data, test_data, path_folder_train=PATH_FOLDER_TRAIN,
            path_to_save='auto', nb_retry=1)

Training from folder :  ../../data
Run # 0




In [None]:
#save_model_folder(PATH_FOLDER_TRAIN + '/outputs' , get_last_run_folder())

In [32]:
if resume_train == True:
    df_score = add_score(model)

File ../../data/df_read_table_score_20200528_08_58_11.csv moved!


In [33]:
df_score

Unnamed: 0,date,mdl_type,mdl_name,learning_rate,epochs_start,epochs,acc_train,acc_test,path_src,path_out,max_seq_length,doc_stride,train_batch_size,data,timing_train
0,2020-04-12 01:50:29,distilbert,distilbert-base-uncased-distilled-squad,4e-05,0,500,0.028912,0.013889,,,,,,,
1,2020-04-12 16:12:55,distilbert,distilbert-base-uncased-distilled-squad,4e-05,501,500,0.027211,0.00463,,,,,,,
2,2020-04-14 00:40:30,distilbert,distilbert-base-uncased-distilled-squad,1.737e-05,1001,500,0.028219,0.033755,,,,,,,
3,2020-04-14 13:58:27,distilbert,distilbert-base-uncased-distilled-squad,7.465e-06,1501,500,0.031746,0.037975,,,,,,,
4,2020-04-15 03:38:27,distilbert,distilbert-base-uncased-distilled-squad,2.39e-08,2001,500,0.031746,0.037975,,,,,,,
5,2020-04-15 21:36:46,distilbert,distilbert-base-uncased-distilled-squad,4e-05,2501,500,0.029982,0.012658,,,,,,,
6,2020-04-17 15:45:23,distilbert,distilbert-base-uncased-distilled-squad,1e-05,501,500,0.08642,0.059072,,,,,,,
7,2020-04-18 02:05:07,distilbert,distilbert-base-uncased-distilled-squad,1e-05,1001,500,0.08642,0.059072,Apr16_05-25-42_SHADOW-VAHVOMTP,Apr17_01-31-31_SHADOW-VAHVOMTP,,,,,
8,2020-04-18 17:26:58,distilbert,distilbert-base-uncased-distilled-squad,1e-05,501,500,0.084656,0.046414,Apr18_03-37-21_SHADOW-VAHVOMTP,Apr18_03-37-21_SHADOW-VAHVOMTP,,,,,
9,2020-04-19 07:41:57,distilbert,distilbert-base-uncased-distilled-squad,1e-05,2001,500,0.082892,0.050633,Apr18_17-32-50_SHADOW-VAHVOMTP,Apr18_17-32-50_SHADOW-VAHVOMTP,,,,,


In [None]:
38000/3600

In [None]:
26000/100

In [None]:
26000/100*12

In [None]:
26000/100*12/38

In [27]:
#save_model_folder(PATH_FOLDER_TRAIN + '/outputs/checkpoint-150000', 
#                  get_last_run_folder())

Saving model...
Source:  ../../data/outputs/checkpoint-150000
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/config.json copied.
nbest_predictions_test.json NOT copied.
null_odds_test.json NOT copied.
predictions_test.json NOT copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/pytorch_model.bin copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/special_tokens_map.json copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/tokenizer_config.json copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/training_args.bin copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/vocab.txt copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/optimizer.pt copied.
../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP/scheduler.pt copied.


In [None]:
4000/512*38

### Test last model

In [15]:
path_model = get_last_run_folder()
print("path_model : ", path_model)

path_model :  ../../data/runs\May27_12-10-21_SHADOW-VAHVOMTP


In [16]:
# load model
model = QuestionAnsweringModel(mdl_type, get_last_run_folder(), 
                               args=ARGS_DEFAULT)

In [17]:
result_train, text_train = model.eval_model(train_data)
print("Eval TRAIN : ")
print(result_train)
acc_train = get_acc(result_train)
print("acc_train: ", acc_train)
print('-------------------')

Eval TRAIN : 
{'correct': 76, 'similar': 45, 'incorrect': 893}
acc_train:  0.07495069033530571
-------------------


In [22]:
%%time

result_test, text_test = model.eval_model(test_data)
print("Eval TEST : ")
print(result_test)
acc_test  = get_acc(result_test)
print("acc_test: ", acc_test)

Eval TEST : 
{'correct': 31, 'similar': 9, 'incorrect': 465}
acc_test:  0.061386138613861385
Wall time: 1min 30s


In [19]:
result_test

{'correct': 31, 'similar': 9, 'incorrect': 465}

In [20]:
text_test

{'correct_text': {'00697': '120',
  '01666': '145',
  '01679': '47',
  '01036': '45',
  '01222': '53',
  '00203': '76',
  '01602': '121',
  '01083': '42',
  '01171': '53',
  '00486': '15',
  '01235': '40',
  '01475': '46',
  '00640': '22',
  '01525': '45',
  '00968': '45',
  '00845': '40',
  '00082': '8',
  '01492': '47',
  '01357': '680',
  '01050': '46',
  '01337': '40',
  '00476': '114',
  '00223': '60',
  '01273': '53',
  '00060': '9',
  '01611': '47',
  '01645': '47',
  '00998': '41',
  '00952': '138',
  '00659': '41',
  '00611': '33'},
 'similar_text': {'01671': {'truth': '14',
   'predicted': '145',
   'question': 'How many confirmed cases are in Jeju?'},
  '00198': {'truth': '6',
   'predicted': '76',
   'question': 'How many confirmed cases are in Incheon?'},
  '00204': {'truth': '7',
   'predicted': '76',
   'question': 'How many confirmed cases are in Gangwon?'},
  '00291': {'truth': '5',
   'predicted': '65',
   'question': 'How many confirmed cases are in Jeonnam?'},
  '00