# Read data into Table with BERT 

## Installation

In [1]:
!pip install tokenizers==0.5.2

Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 8.6MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.5.2


In [2]:
!pip install simpletransformers

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/e7/75/836693dae9f2fab7dae172c9f7e2e1292898df377e113be06b061f7f9e0a/simpletransformers-0.23.2-py3-none-any.whl (148kB)
[K     |██▏                             | 10kB 26.4MB/s eta 0:00:01[K     |████▍                           | 20kB 5.9MB/s eta 0:00:01[K     |██████▋                         | 30kB 8.5MB/s eta 0:00:01[K     |████████▉                       | 40kB 5.5MB/s eta 0:00:01[K     |███████████                     | 51kB 6.7MB/s eta 0:00:01[K     |█████████████▎                  | 61kB 7.9MB/s eta 0:00:01[K     |███████████████▌                | 71kB 9.0MB/s eta 0:00:01[K     |█████████████████▊              | 81kB 7.1MB/s eta 0:00:01[K     |████████████████████            | 92kB 7.8MB/s eta 0:00:01[K     |██████████████████████          | 102kB 8.6MB/s eta 0:00:01[K     |████████████████████████▎       | 112kB 8.6MB/s eta 0:00:01[K     |██████████████████████████▌    

In [3]:
import tokenizers
tokenizers.__version__

'0.5.2'

## Import

In [4]:
%matplotlib inline

# for figure
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import seaborn as sns
sns.set(color_codes=True, font_scale=1.33)

# for training
from simpletransformers.question_answering import QuestionAnsweringModel
import json
import logging
# to save 
#import pickle


# useful
import pandas as pd
import numpy as np
import numpy.ma as ma
import shutil
import os
import re
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

# for cross validation
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

  import pandas.util.testing as tm


## Definitions

In [5]:
# your data folder
PATH_FOLDER_SAVE = '/content/drive/My Drive/Coronavirus/data'
# path current
path_current = os.getcwd()
# your code folder LOCAL
PATH_FOLDER_CODE = path_current
# your run folder
PATH_FOLDER_RUN = path_current
# path folder saved model
PATH_FOLDER_MODEL_SAVED = '/content/drive/My Drive/outputs_bert_temp'
# list of camemBERT model files to save
LIST_FILES_TO_SAVE = ["config.json",
                      "nbest_predictions_test.json", 
                      "null_odds_test.json",
                      "predictions_test.json",
                      "pytorch_model.bin" ,
                      "special_tokens_map.json",
                      "tokenizer_config.json", 
                      "training_args.bin",
                      "vocab.txt"]
# path of DATA
#PATH_DF_FAKE_NEWS = PATH_FOLDER_SAVE + '/df_fake_news.pkl'
#PATH_DF_TRUE_NEWS = PATH_FOLDER_SAVE + '/df_true_news.pkl'

# % of data used for training
train_percent = 0.7
# maximum number of epochs limited by disk space (1 epoch = 1Gb of data)                    
MAX_NB_EPOCHS = 33 
# random state for training and other stuff
RANDOM_STATE = 0

# save df news
#PATH_DF_NEWS_SAVE = PATH_FOLDER_SAVE + '/df_news.pkl' 
# save df results
#PATH_DF_RES_SAVE = PATH_FOLDER_SAVE + '/df_res.pkl'

# check if Google Colab need Drive ?
if re.match("^/content", os.getcwd()):
    print("GOOGLE COLAB MODE")
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    # your code folder GOOGLE
    PATH_FOLDER_CODE = \
      "/content/drive/My Drive/Coronavirus/code/coronavirusModel"
    PATH_FOLDER_RUN = "/content"  
      
    os.chdir(PATH_FOLDER_CODE)

# path of model  during training : big amount of data :    
PATH_FOLDER_TRAIN = PATH_FOLDER_RUN 
# path to Q/A data for train/eval model
PATH_QA_KCDC = PATH_FOLDER_CODE + '/train_data_qa_kcdc.json'

GOOGLE COLAB MODE
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Helper functions

In [0]:
def display_missing(df):
    '''
    Display dataFrame missing values : barplots & numbers
    ouput : dataFrame containing missing numbers
    '''
    nb_lignes = df.shape[0]
    nb_missing = df.isnull().sum()
    nb_missing = nb_missing.sort_values()
    df_missing = nb_missing.to_frame()
    df_missing.rename(index=str, columns={0: "nb_missing"}, inplace=True)
    df_missing["ratio"] = 100*df_missing["nb_missing"] / nb_lignes
    fig = plt.figure(figsize=(7, 7))
    liste_name_bars = df_missing.index.values
    liste_name_bars_num = []
    for name in liste_name_bars:
        liste_name_bars_num.append("{} [{}]".format(
            name, df_missing.loc[name, "nb_missing"]))
    ax = sns.barplot(y=liste_name_bars_num, x='ratio', data=df_missing)
    plt.title('Missing lines Ratio over {} lines'.format(nb_lignes))
    ax.set(xlabel='repartition [%]')
    ax.set_xlim([0, 100])
    return df_missing

def save_model_folder(path_folder_source, path_folder_dest):
  '''
  Save model folder to be reused
  '''
  
  if os.path.isdir(path_folder_dest) == False:
    os.mkdir(path_folder_dest)
  for file_curr in LIST_FILES_TO_SAVE:
    shutil.copyfile(path_folder_source + '/' + file_curr, 
                    path_folder_dest + '/' + file_curr)

def clean_model_folder(path_folder_train):
    '''
    Clean folder where model is trained : 
    by deleting  "outputs", "runs", "cache_dir" folders.
    '''
    list_path = ["outputs", "runs", "cache_dir"]
    for path_curr in list_path:
        shutil.rmtree(os.path.join(path_folder_train, path_curr))


def count_good_res(result):
    return result["correct"]
    
def train_model_qa(train_data, eval_data, path_folder_train=PATH_FOLDER_TRAIN,
                   path_to_save=PATH_FOLDER_MODEL_SAVED, 
                   nb_max_epochs=MAX_NB_EPOCHS, nb_retry=3, use_cuda=True):
    
    '''
    Train with BERT model for Question/Awsering task
    train_data is used to train model.
    eval_data is used to eval model and select the best model training.
    They are list of dictionnary. More info here :
    https://github.com/ThilinaRajapakse/simpletransformers#question-answering
    
    At each serie, we train over nb_max_epochs epochs into path_to_run folder,
    As accuracy can be different each times, 
    we re-train nb_retry times over nb_max_epochs.
    The best model is save into path_to_save folder.
    By default, we use CUDA to use GPU (use_cuda=True). 
    It can be disabled (use_cuda=False).
    '''


    # activate logging warning messages from transformers lib
    #logging.basicConfig(level=logging.INFO)
    #transformers_logger = logging.getLogger("transformers")
    #transformers_logger.setLevel(logging.WARNING)
    # change directory to store temporary model files during training
    path_current = os.getcwd()
    path_outputs = path_folder_train + '/outputs' 
    try:
        if os.path.isdir(path_folder_train) == False:
            os.mkdir(path_folder_train)
        os.chdir(path_folder_train)
        # loop to train without override disk usage
        list_acc_train = []
        list_acc_test = []
        for num_serie in range(0, nb_retry):
            print("Run #", num_serie)

            # Create the QuestionAnsweringModel
            model = QuestionAnsweringModel('distilbert', 
                                    'distilbert-base-uncased-distilled-squad', 
                                        args={'num_train_epochs': nb_max_epochs,
                                                'reprocess_input_data': True, 
                                                'overwrite_output_dir': True,
                                                'fp16': False})
            # Train the model
            model.train_model(train_data)

            # eval train
            result, text = model.eval_model(train_data)
            print("Eval TRAIN : ")
            print(result)
            print(text)
            print('-------------------')
            acc_train = count_good_res(result)
            print("acc_train: ", acc_train)
            list_acc_train.append(acc_train)

            # eval test
            result, text = model.eval_model(eval_data)
            print("Eval TEST : ")
            print(result)
            print(text)
            print('-------------------')
            acc_test = count_good_res(result)
            print("acc_test: ", acc_test)
            list_acc_test.append(acc_test)

            if (acc_test >= np.max(list_acc_test)):
                # Save model
                 
                save_model_folder(path_outputs, path_to_save)
                result_best = result
        # return to path before training
        print("list_acc_train: ", list_acc_train)  
        print("list_acc_test: ", list_acc_test)    
        print("best result: ", result_best)
        print("best acc_test: ", np.max(list_acc_train))
        print("best acc_test: ", np.max(list_acc_test))
    finally:
        os.chdir(path_current)

## Test Q/A with BERT

### Data

In [7]:
# load data 
with open(PATH_QA_KCDC, 'r') as f:
    qa_data = json.load(f)
# separate Train / Test 
print("len qa_data : ", len(qa_data))
np.random.seed(0)
i_permut = np.random.permutation(len(qa_data))

i_permut_train = i_permut[0:np.int(len(i_permut)*train_percent)]
print("length i_permut_train : ", len(i_permut_train))
print('i_permut_train : [{} - {}]'.format(0, 
                                        np.int(len(i_permut)*train_percent)-1))
i_permut_test = i_permut[np.int(len(i_permut)*train_percent):]
print("length i_permut_test : ", len(i_permut_test))
print('i_permut_test : [{} - {}]'.format(np.int(len(i_permut)*train_percent), 
    np.int(len(i_permut)*train_percent) + len(i_permut_test)-1))

train_data = [ qa_data[i] for i in i_permut_train]
print("len(train_data) :" , len(train_data))
test_data = [ qa_data[i] for i in i_permut_test]
print("len(test_data) :" , len(test_data))
#train_data = qa_data[i_permut_train]
#test_data = qa_data[i_permut_test]
#df_news['train'].iloc[indices_permut_train] = True
#df_news['train'].value_counts()

len qa_data :  51
length i_permut_train :  35
i_permut_train : [0 - 34]
length i_permut_test :  16
i_permut_test : [35 - 50]
len(train_data) : 35
len(test_data) : 16


In [0]:
#clean_model_folder(PATH_FOLDER_TRAIN)

In [8]:
print(PATH_FOLDER_TRAIN)
print(PATH_FOLDER_MODEL_SAVED)
print(train_data[0])

/content
/content/drive/My Drive/outputs_bert_temp
{'index': 47, 'url': 'https://www.cdc.go.kr/board/board.es?mid=a30402000000&bid=0030&act=view&list_no=366512&tag=&nPage=5', 'context': ' \r\n \r\n \r\nPeriod \r\n(since 3 January) \r\n \r\nTotal \r\n \r\nConfirmed cases \r\n \r\nSuspected cases \r\n \r\n \r\n \r\nSub \r\ntotal \r\n \r\ndischarged \r\n \r\nisolated \r\n \r\nDeceased \r\n \r\nSub \r\ntotal \r\n \r\nbeing tested \r\n \r\ntested negative \r\n \r\n \r\n \r\nAs of 0:00, 9. March.  \r\n \r\n196,618 \r\n \r\n7,382 \r\n \r\n166 \r\n \r\n7,165 \r\n \r\n51 \r\n \r\n189,236 \r\n \r\n17,458 \r\n \r\n171,778 \r\n \r\n \r\n \r\nAs of 0:00, 10 March \r\n \r\n210,144 \r\n \r\n7,513 \r\n \r\n247 \r\n \r\n7,212 \r\n \r\n54 \r\n \r\n202,631 \r\n \r\n18,452 \r\n \r\n184,179 \r\n \r\n \r\n \r\nDifferences \r\n \r\n+13,526 \r\n \r\n+131 \r\n \r\n+81 \r\n \r\n+47 \r\n \r\n+3 \r\n \r\n+13,395 \r\n \r\n+994 \r\n \r\n+12,401 \r\n \r\n  \r\n \r\n \r\n\xa0 \r\n \r\nCity  \r\n \r\n \r\n \r\nSeoul \

### Train model

In [12]:
train_model_qa(train_data, test_data, path_folder_train=PATH_FOLDER_TRAIN,
                   path_to_save=PATH_FOLDER_MODEL_SAVED, 
                   nb_max_epochs=4, nb_retry=2, use_cuda=True)

Run # 0


100%|██████████| 547/547 [00:11<00:00, 48.43it/s]


HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=562, style=ProgressStyle(description_…

Running loss: 0.158591



Running loss: 0.081181


HBox(children=(IntProgress(value=0, description='Current iteration', max=562, style=ProgressStyle(description_…

Running loss: 0.002450


HBox(children=(IntProgress(value=0, description='Current iteration', max=562, style=ProgressStyle(description_…

Running loss: 0.004490


HBox(children=(IntProgress(value=0, description='Current iteration', max=562, style=ProgressStyle(description_…

Running loss: 1.178818

TypeError: ignored

### Test saved model 

In [0]:
# load model
model = QuestionAnsweringModel('distilbert', PATH_FOLDER_MODEL_SAVED)

In [16]:
result, text = model.eval_model(train_data)
print("Eval TRAIN : ")
print(result)
print(text)
print('-------------------')
acc_train = count_good_res(result)
print("acc_train: ", acc_train)

100%|██████████| 547/547 [00:14<00:00, 36.69it/s]


HBox(children=(IntProgress(value=0, max=562), HTML(value='')))


Eval TRAIN : 
{'correct': 11, 'similar': 397, 'incorrect': 139}
{'correct_text': {'00091': '317', '00485': '200', '00153': '469', '00219': '624', '00170': '409', '00187': '469', '00638': '337', '00253': '685', '00061': '134', '00126': '1,314', '00202': '514'}, 'similar_text': {'00392': {'truth': '110', 'predicted': '', 'question': 'How many confirmed cases are in Seoul?'}, '00393': {'truth': '86', 'predicted': '', 'question': 'How many confirmed cases are in Busan?'}, '00394': {'truth': '5,533', 'predicted': '', 'question': 'How many confirmed cases are in Daegu?'}, '00395': {'truth': '10', 'predicted': '', 'question': 'How many confirmed cases are in Incheon?'}, '00396': {'truth': '12', 'predicted': '', 'question': 'How many confirmed cases are in Gwangju?'}, '00397': {'truth': '17', 'predicted': '', 'question': 'How many confirmed cases are in Daejeon?'}, '00398': {'truth': '23', 'predicted': '', 'question': 'How many confirmed cases are in Ulsan?'}, '00399': {'truth': '8', 'predict

In [14]:
result_test, text_test = model.eval_model(test_data)
print("Eval TEST : ")
print(result_test)
print(text_test)
print('-------------------')
acc_test  = count_good_res(result_test)
print("acc_test: ", acc_test)

100%|██████████| 218/218 [00:07<00:00, 30.96it/s]


HBox(children=(IntProgress(value=0, max=260), HTML(value='')))


Eval TEST : 
{'correct': 2, 'similar': 187, 'incorrect': 29}
{'correct_text': {'00121': '321', '00017': '131'}, 'similar_text': {'00007': {'truth': '1', 'predicted': '', 'question': 'How many confirmed cases are in Seoul?'}, '00008': {'truth': '6', 'predicted': '', 'question': 'How many confirmed cases are in Busan?'}, '00009': {'truth': '24', 'predicted': '', 'question': 'How many confirmed cases are in Daegu?'}, '00010': {'truth': '2', 'predicted': '', 'question': 'How many confirmed cases are in Gyeonggi?'}, '00112': {'truth': '3', 'predicted': '321', 'question': 'How many confirmed cases are in Incheon?'}, '00116': {'truth': '1', 'predicted': '321', 'question': 'How many confirmed cases are in Sejong?'}, '00123': {'truth': '2', 'predicted': '321', 'question': 'How many confirmed cases are in Jeju?'}, '00545': {'truth': '265', 'predicted': '', 'question': 'How many confirmed cases are in Seoul?'}, '00546': {'truth': '107', 'predicted': '', 'question': 'How many confirmed cases are 

## Test train issue : > 4 epochs

In [15]:
model = QuestionAnsweringModel('distilbert', 
                        'distilbert-base-uncased-distilled-squad', 
                            args={'num_train_epochs': 10,
                                    'reprocess_input_data': True, 
                                    'overwrite_output_dir': True,
                                    'fp16': False})
# Train the model
model.train_model(test_data)

100%|██████████| 218/218 [00:05<00:00, 40.48it/s]


HBox(children=(IntProgress(value=0, description='Epoch', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.804014



Running loss: 0.590177


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.014474


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.070393


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.505450


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.013581


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.550679


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.577241


HBox(children=(IntProgress(value=0, description='Current iteration', max=260, style=ProgressStyle(description_…

Running loss: 0.029604

TypeError: ignored

## Ancien model simple

In [0]:
train_data_example = [
    {
        'url': 'https://www.cdc.go.kr/board/board.es?mid=a30402000000&bid=0030',
        'context': "This is the first context",
        'qas': [
            {
                'id': "00001",
                'is_impossible': False,
                'question': "Which context is this?",
                'answers': [
                    {
                        'text': "the first",
                        'answer_start': 8
                    }
                ]
            }
        ]
    },
    {
        'context': "Other legislation followed, including the Migratory " + \
     "Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting" + \
     "of right and gray whales, and the Bald Eagle Protection Act of 1940. " + \
     "These later laws had a low cost to society—the species were" + \
     " relatively rare—and little opposition was raised",
        'qas': [
            {
                'id': "00002",
                'is_impossible': False,
                'question': "What was the cost to society?",
                'answers': [
                    {
                        'text': "low cost",
                        'answer_start': 225
                    }
                ]
            },
            {
                'id': "00003",
                'is_impossible': False,
                'question': "What was the name of the 1937 treaty?",
                'answers': [
                    {
                        'text': "Bald Eagle Protection Act",
                        'answer_start': 167
                    }
                ]
            }
        ]
    }
]

In [0]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Create dummy data to use for training.
train_data = [
    {
        'context': "This is the first context",
        'qas': [
            {
                'id': "00001",
                'is_impossible': False,
                'question': "Which context is this?",
                'answers': [
                    {
                        'text': "the first",
                        'answer_start': 8
                    }
                ]
            }
        ]
    },
    {
        'context': "Other legislation followed, including the Migratory " + \
     "Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting" + \
     "of right and gray whales, and the Bald Eagle Protection Act of 1940. " + \
     "These later laws had a low cost to society—the species were" + \
     " relatively rare—and little opposition was raised",
        'qas': [
            {
                'id': "00002",
                'is_impossible': False,
                'question': "What was the cost to society?",
                'answers': [
                    {
                        'text': "low cost",
                        'answer_start': 225
                    }
                ]
            },
            {
                'id': "00003",
                'is_impossible': False,
                'question': "What was the name of the 1937 treaty?",
                'answers': [
                    {
                        'text': "Bald Eagle Protection Act",
                        'answer_start': 167
                    }
                ]
            }
        ]
    }
]

# Save as a JSON file
#path_train = PATH_FOLDER_SAVE + '/train.json'
#os.makedirs('data', exist_ok=True)
#with open(path_train, 'w') as f:
#    json.dump(train_data, f)

#def train_qa(train_data, )
os.chdir(PATH_FOLDER_RUN)
try:
    # Create the QuestionAnsweringModel
    model = QuestionAnsweringModel('distilbert', 
                               'distilbert-base-uncased-distilled-squad', 
                               args={'num_train_epochs': 4,
                                     'reprocess_input_data': True, 
                                     'overwrite_output_dir': True,
                                     'fp16': False})

    # Train the model 
    model.train_model(train_data)

    # Evaluate the model. (Being lazy and evaluating on the train data itself)
    #result, text = model.eval_model(path_train)
    result, text = model.eval_model(train_data)
    print(result)
    print(text)

    print('-------------------')
    # Making predictions using the model.
    to_predict = [
    {'context': 'This is the context used for demonstrating predictions.', 
    'qas': [{'question': 'What is this context?', 'id': '0'}]
    }]

    print(model.predict(to_predict))

finally:
    print("return to code folder")
    os.chdir(PATH_FOLDER_CODE)

INFO:filelock:Lock 140003264628944 acquired on /root/.cache/torch/transformers/e88f38f2c8bc669ef7873de68f36bf764d4f64b9833ca8401efe271aab476745.6e56d80621e979ea15e2bdea57e8e855ff964f9c6c56d3c3ab1c4fe3714cef08.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=555, style=ProgressStyle(description_width=…

INFO:filelock:Lock 140003264628944 released on /root/.cache/torch/transformers/e88f38f2c8bc669ef7873de68f36bf764d4f64b9833ca8401efe271aab476745.6e56d80621e979ea15e2bdea57e8e855ff964f9c6c56d3c3ab1c4fe3714cef08.lock





INFO:filelock:Lock 140003264628944 acquired on /root/.cache/torch/transformers/1fb4b3980f6966dcb2c2e8a04794b70423fc470b65efcb692b8d796f3cae9e9e.f4565e3948d4331d7e0460adbcbdcac536e9886f24a2fad1190d6b53c231a3a3.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=265481570, style=ProgressStyle(description_…

INFO:filelock:Lock 140003264628944 released on /root/.cache/torch/transformers/1fb4b3980f6966dcb2c2e8a04794b70423fc470b65efcb692b8d796f3cae9e9e.f4565e3948d4331d7e0460adbcbdcac536e9886f24a2fad1190d6b53c231a3a3.lock





INFO:filelock:Lock 140003264628720 acquired on /root/.cache/torch/transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…

INFO:filelock:Lock 140003264628720 released on /root/.cache/torch/transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock





INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
  0%|          | 0/1 [00:00<?, ?it/s]INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000000
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:doc_span_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:tokens: [CLS] [UNK] context is this ? [SEP] [UNK] is the first context [SEP]
INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 7:0 8:1 9:2 10:3 11:4
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 7:True 8:True 9:True 10:True 11:True
INFO:simpletransformers.question_answering.question_answering_utils:input_ids: 101 100 6123 2003 2023 1029 102 100 2003 1996 2034 6123 1

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=1, style=ProgressStyle(description_wi…

Running loss: 1.658352


HBox(children=(IntProgress(value=0, description='Current iteration', max=1, style=ProgressStyle(description_wi…

Running loss: 1.994044


HBox(children=(IntProgress(value=0, description='Current iteration', max=1, style=ProgressStyle(description_wi…

Running loss: 0.165820


HBox(children=(IntProgress(value=0, description='Current iteration', max=1, style=ProgressStyle(description_wi…

Running loss: 0.072466



INFO:simpletransformers.question_answering.question_answering_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
  0%|          | 0/3 [00:00<?, ?it/s]INFO:simpletransformers.question_answering.question_answering_utils:*** Example ***
INFO:simpletransformers.question_answering.question_answering_utils:unique_id: 1000000000
INFO:simpletransformers.question_answering.question_answering_utils:example_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:doc_span_index: 0
INFO:simpletransformers.question_answering.question_answering_utils:tokens: [CLS] [UNK] context is this ? [SEP] [UNK] is the first context [SEP]
INFO:simpletransformers.question_answering.question_answering_utils:token_to_orig_map: 7:0 8:1 9:2 10:3 11:4
INFO:simpletransformers.question_answering.question_answering_utils:token_is_max_context: 7:True 8:True 9:True 10:True 11:True
INFO:simp

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

INFO:simpletransformers.question_answering.question_answering_utils:Writing predictions to: outputs/predictions_test.json
INFO:simpletransformers.question_answering.question_answering_utils:Writing nbest to: outputs/nbest_predictions_test.json
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] is the first' in 'This is the first'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] is the first context' in 'This is the first context'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] is the' in 'This is the'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK]' in 'This'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] is' in 'This is'
INFO:simpletransformers.question_answering.question_answering_utils:Unable to find text: '[UNK] later laws had a low' in 'These later


{'correct': 1, 'similar': 1, 'incorrect': 1}
{'correct_text': {'00001': 'the first'}, 'similar_text': {'00002': {'truth': 'low cost', 'predicted': 'low', 'question': 'What was the cost to society?'}}, 'incorrect_text': {'00003': {'truth': 'Bald Eagle Protection Act', 'predicted': '1929', 'question': 'What was the name of the 1937 treaty?'}}}
-------------------


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


[{'id': '0', 'answer': 'the context used for demonstrating predictions'}]
return to code folder
