In [1]:
#default_exp haystack_search

In [2]:
#export
import pprint
import numpy as np
import pandas as pd
import requests
import torch
from sklearn import metrics
from nltk import tokenize
from operator import itemgetter

from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.database.memory import InMemoryDocumentStore

from haystack.retriever.dense import EmbeddingRetriever
from haystack.utils import print_answers
from pytorch_hackathon import rss_feeds

import seaborn as sns

In [3]:
pd.set_option('max_colwidth', 100)

In [4]:
cm = sns.light_palette("green", as_cmap=True)

In [5]:
%cd ..

/home/kuba/Projects/pytorch_hackathon


In [6]:
!ls data

feeds.txt  topics.txt  zsl_feed_results.csv


In [7]:
rss_feed_urls = list(pd.read_table('data/feeds.txt', header=None).iloc[:,0].values)

In [8]:
feed_df = rss_feeds.get_feed_df(rss_feed_urls)

100%|██████████| 16/16 [00:09<00:00,  1.69it/s]


  feed_df['text'] = feed_df['summary'].apply(lambda s: bs4.BeautifulSoup(s).text)


In [9]:
#export

use_gpu = torch.cuda.is_available()

In [10]:
pretty_print = pprint.PrettyPrinter(indent=2).pprint

In [11]:
feed_df.head()

Unnamed: 0,title,title_detail,links,link,summary,summary_detail,id,guidislink,tags,text,...,published_parsed,comments,authors,author,author_detail,updated,updated_parsed,content,href,media_thumbnail
0,Guided Collaborative Training for Pixel-wise Semi-Supervised Learning,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/guided-coll...",https://paperswithcode.com/paper/guided-collaborative-training-for-pixel-wise,"Although SSL methods have achieved impressive results in image classification, the performances ...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/guided-collaborative-training-for-pixel-wise,False,"[{'term': 'Image classification', 'scheme': None, 'label': None}, {'term': 'Image denoising', 's...","Although SSL methods have achieved impressive results in image classification, the performances ...",...,,,,,,,,,,
1,PiNet: Attention Pooling for Graph Classification,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/pinet-atten...",https://paperswithcode.com/paper/pinet-attention-pooling-for-graph,"We propose PiNet, a generalised differentiable attention-based pooling mechanism for utilising g...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/pinet-attention-pooling-for-graph,False,"[{'term': 'Graph classification', 'scheme': None, 'label': None}]","We propose PiNet, a generalised differentiable attention-based pooling mechanism for utilising g...",...,,,,,,,,,,
2,Rethinking Pseudo-LiDAR Representation,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/rethinking-...",https://paperswithcode.com/paper/rethinking-pseudo-lidar-representation,"Based on this observation, we design an image based CNN detector named Patch-Net, which is more ...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/rethinking-pseudo-lidar-representation,False,,"Based on this observation, we design an image based CNN detector named Patch-Net, which is more ...",...,,,,,,,,,,
3,DensE: An Enhanced Non-Abelian Group Representation for Knowledge Graph Embedding,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/dense-an-en...",https://paperswithcode.com/paper/dense-an-enhanced-non-abelian-group,Capturing the composition patterns of relations is a vital task in knowledge graph completion. <...,"{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/dense-an-enhanced-non-abelian-group,False,"[{'term': 'Entity embeddings', 'scheme': None, 'label': None}, {'term': 'Knowledge graph complet...",Capturing the composition patterns of relations is a vital task in knowledge graph completion. C...,...,,,,,,,,,,
4,A parallel evaluation data set of software documentation with document structure annotation,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/a-parallel-...",https://paperswithcode.com/paper/a-parallel-evaluation-data-set-of-software,"This paper accompanies the software documentation data set for machine translation, a parallel e...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/a-parallel-evaluation-data-set-of-software,False,"[{'term': 'Machine translation', 'scheme': None, 'label': None}]","This paper accompanies the software documentation data set for machine translation, a parallel e...",...,,,,,,,,,,


In [12]:
tokenize.wordpunct_tokenize

<bound method RegexpTokenizer.tokenize of WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>

In [13]:
#export


def _setup_retriever(document_store, model_name, use_gpu, quantize_model):
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=model_name,
        use_gpu=use_gpu)
    if not use_gpu and quantize_model:
        quantized_model = torch.quantization.quantize_dynamic(
            retriever.embedding_model.model,
            {torch.nn.Linear}, dtype=torch.qint8
        )
    return retriever



def setup_document_store_with_retriever(
        model_name,
        df,
        text_col='text',
        max_document_length=256,
        use_gpu=use_gpu,
        quantize_model=True):
    embedding_col = text_col + '_emb'
    document_store = InMemoryDocumentStore(
        embedding_field=embedding_col,
    )
    retriever = _setup_retriever(document_store, model_name, use_gpu, quantize_model)
    
    article_embeddings = retriever.embed_queries(
        texts=[
            ' '.join(tokenize.wordpunct_tokenize(text)[:max_document_length])
            for text in df[text_col] 
        ]
    )

    df[embedding_col] = article_embeddings
    document_store.write_documents(df.to_dict(orient='records'))
    return document_store, retriever

In [14]:
article_texts = feed_df['text']

In [15]:
model_name = "deepset/sentence_bert"
document_store, retriever = setup_document_store_with_retriever(model_name, feed_df, 'text', use_gpu)

08/13/2020 21:33:22 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model deepset/sentence_bert
08/13/2020 21:33:22 - INFO - farm.utils -   device: cuda n_gpu: 1, distributed training: False, automatic mixed precision training: None
08/13/2020 21:33:22 - INFO - farm.infer -   Could not find `deepset/sentence_bert` locally. Try to download from model hub ...
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
08/13/2020 21:33:30 - INFO - farm.utils -   device: cuda n_gpu: 1, distributed training: False, automatic mixed precision training: None
Inferencing Samples: 100%|██████████| 74/74 [00:10<00:00,  7.01 Batches/s]


In [16]:
feed_df.iloc[1]['text']

'We propose PiNet, a generalised differentiable attention-based pooling mechanism for utilising graph convolution operations for graph level classification. Code: https://github.com/meltzerpete/PiNet'

In [17]:
#export


def doc_to_dict(doc):
    d = {}
    d['text'] = doc.text
    d['title'] = doc.meta['title']
    d['score'] = doc.query_score
    return d

In [18]:
topic_strings = pd.read_table('data/topics.txt', header=None).iloc[:,0].values

In [19]:
print('\n'.join(topic_strings))

deep learning
natural language processing
computer vision
statistics
implementation
visualization
industry
software engineering
reddit question
arxiv
cloud computing
deployment
competitions
business
business intelligence


In [20]:
topic_query_strings = [
    'text is about {}'.format(topic)
    for topic in topic_strings
]

In [21]:
raw_results = retriever.retrieve(
    topic_query_strings[1]
)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.67 Batches/s]


In [39]:
#export


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def get_scored_df(retriever, raw_results, topic_strings):
    topic_query_strings = [
        'text is about {}'.format(topic)
        for topic in topic_strings
    ]
    
    results = [
        doc_to_dict(doc)
        for doc in raw_results 
    ]
    result_embeddings = np.array([
        doc.meta['text_emb']
        for doc in raw_results
    ]).astype('float32')
    topic_query_embeddings = np.array(retriever.embed_passages(
        list(topic_strings)
    )).astype('float32')
    
    scores_df = pd.DataFrame({})
    scores_df['title'] = list(map(itemgetter('title'), results))
    scores_df['text'] = list(map(itemgetter('text'), results))

    scores = pd.DataFrame(metrics.pairwise.cosine_similarity(
        result_embeddings,
        topic_query_embeddings
    ))
    scores.columns = topic_strings

    scores_df = pd.concat(
        [scores_df, sigmoid(scores)],
        axis=1
    )
    return scores_df

In [40]:
scores_df = get_scored_df(retriever, raw_results, topic_strings)

Inferencing Samples: 100%|██████████| 4/4 [00:00<00:00,  7.14 Batches/s]


In [41]:
scores_df.style.background_gradient(cmap=cm)

Unnamed: 0,title,text,deep learning,natural language processing,computer vision,statistics,implementation,visualization,industry,software engineering,reddit question,arxiv,cloud computing,deployment,competitions,business,business intelligence
0,Using Julia to Do Whole Word Masking,"Syntax almost as friendly as Python, while running up to 100x fasterPhoto Credit(This post was originally published on my personal blog.)IntroductionIn my last post, [Failure Report] Distill Fine-tuned Transformers into Recurrent Neural Networks, I tried to distill the knowledge of a fine-tuned BERT model into an LSTM or GRU model without any data augmentation and failed to achieve satisfiable results. In the follow-up works, I tried to replicate the easies-to-implement augmentation method — masking — used in [1] and see its effect. The masking described in [1] is called “whole word masking” [2], that is, masking the whole word instead of just masking a single word piece.It is non-trivial to implement whole word masking, as it would require the sampling process to be aware of which word piece is itself a whole word, and which is part of a word. As you may know, doing text processing in pure Python is quite slow comparing to other compiled languages. I recently picked up the Julia programming language, which promises the flexibility of scripting languages and the speed of compiled languages, and thought that it was a good opportunity to test Julia in the field.This post describes the Julia code I wrote for this task and shows that for this specific task the Julia code is as simple to write as Python, while runs up to 100x faster than its pure Python counterpart.The AlgorithmThis is the algorithm I used to do whole word masking (given that the examples are already tokenized to word pieces):For each example, mark all the word pieces that are either a whole word or the first piece of a word (by using a mask).Randomly sample N marked pieces for each example (N is a hyper-parameter).Replacing the selected pieces with “[MASK]“.Check if the next piece is a part of this word (tokens start with “##” in BERT tokenizer). If so, also replace it with “[MASK]“.Repeat step 4 until the condition is false or the end of the example is reached.BenchmarksNotebook used in this section:PythonJuliaSummary(Comparing the mean run time here as the %timeit magic doesn’t provide the median run time.)Tokenizing examples:15 seconds (shared by both Python and Julia Pipeline)Adding Special Tokens:Python: 42 ms (estimated)Julia: 41 msMarking First PiecesPython: 326 msJulia: 47 ms (single-threaded)Julia: 39 ms (multi-threaded)Sample One Word to MaskPython: 8.2 s (using Numpy.random.choice)Julia: 69 msMaskingPython: 725 ms (copying the examples)Julia: 426 ms (copying the examples)Python: 300 ms (estimated)Julia: 10 msRemarksThe most time-consuming part is tokenizing the examples. So in reality optimizing the tokenizer has the most potential (That’s why huggingface has re-implemented the word-piece tokenizers in Rust).But the eight seconds saved on sampling by switching to Julia is also a significant improvement, and just took a few lines to implement.Copying the examples takes around 300 to 500 ms, and is the most expensive operation besides tokenization. So try to avoid it if possible. (If you need the augment the same dataset multiple times, you have no choice to copy the examples.)Adding Special TokensA simple operation that adds “[CLS]” to the head and “[SEP]” to the tail. Python and Julia are equally fast in this one.Pythondef add_special_tokens(sentence): sentence.insert(0, ""[CLS]"") sentence.append(""[SEP]"")tmp = deepcopy(sentences)for sentence in tmp: add_special_tokens(sentence)Juliafunction add_special_tokens!(sentence) pushfirst!(sentence, ""[CLS]"") push!(sentence, ""[SEP]"")endtmp = deepcopy(sentences)results = add_special_tokens!.(tmp)Marking First PiecesCreate binary masks to filter out word piece that is not the first word piece of a word. Julia is starting to outperform Python.Pythondef is_first_piece(tokens): return [not token.startswith(""##"") for token in tokens]first_piece_masks = [is_first_piece(sent) for sent in sentences]JuliaVectorized (single-thread) version:function is_first_piece(arr::Array{String,1}) return .!startswith.(arr, ""##"")endresults = is_first_piece.(sentences)A multi-thread version is also provided, which can sometimes be faster depending on your hardware:results = [Bool[] for _ in 1:length(sentences)]Threads.@threads for i in 1:length(sentences) results[i] = is_first_piece(sentences[i])endSamplingRandomly sample one word from each example to be masked. Since I can’t think of any simple way to vectorized this in Python, a naive for-loop approach is used. Vectorizing in Julia, on the other hand, is fairly straight-forward. As a result, the Julia version is vastly faster (100x) than the Python one.Note: I used Numpy in the Python implementation, so it’s not really “pure python” in this case.Pythondef sample(first_piece_masks, n=1): results = [] for mask in first_piece_masks: if sum(mask) <= n: results.append([]) continue probabilities = np.asarray(mask) / float(sum(mask)) results.append(np.random.choice(np.arange(len(mask)), size=n, p=probabilities)) return resultsmasking_points = sample(first_piece_masks)Juliausing StatsBasefunction sample_mask_position(first_piece_mask, n=1) if sum(first_piece_mask) <= n return Int64[] end return sample(1:length(first_piece_mask), Weights(first_piece_mask), n, replace=false)endmasking_points = sample_mask_position.(first_piece_masks)MaskingFull word masking. This one inevitably has to use some loop to scan the example. For loops are not a problem for Julia, so the Julia version is much faster (30x) than Python.The implementation presented here copies the examples inside the function so the original examples can be augmented multiple times.Pythondef masking(rows, first_piece_masks, masking_points): augmented_rows = deepcopy(rows) for idx in range(len(masking_points)): for pos in masking_points[idx]: augmented_rows[idx][pos] = ""[MASK]"" while pos +1 < len(first_piece_masks[idx]) and first_piece_masks[idx][pos + 1] == 0: pos += 1 augmented_rows[idx][pos] = ""[MASK]"" return augmented_rowsaugmented_sentences = masking(sentences, first_piece_masks, masking_points)Juliafunction masking(rows::Vector{Vector{String}}, first_piece_masks::Vector{Vector{Bool}}, masking_points::Vector{Vector{Int64}}) augmented_rows = deepcopy(rows) for idx in 1:length(masking_points) for pos in masking_points[idx] augmented_rows[idx][pos] = ""[MASK]"" while pos + 1 <= length(first_piece_masks[idx]) && first_piece_masks[idx][pos + 1] == 0 pos += 1 augmented_rows[idx][pos] = ""[MASK]"" end end end return augmented_rowsendaugmented_sentences = masking(sentences, first_piece_masks, masking_points)ConclusionThis is the first time I integrate Julia in an NLP pipeline, and the results are encouraging. The easy of development of Julia is on the same level as Python, but the is on a totally different level. In this example, the most improvement in speed comes from the sampling process, but it only represents less than 40 % of the total run time. And the total run time in Python is relatively short. I look forward to seeing what kind of speedup Julia can bring in bigger datasets or more complicated tasks.(The notebook actually used in the pipeline).ReferencesTang, R., Lu, Y., Liu, L., Mou, L., Vechtomova, O., & Lin, J. (2019). Distilling Task-Specific Knowledge from BERT into Simple Neural Networks.BERT: New May 31st, 2019: Whole Word Masking ModelsUsing Julia to Do Whole Word Masking was originally published in Veritable on Medium, where people are continuing the conversation by highlighting and responding to this story.",0.539542,0.630511,0.533587,0.557276,0.572191,0.569392,0.542997,0.574472,0.551468,0.531546,0.515421,0.542184,0.506937,0.527759,0.54328
1,Transfer Learning with KERAS,"Learning is a never-ending process, but it’s more important to use previously gained knowledge in a new experiment.Continue reading on Towards AI — Multidisciplinary Science Journal »",0.696341,0.595428,0.577528,0.566244,0.623856,0.597831,0.579651,0.589617,0.514814,0.502834,0.537744,0.560929,0.568218,0.559038,0.55649
2,How PyTorch Lightning became the first ML framework to runs continuous integration on TPUs,Learn how PyTorch Lightning added CI tests on TPUsContinue reading on PyTorch »,0.676328,0.595533,0.567742,0.567639,0.638649,0.599192,0.58586,0.589025,0.528871,0.514884,0.531069,0.567574,0.562189,0.560442,0.553597
3,Textual Description for Mathematical Equations,Reading of mathematical expression or equation in the document images is very challenging due to the large variability of mathematical symbols and expressions. Code: https://github.com/ajoymondal/Equation-Description-PyTorch,0.58575,0.561175,0.529714,0.54256,0.564061,0.554569,0.550492,0.519143,0.529417,0.536874,0.50922,0.539236,0.525885,0.546909,0.525588
4,[Notes] “Statistical Inference Enables Bad Science; Statistical Thinking Enables Good Science”,"Reading the Article by Christopher Tong on The American Statistician Volume 73, 2019Photo Credit(This is a republication of this post on my personal blog.)This article by Christopher Tong has got a lot of love from people I followed on Twitter, so I decided to read it. It was very enlightening. But to be honest, I don’t fully understand quite a few arguments made by this article, probably because I lack the experience of more rigorous scientific experiments and research. Nonetheless, I think writing down the parts I find interesting and put it into a blog post would be beneficial for myself and other potential readers. Hopefully, it makes it easier to reflect on these materials later.This article argues that instead of relying on the statistical inference on an isolated study, we should use guide scientific research of all kinds by statistical thinking, and validate claims by replicating and predicting finds in new data and new settings.Replicating and predicting findings in new data and new settings is a stronger way of validating claims than blessing results from an isolated study with statistical inferences.Let’s see the reasoning behind this claim.IntroductionFirst, Tong makes clear what “statistical inferences” are:Statistical inferences are claims made using probability models of data generating processes, intended to characterize unknown features of the population(s) or process(es) from which data are thought to be sampled. Examples include estimates of parameters such as the population mean (often attended by confidence intervals), hypothesis test results (such as p-values), and posterior probabilities.Some of the widely used tools in statistical inference has come under fire recently for being misused or abused (as in The ASA’s Statement on p-Values: Context, Process, and Purpose)Among these criticisms, McShane and Gelman (2017) succinctly stated that null hypothesis testing “was supposed to protect researchers from over-interpreting noisy data. Now it has the opposite effect.”Tong tries to distinguish exploratory and confirmatory objectives of a study. He argues that most scientific research tends to be exploratory and flexible, but the statistical inference is only suitable in a confirmatory setting where study protocol and statistical model are fully prespecified.We shall argue that these issues stem largely from the Optimism Principle (Picard and Cook 1984)that is an inevitable byproduct of the necessarily flexible data analysis and modeling work that attends most scientific research.And the lack of this distinction in the current use of inferential methods in science has enabled biased statistical inference and encouraged a Cult of the Isolated Study that short-circuits the iterative nature of research.Statistical Inference and the Optimism PrincipleAs Efron and Hastie stated in their new book “Computer-Age Statistical Inference: Algorithms, Evidence, and Data Science”:It is a surprising, and crucial, aspect of statistical theory that the same data that supplies an estimate can also assess its accuracy.I had similar doubts when receiving traditional statistics education. The bias and variance tradeoff is mentioned but it is generally up to use to decide where to draw the line. The cross-validation is clearly a more principled and objective approach. (See the Breiman’s classic paper “Statistical Modeling: The Two Culture”)As Harrell, F. E., Jr. (2015) observed:Using the data to guide the data analysis is almost as dangerous as not doing so.Essentially, when researchers devise their analysis approach based on the data, it creates a chance to overfit the data. Simmons, Nelson, and Simonsohn (2011) called these opportunities researcher degrees of freedom, and when abused to fish for publishable p-values, p-hacking.The resulting inferences from the final model tend to be biased, with uncertainties underestimated, and statistical significance overestimated, a phenomenon dubbed the Optimism Principle by Picard andCook (1984).In extreme cases, nonsense data can still seem to make sense.In other words, it is possible to obtain a seemingly informative linear model, with decent R² and several statistically significant predictor variables, from data that is utter nonsense. This finding was later dubbed “Freedman’s paradox” (Raftery, Madigan, and Hoeting 1993).This kind of bias would lead to an underestimation of the uncertainty because we picked the model that has fit the training data best.Chatfield (1995) used the term model selection bias to describe the distorted inferences that result when using the same data that determines the form of the final model to also produce inferences from that model.Exploratory and Confirmatory Objectives in Scientific ResearchThe obvious way to avoid the difficulties of overfitting and produce valid statistical inferences is to completely prespecify the study design and statistical analysis plan prior to the start of data collection.Tong uses the phased experimentation of medical clinical trials as an example of scientific research where exploratory/confirmatory distinction is clearly made.This framework helps to separate therapeutic exploratory (typically Phase II) with therapeutic confirmatory (typically Phase III) objectives.It doesn’t prevent the expensive clinical dataset to be used for further exploratory work — to generate hypotheses for further testing in later experiments.A succinct perspective on such inferences is given by Sir Richard Peto, often quoted (e.g., Freedman 1998) as saying “you should always do subgroup analysis and never believe the results.”And it doesn’t mean the result from exploratory studies shouldn’t be published.If the result is important and exciting, we want to publish exploratory studies, but at the same time make clear that they are generally statistically underpowered, and need to be reproduced.From the Cult of the Isolated Study to TriangulationThe treatment of statistical inferences from exploratory research as if they were confirmatory enables what Nelder (1986) called The Cult of the Isolated Study, so that The effects claimed may never be checked by painstaking reproduction of the study elsewhere, and when this absence of checking is combined with the possibility that the original results would not have been reported unless the effects could be presented as significant, the result is a procedure which hardly deserves the attribute ‘scientific.Simple replication is usually not sufficient. Tong uses the Wright Brothers as a demonstrative example.Munafo and Davey Smith (2018) define triangulation as “the strategic use of multiple approaches to address one question. Each approach has its own unrelated assumptions, strengths, and weaknesses. Results that agree across different methodologies are less likely to be artifacts.”The notorious example of the report by the OPERA collaboration shows the importance of triangulation to uncover systematic errors.A particular weakness of the Isolated Study is that systematic errors may contaminate an entire study but remain hidden if no further research is done.Technical Solutions and Their DeficienciesThe most widely known class of such methods is based on adjusting for multiple inferences. These range from the simple Bonferroni inequality to the modern methods of false discovery rate and false coverage rate (e.g., Dickhaus 2014).A second class of methods incorporates resistance to overfitting into the statistical modeling process, often through an optimization procedure that penalizes model complexity, an approach sometimes called regularization.Tong also indicates that random splitting is still not the perfect solution.Unfortunately, such procedures (or their variants) are still vulnerable to the Optimism Principle, because random splitting implies that “left-out” samples are similar to the “left-in” samples (Gunter and Tong 2017).So it’s better to collect more data to overcome model uncertainty:Obtaining “more than one set of data, whenever possible, is a potentially more convincing way of overcoming model uncertainty and is needed anyway to determine the range of conditions under which a model is valid” (Chatfield 1995).Tong also discussed another widely advocated solution — model averaging. Those who are familiar with Kaggle competitions should already have a firm grasp on this.Only through the iterative learning process, using multiple lines of evidence and many sets of data, can systematic error be discovered, and model refinement be continually guided by new data.More Thoughtful SolutionsOne strategy requires preregistering both the research hypotheses to be tested and the statistical analysis plan prior to data collection, much as in a late-stage clinical trial (e.g., Nosek et al. 2018).However, the fact that most scientific research cannot fit the above paradigm is a big problem. A more realistic approach is preregistered replication.A variation on this theme is preregistered replication, where a replication study, rather than the original study, is subject to strict preregistration (e.g., Gelman 2015). A broader vision of this idea (Mogil andMacleod 2017) is to carry out a whole series of exploratory experiments without any formal statistical inference, and summarize the results by descriptive statistics (including graphics) or even just disclosure of the raw data.Enabling Good ScienceTong adapts a taxonomy of statistical activity by Cox (1957) and Moore (1992):Data production. The planning and execution of a study (either observational or experimental).Descriptive and exploratory analysis. Study the data at hand.Generalization. Make claims about the world beyond the data at hand.The first step of statistical thinking is to understand the objective of the study, its context, and its constraints, so that planning for study design and analysis can be fit for purpose.Data ProductionFeller (1969) pronounced that “The purpose of statistics in laboratories should be to save labor, time, and expense by efficient experimental designs” rather than null hypothesis significance testing.Tong discusses a few experiment design techniques that should already be familiar to those who have taken formal statistics education. He also raises some practical concerns when conducting the experiment and its analysis.Data acquisition and storage systems should have appropriate resolution and reliability. (We once worked with an instrument that allowed the user to retrieve stored time series data with a choice of time-resolution. Upon investigation, we found that the system was artificially interpolating data, and reporting values not actually measured, if the user chose a high resolution.)And other research degrees of freedom that is related to decisions around experiment design:Other researcher degrees of freedom can affect study design and execution. An instructive example for the latter is the decision to terminate data collection. Except in clinical trials, where this decision is tightly regulated and accounted for in the subsequent analysis (e.g., Chow and Chang 2012), many researchers have no formal termination rule, stopping when funding is exhausted, lab priorities shift, apparent statistical significance is achieved (or becomes clearly hopeless), or for some other arbitrary reason, often involving unblinded interim looks at the data.Data DescriptionMoses (1992)warned us that Good statistical description is demanding and challenging work: it requires sound conceptualization, and demands insightfully organizing the data, and effectively communicating the results; not one of those tasks is easy. To mistakenly treat description as ‘routine’ is almost surely to botch the job.Theory of Description:Mallows (1983) provided an interesting perspective on a Theory of Description. He noted that “A good descriptive technique should be appropriate for its purpose; effective as a mode of communication, accurate, complete, and resistant.”Something like Tukey’s (1977) five number summary (the minimum, first quartile, median, third quartile, and maximum) can be helpful to describe the variability of the data.Though we might not quantify uncertainty using probability statements, we can attempt to convey the observed variability of the data at hand, while acknowledging that it does not fully capture uncertainty… However, the use of such data summaries is not free of assumptions (e.g., unimodality, in some cases symmetry), so they are descriptive only in relation to these assumptions, not in an absolute sense.Disciplined Data ExplorationAccording to Tukey (1973), exploratory analysis of the data is not “just descriptive statistics,” but rather an “actively incisive rather than passively descriptive” activity, “with a real emphasis on the discovery of the unexpected.”An example of how exploratory analysis may be essential for scientific inquiry is in the detection of and adjustment for batch effects.Leek et al. (2010) defined batch effects as “sub-groups of measurements that have qualitatively different behavior across conditions and are unrelated to the biological or scientific variables in a study.”Tong also cites the warning of Diaconis (1985) about the danger of undisciplined exploratory analysis.If such patterns are accepted as gospel without considering that they may have arisen by chance, he considers it magical thinking, which he defines as“our inclination to seek and interpret connections and events around us, together with our disinclination to revise belief after further observation.”Statistical ThinkingStatistical thinking begins with a relentless focus on fitness for purpose (paraphrasing Tukey 1962: seeking approximate answers to the right questions, not exact answers to the wrong ones), sound attitudes about data production and its pitfalls, and good habits of data display and disciplined data exploration.Statistical thinking also involves a keen awareness of the pitfalls of data analysis and its interpretation, including:The correlation versus causation fallacy.The distinction between interpolation and extrapolation.The distinction between experimental and observational data.Regression to the mean.Simpson’s paradox, and the ecological fallacy.The curse of dimensionalityDiscussionThere is no scientifically sound way to quantify uncertainty from a single set of data, in isolation from other sets of data comprising an exploratory/learning process. This brings to mind an observation made about certain research in materials science: “Even if the studies had reported an error value, the trustworthiness of the result would not depend on that value alone” (Wenmackers and Vanpouke 2012). By emphasizing principles of data production, data description, enlightened data display, disciplined data exploration, and exposing statistical pitfalls in interpretation, there is much that statisticians can do to ensure that statistics is “a catalyst to iterative scientific learning” (Box 1999).[Notes] “Statistical Inference Enables Bad Science; Statistical Thinking Enables Good Science” was originally published in Veritable on Medium, where people are continuing the conversation by highlighting and responding to this story.",0.58575,0.561175,0.529714,0.54256,0.564061,0.554569,0.550492,0.519143,0.529417,0.536874,0.50922,0.539236,0.525885,0.546909,0.525588
5,A generic pipeline to make offline inferences,"Data Science has been at the core of Doctrine.fr since the beginning. As such, a lot of effort has been put into developing data science models and industrializing them using pipelines tailored to provide quality answers to business questions. This article focuses on the industrialization part, as we detail how we are doing offline inferences using our models. For insights about our online inference system, you can refer to our article about a dedicated API for machine learning.More precisely, this article describes the generic logic we have developed in order to make the inference step straightforward such that we only have to focus on the specific processing for a given task.Technical stackBefore going into the details of our generic pipeline, here’s some context with a small summary of the technical stack used in our data science projects.These projects are coded in Python.Data is mainly queried from a PostgreSQL database (we are also using AWS S3 but less frequently).Different versions of a model are stored on AWS S3.Offline tasks are scheduled and run using Airflow.Global overview of the generic pipelineLet’s say we have a production-ready model which is trained to predict the structure of a decision (it predicts a label for each paragraph of a decision) and we want to apply it to our corpus of decisions. If you want to know how we trained this model, you can read our article about structuring legal document through deep learning.We are using the example of predicting classes on decisions, but it could be inference of any type of model on any type of content, which emphasizes the need for a truly generic data pipeline. This is how the generic logic works for this specific use case, steps 3 to 5 are parallelizable:Query decisions to be processed through the unique ID we have for each decision at DoctrineSeparate those IDs in batches of N decisions for multiprocessing purposesFetch the data needed for those ids (e.g. contents of the decision)Process each decision (its contents) and apply the modelInsert inferred results in the databaseEvery single one of the steps laid out above is implemented in a generic python class we have called Project.Generic pipeline in Project classThis class also includes other things like connectors to database, connectors to Elasticsearch, loaders for models on AWS S3 or management of asynchronous operations . Those technical components are of tremendous value and help us avoid fragmentation and duplication, but fall outside the scope of this article, so we will not go into further details about them.This is a simplified implementation of thisProject class:https://medium.com/media/d7a491a98a3a96431962f1ddc88880d3/hrefThe different steps of the pipeline are launched in the run method of Project. All new data science projects thus inherit from theProject class and overload its get_rows, process_element and insert_results methods. In the next section, we will also go into more detail for get_ids method, which is used to get the identifiers of documents to process, in this use case decisions. So, if we apply it here to our task of structure prediction on decisions, our data script looks like this:https://medium.com/media/af8810ead74c16c99588508ad7dce52c/hrefLet’s now dig a bit more into what each step of the pipeline is doing.Identify elements to processThe first step of our pipeline is to query identifiers of documents we want to process, which is done with get_ids method. In our example we're dealing with court decisions, so let's suppose we have the identifiers stored in a PostgreSQL table called decisions. This table also contains data like the content of the decision on which we want to apply our model of classification.The table decisions is defined with:https://medium.com/media/8f5d00a5a1823072ce57ebf6be7cc060/hrefThis first step highlights a first natural need: we have to store information about which decisions have been processed and when they have been processed.Storing this information has several purposes:It is used for debugging, knowing when a model has given a label for a decision can be very helpful if we detect errors in predictionsIt is used to only process new decisions and not those already processedIf the prediction script fails for any reason, we can rerun the script from the decisions where it stoppedHow do we store this information?We simply use a table in our PostgreSQL database to do it. We have a schema named operation_states to store tables of this type in our database. And this is how the structure of the table is defined:https://medium.com/media/aad1a6a7c472c8887b792a033b5fe503/hrefWe store the ID of the decision with the date of the first and the last time the model has made a prediction on it.How do we select the IDs for reprocessing?We need the first and last time we have made a prediction for a given decision because we can make predictions several times in the lifetime of a decision. There are two main reasons why we would make several structure predictions for a decision:Since a given decision is fetched from multiple external sources at Doctrine, it sometimes happens that the metadata or actual contents for a given decision get updated several times in its lifetime. When that happens, the model has to make a new prediction.A new, better model has been trained and we want to apply it again on decisions.The first and naive solution we have found to automatically reprocess decisions is to only process a fraction of them every day. But with this solution, in most of the cases, we are processing decisions which did not change since the last computation. It was a waste of time and resources.Hence we have thought of a more efficient solution. It is clear that we need to focus on modified content only. We need to store information about when a decision has been modified. For this purpose, we are still using a PostgreSQL table, which stores the last modification date about the decision (date of the content change, date of the metadata change etc…). That’s why we are storing those kind of tables in a schema called modification_states. The structure of the table looks like this:https://medium.com/media/1876796260677d5ec492e9a0ee0fcefd/hrefHow is the modification information updated?The modification information is obtained from the decision loading scripts. In those scripts, the content or metadata of a decision are compared to the existing ones using a hash function. If there should be a difference, we have rules to determine which content to keep, and if the kept content is new then the script updates the updated_at field of modification_states.decisions_modified_at table for this decision.Finally, the method get_ids takes as input decisions, operation_states.decisions_classified_atand modification_states.decisions_modified_attables in order to select decisions to be processed.Selection of IDs to be processedFor example, this is what the SQL query in get_ids looks like as we want to get the new decisions and the decisions which have been modified:https://medium.com/media/f21aa80e205ca8ab876ab56e55dc443a/hrefActually, in our Project class, we have implemented a generic get_ids function which takes input table names as arguments among other arguments. An interesting argument is stock, when True it selects the entire list of decisions, in the case we want to apply a new model on the whole corpus of decisions.In order to use these IDs, we split them into batches of N elements to be processed, because the subsequent steps are fully parallelizable. The parallelization is done using python’s multiprocessing package inside the Project.run method.Query data to be processedHaving a batch of decisions to process, through their ids, we now have to get useful information about the decisions. We want to predict a class for each paragraph of the decision, therefore we need to have the contents of the decision.The get_rows method is simply about querying this information for the chosen decisions from the previous step. It actually translates into simple SQL query:https://medium.com/media/1c261465729f18a91e25449fc67a050f/hrefWhere science happensWe have queried information required for the batch of decisions, we now need to apply the logic specific to the task of predicting a class for each paragraph of a decision. The logic of the task is implemented in the process_element method. This method takes one decision as input and includes preprocessing on the raw contents of the decision (lowercase, stemming, etc.), separation of the content into paragraphs and use of the model to make a prediction for each of them. Finally, the method returns predictions as outputs.In addition to the fact that we can parallelize several batches of decisions, we can also do it inside a single batch by doing asynchronous operations when we are requesting different services which can be done simultaneously (Database, ElasticSearch index, etc.). This is not presented in our simplified implementation of the Project class, but in practice it is based on the asyncio package. In our example, we could take advantage of asynchronous processings because the limiting resources could be the database or the available CPUs, however in practice it processes in reasonable time, such that we did not need to leverage the asynchronous part.Insert resultsFinally, having our predictions, the last step of our pipeline is to insert data in our database using the insert_results method.At this step, we are inserting several pieces of data:Results of the predictions in a defined tableProcessed IDs in the operation_states.decisions_classified_at table to store the information about which decisions have been processed and whenThe insert_results can also deal with data comparison before insertion. Typically, if we make a prediction for a decision because the content has changed for example, we compare the prediction against the previous one, and we only update data if the prediction has changed. It optimizes resource consumption, as aSELECT is more efficient than a DELETE followed by INSERT in PostgreSQL. The comparison can be done using some hash functions (PostgreSQL has an implementation of MD5 algorithm).ConclusionUsing this generic pipeline, we have removed a lot of the redundant code in the productionization of a model such that we only have to focus on some limited aspects. We took the example of inferring classes for decisions using a model, but this pipeline can deal with any kind of content and any kind of processing (inference using models, text processing etc.).Thanks to Pauline Chavallard, Bertrand Chardon and Nicolas Fiorini for their valuable feedbacks.A generic pipeline to make offline inferences was originally published in Inside Doctrine on Medium, where people are continuing the conversation by highlighting and responding to this story.",0.583226,0.596984,0.615735,0.660321,0.643634,0.641581,0.584964,0.587327,0.562529,0.558694,0.578877,0.606548,0.583824,0.563006,0.592717
6,"[D] Data Augmentation using Pre-Trained Transformers (BERT, GPT, etc) | Research Paper Walkthrough","Data augmentation is a widely used technique to increase the size of the training data. It helps in significatly increasing the diversity of data available for training models resulting in reducing over fitting and enhancing robustness of ML model, without actually collecting new data. In this video we will understand how we can use Transformers to do augmentation in NLP. 🔥 Check out at - https://youtu.be/9O9scQb4sNo Original Paper - https://arxiv.org/abs/2003.02245 Also check out, Easy Data Augmentation in NLP - https://youtu.be/-1unNLkwImw Feel free to share your thoughts 👍 submitted by /u/prakhar21 [link] [comments]",0.583226,0.596984,0.615735,0.660321,0.643634,0.641581,0.584964,0.587327,0.562529,0.558694,0.578877,0.606548,0.583824,0.563006,0.592717
7,Tensorflow Profiler with Custom Training Loop,"Analyze and Optimize TensorFlow Performance on GPUPhoto Credit(This article was first published on my personal blog.)IntroductionThe Tensorflow Profiler in the upcoming Tensorflow 2.2 release is a much-welcomed addition to the ecosystem. For image-related tasks, often the bottleneck is the input pipeline. But you also don’t want to spend time optimizing the input pipeline unless it is necessary. The Tensorflow Profiler makes pinpointing the bottleneck of the training process much easier, so you can decide where the optimization effort should be put into.An Input-Bound Example. SourceThe official documentation demonstrates how to use the profiler with the Keras interface via a callback(tf.keras.callbacks.TensorBoard). However, there are no mentions of custom training loops. I did some research and came up with a working solution, which will be described in this post, along with some obstacles I had met and how I overcame them.PreparationInstall the Latest Tensorflow and the Profiler PluginThis comes directly from the documentation:# Uninstall twice to uninstall both the 1.15.0 and 2.1.0 version of TensorFlow and TensorBoard.pip uninstall -y -q tensorflow tensorboardpip uninstall -y -q tensorflow tensorboardpip install -U -q tf-nightly tb-nightly tensorboard_plugin_profile(This will no longer be required once Tensorflow and TensorBoard 2.2 are released)NVIDIA GPU Libraries(This section is for training on a single GPU. For training on multiple GPUs, please refer to this guide.)You’ll need to install NVIDIA GPU drivers and CUDA Toolkit as you normally do when training models on GPU.The next step is more specifically for the profiler. First, make sure that CUPTI 10.1 exists on the path (source):/sbin/ldconfig -N -v $(sed 's/:/ /g' <<< $LD_LIBRARY_PATH) | grep libcuptiIf not, update the LD_LIBRARY_PATH environment variable:export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATHTroubleshoot: CUPTI_ERROR_INSUFFICIENT_PRIVILEGESYou’ll likely see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES and CUPTI_ERROR_INVALID_PARAMETER errors in the log when trying to profile your model. This is because NVIDIA GPU performance counters, when running on one of the newer drivers, is only available to system administrators.Please read this document from NVIDIA to find a solution to your system.For my Linux system, the recommended modprobe nvidia NVreg_RestrictProfilingToAdminUsers=0 does not work. An alternative solution, which writes a file to /etc/modprobe.d, works for me. It is also offered in this Github thread:Adding options nvidia ""NVreg_RestrictProfilingToAdminUsers=0"" to /etc/modprobe.d/nvidia-kernel-common.conf and reboot should resolve the permission issue.Profile the Training LoopThis guide(Profile Tensorflow performance) describes four ways to collect performance data. One of them is specific to Keras interface. Another one(sampling mode) is interactive through Tensorboard web UI. I’ll describe the two that works programmatically and are compatible with custom training loops.Using tf.profiler Function API:tf.profiler.experimental.start('logdir')# Train the model heretf.profiler.experimental.stop()2. Using Context Manager:with tf.profiler.experimental.Profile('logdir'): # Train the model here passThere is one additional way. By reading the source code of the Keras Tensorboard callback, I reconstructed the Tensorflow Profiler part in the callback as:from tensorflow.python.profiler import profiler_v2 as profilerprofiler.warmup()profiler.start(logdir='logdir')# Train the model hereprofiler.stop()A Working ExampleHere is an example that trains an Efficientnet-B3 model and collect performance data using two different ways(with no obvious differences in results):overview_pageNotice that the Device Compute Precisions indicates that 87.6% of the GPU time was spent in 16-bit computation, showing that the mixed-precision training is configured correctly. Judging from the graph, the GPU is well fed with basically no time spent on waiting for input (I enabled prefetch in the data pipeline, so this tells us that it hadn’t run out of the prefetched batches).The input_pipeline_analyzer page shows that most time on host(CPU side) is spent on data preprocessing, so disk IO doesn’t seem to be a problem:input_pipeline_analyzerThe kernel_stats page shows that 25% of the time is spent on SwapDimension1And2InTensor3UsingTiles. I’m not sure swapping dimensions should take up that much time (doesn’t seem so). Some more research is required to answer that. The page also provides a helpful indication of whether an Op is Tensor Core eligible and whether Tensor Cores were actually used:kernel_statsThe notebook used (I used my library tf-helper-bot to wrap my custom training loop in a Fast.ai-inspired API. ):https://medium.com/media/50af2c437d3da78615887a96ce5f916b/hrefConclusionThanks for reading! Hopefully this post shows to you that Tensorflow Profiler is a powerful and easy-to-use tool (once you overcome the installation hurdles) that can potentially save you tons of time.This post only covers part of the profiler capabilities. There are a lot of things I don’t fully understand yet. The profiling report should give you some sense of where to look. I’d love to know if you found any other interesting resources on this topic (leave a comment!).Tensorflow Profiler with Custom Training Loop was originally published in Veritable on Medium, where people are continuing the conversation by highlighting and responding to this story.",0.588091,0.59769,0.601689,0.599742,0.619534,0.63645,0.569693,0.589725,0.570566,0.572293,0.524144,0.57,0.568478,0.534849,0.563137
8,HOLMES: Health OnLine Model Ensemble Serving for Deep Learning Models in Intensive Care Units,HOLMES is tested on risk prediction task on pediatric cardio ICU data with above 95% prediction accuracy and sub-second latency on 64-bed simulation. Code: https://github.com/hsd1503/HOLMES,0.560064,0.594392,0.559752,0.530292,0.56094,0.542139,0.546098,0.573157,0.552135,0.54459,0.530991,0.522977,0.529952,0.543969,0.55875
9,[Notes] Training Question Answering Models From Synthetic Data,"Generate questions and answers from both real and synthetic contextsPhoto Credit(This post was originally published on my personal blog.)Preamble“Training Question Answering Models From Synthetic Data” is an NLP paper from Nvidia that I found very interesting. Question and answer(QA) data is expansive to obtain. If we can use the data we have to generate more data, that will be a huge time saver and create a lot of new possibilities. This paper shows some promising results in this direction.Some caveats:We need big models to be able to get decent results. (The paper reported question generation models with the number of parameters from 117M to 8.3B. See the ablation study in the following sections.)Generated QA data is still not at the same level as the real data. (At least 3x+ more synthetic data is needed to reach the same level of accuracy.)There are a lot of contents in this paper, and it can be a bit overwhelming. I wrote down parts of the paper that I think is most relevant in this post, and hopefully, it can be helpful to you as well.MethodComponentsThere are three or four stages in the data generation process. Each stage requires a separate model:Stage 0 [Optional] — Context generation: The SQuAD 1.1 training data were used to train the following three stages (Figure 2 below). But when testing/generating, we can choose to use real Wikipedia data or use a model to generate Wikipedia-like data.Stage 1 — Answer Generation: A BERT-style model to do answer extraction from the given context. The start and the end of the token span are jointly sampled.Stage 2 — Question Generation: Fine-tuned GPT-2 model to generation question from the context and the answer.Stage 3 — Roundtrip Filtration: A trained extractive QA model to get the answer from the context and the generated question. If the predicted answer matches the generated answer, we keep this triplet (context, answer, and question). Otherwise, the triplet is discarded.The last step seems to be very strict. Any deviation from the generated answer will not be tolerated. However, given the EM(exact match) of the model trained on SQuAD 1.1 alone is already 87.7%, it’s reasonable to expect that the quality of answer predicted by the filtration model to be quite accurate. The paper also proposes an over-generation technique (generate two questions for each answer and context pair) to compensate for those valid triplets being discarded.(Taken from the source paper)More DetailsContext GenerationBeside using Wikipedia documents as contexts, this paper also generates completely synthetic contexts using an 8.3B GPT-2 model:This model was first trained with the Megatron-LM codebase for 400k iterations before being fine-tuned on only Wikipedia documents for 2k iterations. This allows us to generate high-quality text from a distribution similar to Wikipedia by using top-p (p = 0.96) nucleus sampling.Answer GenerationThis paper train the answer generation model to match the exact answer in the training data. This naturally ignores the other possible answers from the context but seems to be a more generalizable way to do it.The joint modeling of the starts and the ends of the answer span, which is reported to perform better, creates more candidates in the denominator in the calculation of the likelihood.(Taken from the source paper)(I’m not very sure about the complexity and performance impact of this joint approach.)Question GenerationThis paper uses token type ids to identify the components in the triplets. The answer span in the context are also marked by the answer token id. Special tokens is also added to the start and the end of the questions.(Taken from the source paper)Number of Triplets GeneratedAs explained in the previous section, the paper uses an over-generation technique to compensate for the model precision problem. Two questions are generated for each answer and context pair (a.k.a. answer candidate). Answer candidates of the context are generated by top-k sampling within a nucleus of p = 0.9 (that means we take the samples with the highest likelihoods until we either get K samples or the cumulative probabilities of the samples taken reaches 0.9).(Taken from the source paper)In the ablation study(which will be covered in the following sections), the models in stage 1 to 3 are trained with half of the SQuAD 1.1 training data, and the other half is used to generate synthetic data. The performance of the QA model trained on synthetic data is used to evaluate the quality of synthetic data.From the table above (Table 4), we can see that the smaller model on average generated 2.75 valid triplets per context, and the larger model generated 4.36 triplets. Those synthetic datasets are already bigger than the SQuAD 1.1 training set.ExperimentsModel ScaleTable 4 (in the previous section) shows that larger models in stage 1 to 3 create better data for the downstream model, but it is not clear whether it was the quality of the data or the quantity of the data that helped.(Taken from the source paper)Table 5 shows that the quality of questions generated does increase as the model scales up.(Taken from the source paper)To test the quality of the generated answers, the paper used the 1.2B question generator (see Table 5) to generate questions without filtration from the generated answers, fine-tune a QA model and test against the dev set. Table 6 shows that bigger model increases the quality of generated answers, but only marginally.(I am not sure how they obtained the benchmark for BERT-Large, though. I think BERT-Large expects a context and question pair to generate an answer, but here we want to generate answers from only the context. Maybe they take the pre-trained BERT-Large model and fine-tune it like the other two.)(Taken from the source paper)In Table 7 we can see that filtration does improve the performance of the downstream models (compare to Table 5). When using real answers to generate questions, less than 50% of the triplets generated by the 345M model were rejected, while about 55% by the 1.2B model was rejected. Note that all the models in this set under-performed to the model trained with only human-generated data (SQuAD training set).The additional triplets from using generated answers are quite helpful, the 1.2B model finally surpassed the baseline model (human-generated data), but it used 3x+ more data.To sum up, the ablation study shows that scaling up the model improved the quality of the generated data, but the increase in the quantity of the data also played a part.Fully Synthetic DataIn this part of the paper, they trained the models for stage 1 to 3 using the full SQuAD 1.1 training set, and use the deduplicated Wikipedia documents as contexts to generate answers and questions. They also fine-tune an 8.3B GPT-2 model on Wikipedia documents to generate synthetic contexts.(Taken from the source paper)Table 2 shows that synthetic contexts can be as good as the real ones. Also, further fine-tuning on the real SQuAD 1.1 data can further improve the performance, which might imply that there is still something missing in the fully or partially synthetic triplets.However, using 200x+ more data to get less 1% more accuracy seems wasteful. We want to know how much synthetic data we need to reach the baseline accuracy. The next section answers this question.The Quantity of Data Required to Beat the Baseline(Taken from the source paper)(The “data labeled” seems to mean the size of the corpus used to generate the triplets, not the size of generated triplets.)Figure 3 shows that we need at least 50 MB of text labeled to reach the baseline accuracy (without fine-tuning with the real data), 100 MB to surpass. That’s 2.5x+ and 7x+ more than the real one used by the baseline. Considering there are multiple triplets generated by one context, the number of triplets required is estimated (by me) to be around 20x and 40x more.The silver lining is that only 10 MB of text is needed to be labeled if we fine-tune the model with the real SQuAD data to surpass the baseline. That roughly translates to 3 to 4 times more triplets used than the baseline. So real plus synthetic data is probably the way to go for now.Wrapping UpThere are quite a few more details I did not cover in this post. Please refer to the source paper if you want to know more.All in all, very interesting results in this paper. Unfortunately, the amount of compute needed to synthesize the data and the amount of synthetic data needed to reach good results are still staggering. But on the other hand, it is essentially trade compute for the human labors required by the annotation process, and it might not be a bad deal.[Notes] Training Question Answering Models From Synthetic Data was originally published in Veritable on Medium, where people are continuing the conversation by highlighting and responding to this story.",0.561783,0.582028,0.574779,0.57821,0.638708,0.628183,0.615979,0.570093,0.579988,0.532942,0.568818,0.611583,0.583766,0.532249,0.530687


In [42]:
[doc.text for doc in raw_results]

['Syntax almost as friendly as Python, while running up to 100x\xa0fasterPhoto Credit(This post was originally published on my personal\xa0blog.)IntroductionIn my last post, [Failure Report] Distill Fine-tuned Transformers into Recurrent Neural Networks, I tried to distill the knowledge of a fine-tuned BERT model into an LSTM or GRU model without any data augmentation and failed to achieve satisfiable results. In the follow-up works, I tried to replicate the easies-to-implement augmentation method\u200a—\u200amasking\u200a—\u200aused in [1] and see its effect. The masking described in [1] is called “whole word masking” [2], that is, masking the whole word instead of just masking a single word\xa0piece.It is non-trivial to implement whole word masking, as it would require the sampling process to be aware of which word piece is itself a whole word, and which is part of a word. As you may know, doing text processing in pure Python is quite slow comparing to other compiled languages. I rec

In [43]:
scores_df

Unnamed: 0,title,text,deep learning,natural language processing,computer vision,statistics,implementation,visualization,industry,software engineering,reddit question,arxiv,cloud computing,deployment,competitions,business,business intelligence
0,Using Julia to Do Whole Word Masking,"Syntax almost as friendly as Python, while running up to 100x fasterPhoto Credit(This post was o...",0.539542,0.630511,0.533587,0.557276,0.572191,0.569392,0.542997,0.574472,0.551468,0.531546,0.515421,0.542184,0.506937,0.527759,0.54328
1,Transfer Learning with KERAS,"Learning is a never-ending process, but it’s more important to use previously gained knowledge i...",0.696341,0.595428,0.577528,0.566244,0.623856,0.597831,0.579651,0.589617,0.514814,0.502834,0.537744,0.560929,0.568218,0.559038,0.55649
2,How PyTorch Lightning became the first ML framework to runs continuous integration on TPUs,Learn how PyTorch Lightning added CI tests on TPUsContinue reading on PyTorch »,0.676328,0.595533,0.567742,0.567639,0.638649,0.599192,0.58586,0.589025,0.528871,0.514884,0.531069,0.567574,0.562189,0.560442,0.553597
3,Textual Description for Mathematical Equations,Reading of mathematical expression or equation in the document images is very challenging due to...,0.58575,0.561175,0.529714,0.54256,0.564061,0.554569,0.550492,0.519143,0.529417,0.536874,0.50922,0.539236,0.525885,0.546909,0.525588
4,[Notes] “Statistical Inference Enables Bad Science; Statistical Thinking Enables Good Science”,"Reading the Article by Christopher Tong on The American Statistician Volume 73, 2019Photo Credit...",0.58575,0.561175,0.529714,0.54256,0.564061,0.554569,0.550492,0.519143,0.529417,0.536874,0.50922,0.539236,0.525885,0.546909,0.525588
5,A generic pipeline to make offline inferences,"Data Science has been at the core of Doctrine.fr since the beginning. As such, a lot of effort h...",0.583226,0.596984,0.615735,0.660321,0.643634,0.641581,0.584964,0.587327,0.562529,0.558694,0.578877,0.606548,0.583824,0.563006,0.592717
6,"[D] Data Augmentation using Pre-Trained Transformers (BERT, GPT, etc) | Research Paper Walkthrough",Data augmentation is a widely used technique to increase the size of the training data. It helps...,0.583226,0.596984,0.615735,0.660321,0.643634,0.641581,0.584964,0.587327,0.562529,0.558694,0.578877,0.606548,0.583824,0.563006,0.592717
7,Tensorflow Profiler with Custom Training Loop,Analyze and Optimize TensorFlow Performance on GPUPhoto Credit(This article was first published ...,0.588091,0.59769,0.601689,0.599742,0.619534,0.63645,0.569693,0.589725,0.570566,0.572293,0.524144,0.57,0.568478,0.534849,0.563137
8,HOLMES: Health OnLine Model Ensemble Serving for Deep Learning Models in Intensive Care Units,HOLMES is tested on risk prediction task on pediatric cardio ICU data with above 95% prediction ...,0.560064,0.594392,0.559752,0.530292,0.56094,0.542139,0.546098,0.573157,0.552135,0.54459,0.530991,0.522977,0.529952,0.543969,0.55875
9,[Notes] Training Question Answering Models From Synthetic Data,Generate questions and answers from both real and synthetic contextsPhoto Credit(This post was o...,0.561783,0.582028,0.574779,0.57821,0.638708,0.628183,0.615979,0.570093,0.579988,0.532942,0.568818,0.611583,0.583766,0.532249,0.530687
