In [1]:
#default_exp haystack_search

In [2]:
#export
import pprint
import numpy as np
import pandas as pd
import requests
import torch
from sklearn import metrics
from nltk import tokenize
from operator import itemgetter

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.database.memory import InMemoryDocumentStore

from haystack.retriever.dense import EmbeddingRetriever
from pytorch_hackathon import rss_feeds

import seaborn as sns

08/16/2020 13:01:42 - INFO - transformers.file_utils -   PyTorch version 1.5.0+cu101 available.
08/16/2020 13:01:43 - INFO - transformers.file_utils -   TensorFlow version 2.1.0 available.


In [3]:
# Widen text columns in DataFrame previews. The fully-qualified option name is
# used because abbreviated names are resolved by regex matching and can become
# ambiguous when pandas adds new options.
pd.set_option('display.max_colwidth', 100)

In [4]:
# Light-to-green colormap, used further down to shade the score table.
cm = sns.light_palette(color="green", as_cmap=True)

In [5]:
# Step up one directory so the relative `data/` paths used below resolve.
# NOTE(review): not idempotent - re-running this cell climbs one more level.
%cd ..

/home/kuba/Projects/pytorch_hackathon


In [6]:
# List the input files available under data/ (feeds and topics are read below).
!ls data

feeds.txt  topics.txt  zsl_feed_results.csv


In [7]:
# Feed URLs are taken from the first column of the header-less feeds file.
feeds_table = pd.read_table('data/feeds.txt', header=None)
rss_feed_urls = list(feeds_table.iloc[:, 0].values)

In [8]:
# Build the articles DataFrame from the feed URLs via the project helper.
# Later cells rely on it having 'title' and 'text' columns.
feed_df = rss_feeds.get_feed_df(rss_feed_urls)

100%|██████████| 16/16 [00:07<00:00,  2.11it/s]


  feed_df['text'] = feed_df['summary'].apply(lambda s: bs4.BeautifulSoup(s).text)


In [9]:
# Run the embedding model on GPU when CUDA is available; Searcher only applies
# dynamic quantization when this is False.
use_gpu = torch.cuda.is_available()

In [10]:
# Convenience alias: pretty-print nested structures with a 2-space indent.
_printer = pprint.PrettyPrinter(indent=2)
pretty_print = _printer.pprint

In [11]:
# Preview the first rows to sanity-check the columns of the parsed feeds.
feed_df.head()

Unnamed: 0,title,title_detail,links,link,summary,summary_detail,id,guidislink,tags,text,...,published_parsed,comments,authors,author,author_detail,updated,updated_parsed,content,href,media_thumbnail
0,Hybrid Dynamic-static Context-aware Attention Network for Action Assessment in Long Videos,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/hybrid-dyna...",https://paperswithcode.com/paper/hybrid-dynamic-static-context-aware-attention,"However, most existing works focus only on video dynamic information (i. e., motion information)...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/hybrid-dynamic-static-context-aware-attention,False,"[{'term': 'Action quality assessment', 'scheme': None, 'label': None}]","However, most existing works focus only on video dynamic information (i. e., motion information)...",...,,,,,,,,,,
1,Predicting Visual Overlap of Images Through Interpretable Non-Metric Box Embeddings,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/predicting-...",https://paperswithcode.com/paper/predicting-visual-overlap-of-images-through,"Even when this is a known scene, the answer typically requires an expensive search across scale ...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/predicting-visual-overlap-of-images-through,False,,"Even when this is a known scene, the answer typically requires an expensive search across scale ...",...,,,,,,,,,,
2,Statistical Evaluation of Anomaly Detectors for Sequences,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/statistical...",https://paperswithcode.com/paper/statistical-evaluation-of-anomaly-detectors,"Although precision and recall are standard performance measures for anomaly detection, their sta...","{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/statistical-evaluation-of-anomaly-detectors,False,"[{'term': 'Anomaly detection', 'scheme': None, 'label': None}]","Although precision and recall are standard performance measures for anomaly detection, their sta...",...,,,,,,,,,,
3,On failures of RGB cameras and their effects in autonomous driving applications,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/on-failures...",https://paperswithcode.com/paper/on-failures-of-rgb-cameras-and-their-effects,RGB cameras are arguably one of the most relevant sensors for autonomous driving applications. <...,"{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/on-failures-of-rgb-cameras-and-their-effects,False,"[{'term': 'Autonomous driving', 'scheme': None, 'label': None}]",RGB cameras are arguably one of the most relevant sensors for autonomous driving applications. C...,...,,,,,,,,,,
4,Contextual Diversity for Active Learning,"{'type': 'text/plain', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/p...","[{'rel': 'alternate', 'type': 'text/html', 'href': 'https://paperswithcode.com/paper/contextual-...",https://paperswithcode.com/paper/contextual-diversity-for-active-learning,Contextual Diversity (CD) hinges on a crucial observation that the probability vector predicted ...,"{'type': 'text/html', 'language': None, 'base': 'https://us-east1-ml-feeds.cloudfunctions.net/pw...",https://paperswithcode.com/paper/contextual-diversity-for-active-learning,False,"[{'term': 'Active learning', 'scheme': None, 'label': None}, {'term': 'Image classification', 's...",Contextual Diversity (CD) hinges on a crucial observation that the probability vector predicted ...,...,,,,,,,,,,


In [12]:
#export



class Searcher:
    """Embedding-based search over a DataFrame of texts.

    Wraps a haystack ``EmbeddingRetriever`` together with a document store:
    texts are truncated, embedded and written to the store by ``add_texts``,
    and retrieved documents can be scored against free-form topic strings
    with ``get_topic_score_df``.
    """

    def __init__(
        self,
        model_name,
        text_col,
        use_gpu,
        max_document_length=256,
        quantize_model=True,
        document_store_cls=InMemoryDocumentStore
    ):
        """Create the document store and the retriever.

        Args:
            model_name: embedding model identifier handed to the retriever.
            text_col: name of the DataFrame column holding the document text.
                Embeddings are stored under ``text_col + '_emb'``.
            use_gpu: whether the retriever should run on GPU.
            max_document_length: maximum number of ``wordpunct_tokenize``
                tokens kept per document before embedding.
            quantize_model: dynamically quantize the model; only applied
                when ``use_gpu`` is false.
            document_store_cls: document store class; it is instantiated
                with a single ``embedding_field`` keyword argument.
        """
        self.text_col = text_col
        self.embedding_col = text_col + '_emb'
        self.max_document_length = max_document_length
        self.model_name = model_name
        self.document_store = document_store_cls(
            embedding_field=self.embedding_col,
        )
        self.retriever = self._setup_retriever(use_gpu, quantize_model)

    def _setup_retriever(self, use_gpu, quantize_model):
        """Build the ``EmbeddingRetriever`` bound to this instance's store."""
        retriever = EmbeddingRetriever(
            document_store=self.document_store,
            embedding_model=self.model_name,
            use_gpu=use_gpu)
        # Quantization is only applied on the CPU path.
        if not use_gpu and quantize_model:
            self.set_quantized_model(retriever)

        return retriever

    def add_texts(
        self,
        df
    ):
        """Embed the texts in ``df`` and write all rows to the document store.

        Each text is cut to at most ``max_document_length`` tokens before
        embedding.

        Side effect: ``df`` is modified in place - the embeddings are added
        as column ``self.embedding_col`` - and every row (all columns) is
        written to the store as one record.
        """
        truncated_texts = [
            ' '.join(tokenize.wordpunct_tokenize(text)[:self.max_document_length])
            for text in df[self.text_col]
        ]
        article_embeddings = self.retriever.embed_queries(
            texts=truncated_texts
        )

        df[self.embedding_col] = article_embeddings
        self.document_store.write_documents(df.to_dict(orient='records'))

    @classmethod
    def set_quantized_model(cls, retriever):
        """Replace the retriever's model with a dynamically quantized copy.

        ``torch.nn.Linear`` layers are quantized to ``torch.qint8``; the
        result is assigned back to ``retriever.embedding_model.model``.
        """
        quantized_model = torch.quantization.quantize_dynamic(
            retriever.embedding_model.model,
            {torch.nn.Linear}, dtype=torch.qint8
        )
        retriever.embedding_model.model = quantized_model

    @classmethod
    def sigmoid(cls, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    @classmethod
    def doc_to_dict(cls, doc):
        """Extract text, title (from ``doc.meta``) and query score of a document."""
        d = {}
        d['text'] = doc.text
        d['title'] = doc.meta['title']
        d['score'] = doc.query_score
        return d

    def get_topic_score_df(self, raw_results, topic_strings):
        """Score retrieved documents against a list of topics.

        Args:
            raw_results: documents returned by the retriever; each must carry
                its embedding in ``doc.meta[self.embedding_col]``.
            topic_strings: topic names; each becomes one score column.

        Returns:
            DataFrame with columns ``title``, ``text`` and one column per
            topic holding ``sigmoid(cosine_similarity(document, topic))``.
            If ``raw_results`` is empty, an empty DataFrame with the same
            columns is returned.
        """
        # Materialize both inputs once: they are traversed several times below.
        raw_results = list(raw_results)
        topic_strings = list(topic_strings)

        if not raw_results:
            # cosine_similarity rejects the 1-D array an empty list produces,
            # so short-circuit with a correctly shaped empty frame.
            return pd.DataFrame(columns=['title', 'text'] + topic_strings)

        results = [
            self.doc_to_dict(doc)
            for doc in raw_results
        ]
        # Read the embeddings from the column this instance wrote them to,
        # rather than a hard-coded 'text_emb' key that only matches
        # text_col == 'text'.
        result_embeddings = np.array([
            doc.meta[self.embedding_col]
            for doc in raw_results
        ]).astype('float32')
        # NOTE(review): topics are embedded as given. An earlier revision also
        # built 'text is about {topic}' strings here but never used them -
        # confirm which form is intended.
        topic_query_embeddings = np.array(self.retriever.embed_passages(
            topic_strings
        )).astype('float32')

        scores_df = pd.DataFrame({})
        scores_df['title'] = list(map(itemgetter('title'), results))
        scores_df['text'] = list(map(itemgetter('text'), results))

        scores = pd.DataFrame(metrics.pairwise.cosine_similarity(
            result_embeddings,
            topic_query_embeddings
        ))
        scores.columns = topic_strings

        scores_df = pd.concat(
            [scores_df, self.sigmoid(scores)],
            axis=1
        )
        return scores_df

In [13]:
# Embedding model identifier; Searcher passes it to EmbeddingRetriever as embedding_model.
model_name = "deepset/sentence_bert"

In [14]:
# Index the 'text' column; embeddings are therefore stored under 'text_emb'.
searcher = Searcher(model_name=model_name, text_col='text', use_gpu=use_gpu)

08/16/2020 13:01:52 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model deepset/sentence_bert
08/16/2020 13:01:52 - INFO - farm.utils -   device: cuda n_gpu: 1, distributed training: False, automatic mixed precision training: None
08/16/2020 13:01:52 - INFO - farm.infer -   Could not find `deepset/sentence_bert` locally. Try to download from model hub ...
08/16/2020 13:01:53 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/deepset/sentence_bert/pytorch_model.bin from cache at /home/kuba/.cache/torch/transformers/fa9d12cb00cd5a31f5a5367f58d242199473a6deb02c51380681ade7bf33c713.4948a08b5d844db1ecda79f6e7f47643f0175f2c030d48ce8b3beee3c6bd6012
08/16/2020 13:01:54 - INFO - transformers.modeling_utils -   All model checkpoint weights were used when initializing BertModel.

08/16/2020 13:01:54 - INFO - transformers.modeling_utils -   All the weights of BertModel were initialized from the model checkpoint at deepset/sentence_be

In [15]:
# Embed every article and write it to the document store.
# Note: add_texts also adds a 'text_emb' column to feed_df in place.
searcher.add_texts(feed_df)

08/16/2020 13:02:02 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
08/16/2020 13:02:02 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 57-0
Clear Text: 
 	text: Comments
Tokenized: 
 	tokens: ['comments']
 	offsets: [0]
 	start_of_word: [True]
Features: 
 	input_ids: [101, 7928, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Inferencing Samples: 100%|██████████| 74/74 [00:10<00:00,  7.08 Batches/s]


In [16]:
# NOTE(review): article_texts is not used by any cell visible here - confirm it is still needed.
article_texts = feed_df['text']

In [17]:
# Topic names are taken from the first column of the header-less topics file.
topics_table = pd.read_table('data/topics.txt', header=None)
topic_strings = topics_table.iloc[:, 0].values

In [18]:
# Show the topics, one per line.
topics_listing = '\n'.join(topic_strings)
print(topics_listing)

deep learning
natural language processing
computer vision
statistics
implementation
visualization
industry
software engineering
reddit question
arxiv
cloud computing
deployment
competitions
business
business intelligence


In [19]:
# Wrap each topic in a short natural-language prompt to use as a retrieval query.
topic_query_strings = list(map('text is about {}'.format, topic_strings))

In [20]:
# Index 1 selects the second topic from the topics file as the example query.
example_query = topic_query_strings[1]
raw_results = searcher.retriever.retrieve(example_query)

08/16/2020 13:02:12 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
08/16/2020 13:02:12 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 0-0
Clear Text: 
 	text: text is about natural language processing
Tokenized: 
 	tokens: ['text', 'is', 'about', 'natural', 'language', 'processing']
 	offsets: [0, 5, 8, 14, 22, 31]
 	start_of_word: [True, True, True, True, True, True]
Features: 
 	input_ids: [101, 3793, 2003, 2055, 3019, 2653, 6364, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.63 Batches/s]


In [21]:
# Score every retrieved document against every topic.
scores_df = searcher.get_topic_score_df(
    raw_results=raw_results,
    topic_strings=topic_strings,
)

08/16/2020 13:02:12 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
08/16/2020 13:02:12 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 9-0
Clear Text: 
 	text: arxiv
Tokenized: 
 	tokens: ['ar', '##xi', '##v']
 	offsets: [0, 2, 4]
 	start_of_word: [True, False, False]
Features: 
 	input_ids: [101, 12098, 9048, 2615, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Inferencing Samples: 100%|██████████| 4/4 [00:00<00:00,  7.69 Batches/s]
08/16/2020 13:02:13 - INFO - numexpr.utils -   Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
08/16/2020 13:02:13 - INFO - numexpr.utils -   NumExpr defaulting to 8 threads.


In [22]:
# Shade the score table with the green colormap `cm` to make the values easier to compare.
scores_df.style.background_gradient(cmap=cm)

Unnamed: 0,title,text,deep learning,natural language processing,computer vision,statistics,implementation,visualization,industry,software engineering,reddit question,arxiv,cloud computing,deployment,competitions,business,business intelligence
0,Evaluating the Impact of Knowledge Graph Context on Entity Disambiguation Models,Pretrained Transformer models have emerged as state-of-the-art approaches that learn contextual information from the text to improve the performance of several NLP tasks. Code: https://github.com/mulangonando/Impact-of-KG-Context-on-ED,0.613387,0.597722,0.531973,0.501481,0.593792,0.559076,0.545501,0.592912,0.543777,0.490492,0.511187,0.537554,0.530916,0.494104,0.496908
1,Feature Engineering with NLP,Converting tokens of text into features and applying machine learning and deep learning model on it.Continue reading on Towards AI — Multidisciplinary Science Journal »,0.597072,0.590448,0.549067,0.511888,0.567869,0.53675,0.531344,0.61711,0.547273,0.493482,0.531765,0.509824,0.53612,0.519174,0.530965
2,Introduction to Natural Language Processing,"NLP is a great tool to analyze text data and perform an amazing task when combined with machine learning and deep learning. So, let’s look…Continue reading on Towards AI — Multidisciplinary Science Journal »",0.606703,0.586747,0.556306,0.513555,0.553962,0.538722,0.533227,0.608819,0.54952,0.51401,0.531522,0.506409,0.531768,0.515964,0.540105
3,A single legal text representation at Doctrine: the legal camemBERT,"As a legal platform, Doctrine aggregates a lot of legal data with the intent of making them accessible, understandable and usable. The Machine Learning Engineers’ day-to-day material is mostly text: court decisions, legislation, legal commentaries, user queries, etc. All of our content is natural language, which we process in a number of ways: bag-of-words, embeddings or with language models.In an ideal world though, our product would be built on top of scalable, flexible and reusable modules, ones that would be generic enough to accommodate a wide variety of legal contents and feed the whole spectrum of our product features. It is exactly with that vision in mind that we started working on a unified language model a few months ago, whose associated challenges, findings and results we’ll do our best to summarize in this article.I. One language model to rule them allDepending on the project, we were representing our legal contents with:different techniques:TF-IDF vectorsBM25 (e.g., with ElasticSearch)A variant of Word2Vec, called Wang2Vec, embeddings fine-tuned on legal data — note that even if those embeddings work pretty well for a lot of tasks, they are not the state-of-the-art anymore. There’s not enough modeling power in simple word embeddings and we definitely see their limits now on some tasks.2. different data:vocabulary of the content itself,vocabulary of the linked contents from our legal graphvocabulary from some metadata provided by the courts…Yet eventually, we want to be able to represent all of our legal content using a unified framework for any text-understanding based feature, because of:Reusability: all teams can rely on this unique language model for their projects.2. 
Scalability:a modeling power sufficient to be applied to any new legal content (e.g., legal documents from the lower house and the upper house),robust enough to unlock use cases we’re not yet considering, like legal bots, legal trend detection, argument mining, etc,generic enough to be applied to a new language (with a retraining on the new language of course).3. Agnostic usage: one of the problems with our current representations is that the text follows some guidelines in the way they phrase statements, and a textual similarity is thus strongly biased towards documents that have the same overall phrasing (of the same court for example), despite the fact they’re not invoking the same laws about the same thing. For example, it is now difficult for us to match decisions from the High Court/Court of Appeal to those from the Supreme Court simply because of their different writing styles (the former tends to focus primarily and precisely on the facts, while the latter favors usually only relies on the legal matter, which has an adverse effect on our current representations).When we initially started thinking about this, there were some properties that we thought our language model should ideally cover:Taking advantage of the semantic proximity:In French:préjudice corporel should be equivalent to dommage corporelIn English: death should be equivalent to loss of life2. Being able to represent our content on different granularities:Token-level for Named Entity Recognition: anonymization, entity detection, …Paragraph-level: structure detection, argument similarities, …Document-level: legal domain classification, document recommendation, …It’s with all those things in mind that we started to work on a unique, all-encompassing language model serving all our use cases and features.II. Our legal language modelThe first step of this project was to design the architecture and implementation of our language model. 
This step was crucial since it would serve as the foundation to all of our future work and help us move towards our initial vision. We first thought about our technical constraints:use an existing and robust implementation, in order to take advantage of the support and the community,use a state-of-the-art technique to achieve very good performances,ideally use a PyTorch implementation, because our previous Deep Learning algorithms were made with PyTorch. Moreover, PyTorch (along with a few others) remains the dominant deep learning library at the time of writing this article,if possible, find an implementation with a French pre-trained model before fine-tuning, because transfer learning has shown its efficiency in NLP.It should also be noted that compared to other use-cases, especially in academic research, the framework should be efficient at representing very long texts. Here is an interesting blog post about different document embeddings techniques. We’ll come to that later.Under these constraints, the Hugging Face Transformers library appeared to be a very good choice:they offer all the recent state-of-the-art architectures (BERT, RoBERTa, ELMo, XLNet, …) complete with their associated PyTorch and TensorFlow implementations,some of them have a French pre-trained model,their implementation has quickly become an international reference, to the point where the famous NLP framework Spacy provides a Transformer implementation based on the Hugging Face one.Among the models providing a French pre-trained model, we had the choice between:BERT-Base, multilingualDistilmBERT, multilingualcamemBERT, French RoBERTa modelWe decided to go for camemBERT, since it already provided good results for the French language on several tasks according to this paper. Of course, multilingual models will probably be very useful for internationalization later, but we initially wanted to check that a transformer model could be relevant. 
Moreover, camemBERT has fewer parameters than multilingual models, which makes it a little easier to use.Note that camemBERT is case-sensitive, which will be useful for Named Entity Recognition and especially for anonymization.The legal CamemBERTNow that we had settled on the underlying technology, we decided to check how well it would perform on actual, real-life legal data.Knowing that camemBERT was initially trained on the French subcorpus of OSCAR, which features gigabytes of data crawled from the web, we knew that it would fare well at general French language tasks, but we suspected that the task of speaking the more specific French legalese would prove to be a tougher nut to crack, which our initial tests confirmed.For example, when asked to predict the next word of the sentence Par ces ... , camemBERT suggested the word mots, which is not exactly legal-oriented. We would expect something like moyens or motifs.It was obvious at this point that the trove of millions of legal documents we have at our disposal at Doctrine would prove to be great material for the subsequent fine-tuning needed to harness the full power of our model. At this point, we were confident that the model could be trained, however, we needed it to be potentially used universally across features. 
Yet, one issue remained: how to handle long texts, a strong prerequisite for legal documents, but something that doesn’t pair naturally with transformers’ inherent limitations.BERT models, for example, have a hard limit of 512 to 514 tokens (as enforced by the max_position_embeddings parameter), which would surely be a challenge when dealing with court decisions: texts that can be infamously verbose, with an average token count hovering around 2000 (and some even more extreme cases like this decision).To circumvent this issue, we envisioned two different approaches:Embedding each paragraphHaving sliding windows, as explained hereTo avoid ending up with redundancy in the embeddings, we decided to go with paragraph embeddings first, with exceedingly long paragraphs getting snipped past the limit during training. What was left for us to determine at that point was an aggregation strategy over the different paragraphs, so that we could harvest the final document embeddings, something that we would come back to later.We then proceeded with the implementation, which was done by splitting our legal documents on paragraphs and fine-tuning camemBERT on the masked language model task (using dedicated AWS GPU instances). It converged after a few days and we tested its relevance by using a few qualitative checks:Comparison between the standard pre-trained French camemBERT model and our legal camemBERT on a masked LM taskWe assessed the differences in prediction for semantically similar sentences, which seemed to be consistent. The qualitative check seemed to provide very good results. It was now time to validate the language model on a real task.III. Our first legal camemBERT use-case: classification of legal domainWe wanted to try our legal camemBERT on a simple task for a first validation: text classification of legal domains on court decisions.This is indeed a simple and well delimited task, and easy to compare to other basic models. 
Moreover, this classification has a huge product impact, on the search filters, recommender systems and analytics.We have two hierarchies on the legal domains at Doctrine:the main legal domain:Droit civil,Droit commercial,Droit social,Droit public,…2. the subdomain: for example in Droit civil, there areDivorce et séparation de corpsDroit locatifDroit des successionsDroit de la responsabilité…Today, we support 9 different domains and 40 different subdomains, where some are more complex than others to determine. These categories have a hierarchical structure, but we addressed the problem by reducing it to a 40-class classification problem.The HuggingFace repository suggests a classification head module integrated with CamemBERT. However, as discussed earlier, the main problem is that court decisions can be very verbose (have a look at this very long decision for example), and BERT does not work well on long texts. A very good review of document embeddings showed that there are no clear embedding technique that works better than others for very long documents. It really depends on your objective.Working at a paragraph level seemed more relevant, all the more so as the language model has been trained at a paragraph scale. BERT will then provide an embedding for each paragraph. We then had to think about a way to aggregate the paragraphs in order to get a decision embedding.ModelingParagraph embeddings methodIt is known that BERT architectures provide not only word-level contextual embeddings but also the special CLS-token whose output embedding is used for classification tasks. 
However it turns out to be a poor embedding of the input sequence of other tasks if not fine-tuned on the specific task:The paper Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks from Reimers et al, 2019, shows that BERT out-of-the-box maps sentences to a vector space that is rather unsuitable to be used with common similarity measures like cosine-similarity.According to BERT creator Jacob Devlin: “I’m not sure what these vectors are, since BERT does not generate meaningful sentence vectors. It seems that this is is doing average pooling over the word tokens to get a sentence vector, but we never suggested that this will generate meaningful sentence representations.” sourceStill, the most classic ways to embed a document (in our case, a paragraph) with BERT are:to use the [CLS]-tokento use an aggregation of the last X hidden states of the word embeddings ( we usually saw X=4)What is interesting in our case is that one paragraph does not represent the whole court decision. We had to plug something on top of it. We decided to go with the [CLS]-token as paragraph embeddings for a first shot, because our task is a classification task.2. 
Document embedding with an aggregation over paragraphsGiven embeddings for all our paragraphs, we then had to think of a way to get document embeddings.Here again, different approaches can be considered, since this is another sequence-to-one vector modeling:A simple average of all paragraph embeddings (the [CLS]-token of each BERT-output paragraphs),A weighted average of the paragraph embeddings, with weights built with a self-attention mechanism explained in the paper A Structured Self-attentive Sentence Embedding,A bi-LSTM to exploit the sequential information contained in the paragraphs,A Convolutional Neural Network,Another BERT that would learn the language at the paragraph scale,…Given that our task is a mere classification problem, the solution with a self-attention mechanism seemed to be pretty relevant for our case because:It’s a bit smarter than a simple average-pooling, and it will automatically get rid of the useless paragraphs that contain no information for the legal domain. Indeed, the final paragraphs of French decisions are often related to the operative part of the judgment, and about who pays the costs. This is usually not relevant to our current problem.It also provides some precious insights on how to best interpret the model. We can indeed have access to the attention weights and check on which paragraphs the model focused on the most for its prediction.With all that mind, here’s the final architecture for the classification task:Final architecture of our legal document classification on documents, using the legal camemBERTWe first tried to train the whole pipeline, including the fine-tuning of the legal camemBERT on this task, but we got memory errors. We quickly froze the BERT model and trained only the rest of the pipeline (attention layer + classification layer). It provided good results so we didn’t go with further experiments on an end-to-end training. 
This is something that we made a note of though, since unsupervised BERT outputs are known to be poor if not fine-tuned, as discussed earlier in this article.ResultsThe goal here was not only to improve our legal domains classification, but also to show that we could achieve at least the same results as a simple TF-IDF model.Dataset creationDeep learning in general often requires a consequent training set size. That’s why we used a semi-automatically labelled training dataset, labelled:by humans, using Prodi.gywith business rules, using the associated court as a reference. If a decision is linked to another one from Labor court, it’s very likely that the decision is about Droit du travail(labor laws).with the most reliable predictions of our former algorithm, based on TF-IDF for the domain, and a legal taxonomy for the subdomain.Comparison between models and discussionWe achieved the same performance with our legal camemBERT and with a simple TF-IDF, which is actually good news! We indeed didn’t spend a lot of time on the modeling part of camemBERT, and this classification task is in the end a rather simple NLP task.Moreover and perhaps just as interestingly, we noticed after a qualitative analysis of model’s prediction errors that the errors of the simple model were more often out of context. It means that when the TF-IDF gets it wrong, it’s really way off the mark. 
For example, this decision is predicted as Droit du transport with a probability of 0.96, instead of Droit des assurances because the decision is about a vehicle insurance claim and contains a lot of vocabulary related to transportation, and not that much about insurance.On the other hand, the legal camemBERT can of course be wrong, but it never steers too much out of context and will mostly predict subdomains that are very close, like Droit immobilier et de la construction and Droit de la copropriété et de la propriété immobilière, when we look at the confusion matrix.Moreover, CamemBERT managed to predict some subdomains that were not obvious at all, even for humans. For example, this decision has been predicted as Divorce et séparations de corpswithout any explicit mention of the word divorce in the decision! The subdomain here is very implicit and implied by a mention to a father that has to pay alimony to the mother of his child.Let’s now have a look at the attention weights of our modeling. Here are some examples below:Paragraph with the highest attention score (0.34) for the prediction of https://www.doctrine.fr/d/CA/Reims/2008/SK60FC7292250FC0B001E6 as Divorce et Séparation de corpsParagraph with the highest attention score (0.26) for the prediction of https://www.doctrine.fr/d/CA/Rouen/2016/1F43DFAE32435B18DC90 as Droit des étrangers et de la nationalitéThese attention scores totally make sense, and confirmed the approach.We also confirmed that paragraphs related to generic procedures had a very low attention weight, like this one:Paragraph with a very low attention weight of 0.01 for the prediction of https://www.doctrine.fr/d/CA/Rouen/2016/1F43DFAE32435B18DC90 as Droit des étrangers et de la nationalitéFinally, when we had a look at the errors of the models (both models), we quickly noticed that some classes were very well predicted and some others were not. 
Our intuition about the observed discrepancy boils down to the fact that language models are only ever as good as their training dataset. In our case, the issue seems to stem from volume and errors in the training set. This is definitely the next priority for this task to focus on, before trying to play with the different architectures. Indeed, the current one seems to work pretty well on subdomains when the training dataset is satisfactory.ConclusionWe built a legal language model with a state-of-the-art technique, that proved to be very efficient at capturing highly relevant information on a simple classification task. This is a huge step for Doctrine, as we have a lot of very complex tasks in Natural Language Processing to tackle! The granularity of this new language model, which can seamlessly provide token, paragraph and document embeddings will be key for us to find new applications for the technique on a wide array of complex Natural Language Processing tasks at Doctrine.In fact, the legal camemBERT has already found a second problem to tackle with the issue of semantic similarity between users and legal content in the context of a recommendation system and seems to already have yielded promising results, which we’ll be sharing in an upcoming blog post very soon. Stay tuned!A single legal text representation at Doctrine: the legal camemBERT was originally published in Inside Doctrine on Medium, where people are continuing the conversation by highlighting and responding to this story.",0.570231,0.586314,0.521751,0.499045,0.548911,0.511261,0.517473,0.595577,0.505991,0.492187,0.509414,0.505175,0.512034,0.511341,0.525712
4,A Large-Scale Chinese Short-Text Conversation Dataset,The cleaned dataset and the pre-training models will facilitate the research of short-text conversation modeling. Code: https://github.com/thu-coai/CDial-GPT,0.621184,0.586691,0.545824,0.541583,0.584535,0.550647,0.527896,0.600209,0.548811,0.484411,0.517256,0.519267,0.528229,0.519199,0.530539
5,"[Q] Data scientist here, working on gathering a corpus of academic papers focusing on ""Cognitive Linguistics"". Need your help!","Hello. I want to collect as many as papers as I can that will fall into this category. The main problem is that the ""tagging"" is not consistent for linguistic papers. Hence I'm looking for an exhausitve list of tags which are directly related to this field, in order to make better queries and find more relevant data. Thanks! submitted by /u/quit_daedalus [link] [comments]",0.5553,0.578011,0.498356,0.489413,0.526189,0.51321,0.489906,0.544217,0.557324,0.480271,0.496461,0.485666,0.530694,0.496379,0.504614
6,KR-BERT: A Small-Scale Korean-Specific Language Model,"Since the appearance of BERT, recent works including XLNet and RoBERTa utilize sentence embedding models pre-trained by large corpora and a large number of parameters. Code: https://github.com/snunlp/KR-BERT",0.574174,0.594593,0.522638,0.532562,0.583562,0.552562,0.547111,0.580898,0.551625,0.498453,0.515522,0.564181,0.550184,0.508991,0.513304
7,Dialogue State Induction Using Neural Latent Variable Models,Dialogue state modules are a useful component in a task-oriented dialogue system. Code: https://github.com/taolusi/dialogue-state-induction,0.609547,0.595336,0.546996,0.492073,0.601822,0.568787,0.546473,0.602195,0.57031,0.499499,0.532563,0.543948,0.563158,0.529331,0.538693
8,Inter-Image Communication for Weakly Supervised Localization,We learn a feature center for each category and realize the global feature consistency by forcing the object features to approach class-specific centers. Code: https://github.com/xiaomengyc/I2C,0.59386,0.596002,0.5415,0.521783,0.593086,0.556799,0.535707,0.601382,0.539489,0.486664,0.553956,0.532525,0.545695,0.532982,0.542407
9,TextRay: Contour-based Geometric Modeling for Arbitrary-shaped Scene Text Detection,"Arbitrary-shaped text detection is a challenging task due to the complex geometric layouts of texts such as large aspect ratios, various scales, random rotations and curve shapes. Code: https://github.com/LianaWang/TextRay",0.573135,0.569583,0.536931,0.505102,0.543021,0.542183,0.517943,0.571432,0.553461,0.498335,0.522298,0.509426,0.53634,0.498024,0.506026


In [23]:
# Show the `.text` attribute of every item in `raw_results` (built in an earlier cell).
[result.text for result in raw_results]

['Pretrained Transformer models have emerged as state-of-the-art approaches that learn contextual information from the text to improve the performance of several NLP tasks. Code: https://github.com/mulangonando/Impact-of-KG-Context-on-ED',
 'Converting tokens of text into features and applying machine learning and deep learning model on it.Continue reading on Towards AI\u200a—\u200aMultidisciplinary Science Journal »',
 'NLP is a great tool to analyze text data and perform an amazing task when combined with machine learning and deep learning. So, let’s look…Continue reading on Towards AI\u200a—\u200aMultidisciplinary Science Journal »',
 'As a legal platform, Doctrine aggregates a lot of legal data with the intent of making them accessible, understandable and usable. The Machine Learning Engineers’ day-to-day material is mostly text: court decisions, legislation, legal commentaries, user queries, etc. All of our content is natural language, which we process in a number of ways: bag-of-

In [24]:
# Display `scores_df` (built in an earlier cell) as this cell's output; the rendered
# table below shows `title`, `text`, and one numeric score column per topic label.
scores_df

Unnamed: 0,title,text,deep learning,natural language processing,computer vision,statistics,implementation,visualization,industry,software engineering,reddit question,arxiv,cloud computing,deployment,competitions,business,business intelligence
0,Evaluating the Impact of Knowledge Graph Context on Entity Disambiguation Models,Pretrained Transformer models have emerged as state-of-the-art approaches that learn contextual ...,0.613387,0.597722,0.531973,0.501481,0.593792,0.559076,0.545501,0.592912,0.543777,0.490492,0.511187,0.537554,0.530916,0.494104,0.496908
1,Feature Engineering with NLP,Converting tokens of text into features and applying machine learning and deep learning model on...,0.597072,0.590448,0.549067,0.511888,0.567869,0.53675,0.531344,0.61711,0.547273,0.493482,0.531765,0.509824,0.53612,0.519174,0.530965
2,Introduction to Natural Language Processing,NLP is a great tool to analyze text data and perform an amazing task when combined with machine ...,0.606703,0.586747,0.556306,0.513555,0.553962,0.538722,0.533227,0.608819,0.54952,0.51401,0.531522,0.506409,0.531768,0.515964,0.540105
3,A single legal text representation at Doctrine: the legal camemBERT,"As a legal platform, Doctrine aggregates a lot of legal data with the intent of making them acce...",0.570231,0.586314,0.521751,0.499045,0.548911,0.511261,0.517473,0.595577,0.505991,0.492187,0.509414,0.505175,0.512034,0.511341,0.525712
4,A Large-Scale Chinese Short-Text Conversation Dataset,The cleaned dataset and the pre-training models will facilitate the research of short-text conve...,0.621184,0.586691,0.545824,0.541583,0.584535,0.550647,0.527896,0.600209,0.548811,0.484411,0.517256,0.519267,0.528229,0.519199,0.530539
5,"[Q] Data scientist here, working on gathering a corpus of academic papers focusing on ""Cognitive...",Hello. I want to collect as many as papers as I can that will fall into this category. The main ...,0.5553,0.578011,0.498356,0.489413,0.526189,0.51321,0.489906,0.544217,0.557324,0.480271,0.496461,0.485666,0.530694,0.496379,0.504614
6,KR-BERT: A Small-Scale Korean-Specific Language Model,"Since the appearance of BERT, recent works including XLNet and RoBERTa utilize sentence embeddin...",0.574174,0.594593,0.522638,0.532562,0.583562,0.552562,0.547111,0.580898,0.551625,0.498453,0.515522,0.564181,0.550184,0.508991,0.513304
7,Dialogue State Induction Using Neural Latent Variable Models,Dialogue state modules are a useful component in a task-oriented dialogue system. Code: https://...,0.609547,0.595336,0.546996,0.492073,0.601822,0.568787,0.546473,0.602195,0.57031,0.499499,0.532563,0.543948,0.563158,0.529331,0.538693
8,Inter-Image Communication for Weakly Supervised Localization,We learn a feature center for each category and realize the global feature consistency by forcin...,0.59386,0.596002,0.5415,0.521783,0.593086,0.556799,0.535707,0.601382,0.539489,0.486664,0.553956,0.532525,0.545695,0.532982,0.542407
9,TextRay: Contour-based Geometric Modeling for Arbitrary-shaped Scene Text Detection,Arbitrary-shaped text detection is a challenging task due to the complex geometric layouts of te...,0.573135,0.569583,0.536931,0.505102,0.543021,0.542183,0.517943,0.571432,0.553461,0.498335,0.522298,0.509426,0.53634,0.498024,0.506026
