[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jonasengelmann/erinnerungsluecken-im-nsu-untersuchungsausschuss/blob/master/Semantic_Matching.ipynb)

In [None]:
import os
import re
import string
import subprocess
import pickle
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
pd.set_option('max_colwidth', 100)

In [None]:
# Lay out the working directories below the current working directory:
#   01_data/01_pdf  - protocol PDFs
#   01_data/02_xml  - XML files generated from the PDFs
#   02_results      - parsed dialog and matching results
project_root = Path.cwd()
data_root = project_root / '01_data'

pdf_folder = data_root / '01_pdf'
xml_folder = data_root / '02_xml'
results_folder = project_root / '02_results'

pdf_folder.mkdir(parents=True, exist_ok=True)  # parents=True also creates 01_data
xml_folder.mkdir(exist_ok=True)
results_folder.mkdir(exist_ok=True)

You can either download and scrape the pdfs, or use the provided files in the data folder
of the repository and skip the next two cells!

# 1 Download transcripts as PDFs

I collected the URLs of the transcripts that contain witness interrogations in a list. Let's download them!

In [None]:
# URLs of the protocols to download. All files sit in the same remote
# directory and differ only in the protocol number, so the list is
# generated from the numbers.
_PROTOCOL_URL_PREFIX = 'http://dipbt.bundestag.de/doc/btd/17/CD14600/Protokolle/Protokoll-Nr%20'
_PROTOCOL_NUMBERS = (
    '12', '14', '15', '17', '19', '21', '22a', '22b', '24a', '24b',
    '27', '29a', '29b', '31', '32', '34a', '34b', '36', '39', '41',
    '43', '44', '47', '49a', '49b', '51', '53', '54', '56a', '56b',
    '57', '59a', '59b', '60', '62', '64a', '64b', '65', '66a', '66b',
    '68a', '68b', '70a', '70b', '72a', '72b', '74',
)
urls = [f'{_PROTOCOL_URL_PREFIX}{number}.pdf' for number in _PROTOCOL_NUMBERS]

# Fetch every protocol into the PDF folder. The local file name is the
# last path segment of the URL.
for url in urls:
    print(f'Downloading {url}')
    local_name = url.rsplit("/", 1)[-1]
    urllib.request.urlretrieve(url, pdf_folder / local_name)


# 2 Scraping with pdfminer

Let's scrape the PDFs using pdfminer!

We want to preserve some of the layout information, such as font type and font size, so that we can later differentiate more easily between different speakers, remarks, quotations, etc.

In [None]:
# This step requires pdfminer to be installed. 
# For python2 use: pip install pdfminer
!pip install pdfminer.six

In [None]:
%%time

# Scraping all pdfs might take a while! [It took me around 10mins]

# Convert every PDF to XML with pdfminer's pdf2txt.py command line tool
# ('-t xml' selects XML output, '-n' disables layout analysis).
for filename in pdf_folder.glob('*.pdf'):
    target = xml_folder / f'{filename.stem}.xml'
    print(f'Scraping {filename}')
    exit_code = subprocess.call(['pdf2txt.py', '-t', 'xml', '-n', '-o', target, filename])
    if exit_code != 0:
        # Report the failure instead of silently continuing with a
        # missing or incomplete XML file.
        print(f'pdf2txt.py failed for {filename} (exit code {exit_code})')

In [None]:
%%time 

# Lets parse the generated xmls (this may take a while [2mins for me]):

def crop_bottom_and_top(document, crop_val):
    '''
    Drops the text elements in the top and bottom band of every page.

    document is an iterable of page elements. crop_val is the height of
    each band as a percentage of the page's upper y coordinate (the fourth
    value of its "bbox" attribute). Only children with the tag "text" whose
    own upper y coordinate lies strictly between the two bands are kept.
    Returns one list of kept elements per page.
    '''
    def upper_y(element):
        return float(element.attrib["bbox"].split(",")[3])

    cropped = []
    for page in document:
        page_top = upper_y(page)
        lower_limit = page_top*crop_val/100
        upper_limit = page_top*(1-crop_val/100)

        kept = []
        for element in page:
            # Check the tag first: other children are skipped without
            # looking at their bbox.
            if element.tag != "text":
                continue
            if lower_limit < upper_y(element) < upper_limit:
                kept.append(element)
        cropped.append(kept)
    return cropped

def check_if_characters_match_style(n_characters, font, size):
    '''
    Tells whether a run of text elements is written in one given style.

    n_characters has to be a list of text elements. An element fits if its
    "font" attribute (lowercased) contains font and its "size" attribute
    starts with size; elements whose text is only whitespace always fit.
    Returns True if every element fits and at least one element has
    non-whitespace text, otherwise False.
    '''
    def fits(element):
        if (font in element.attrib["font"].lower()
                and element.attrib["size"].startswith(size)):
            return True
        return not element.text.strip()

    n_fitting = sum(1 for element in n_characters if fits(element))

    # A run without any visible character never counts as a match:
    if not any(element.text.strip() for element in n_characters):
        return False
    return n_fitting == len(n_characters)

def find_next_speaker_and_text(characters):
    '''
    Finds the next speaker and text on the basis that speaker are always
    written with bold font and in font size 9.

    characters is a list of text elements with "font" and "size"
    attributes. This is a generator of (speaker, text) tuples; text is
    passed through clean_text. Whenever a new speaker starts, the pair
    collected so far is yielded, so the first yielded pair has an empty
    speaker and has to be filtered out by the caller. The pair that is
    still being collected when the characters run out is yielded at the
    end.
    '''
    speaker, text = [], []
    record_speaker = False
    for idx, char in enumerate(characters):

        # A new speaker starts where the next 10 characters are bold and in
        # font size 9. While a name is being recorded the check is skipped.
        if (not record_speaker
           and check_if_characters_match_style(characters[idx:idx+10], "bold", "9")):

            yield "".join(speaker), clean_text("".join(text))

            record_speaker = True
            speaker = []

        if record_speaker:
            if char.attrib["size"].strip().startswith("9"):
                speaker.append(char.text)

            # Check if it is the end of the speaker's name: the name ends
            # when the next visible character is not bold. The very last
            # character is compared with itself.
            next_char = characters[idx+1] if (idx+1) != len(characters) else char

            if "bold" not in next_char.attrib["font"].lower() and next_char.text.strip():
                record_speaker = False
                text = []

        elif char.attrib["size"].strip().startswith("9"):
            text.append(char.text)

    # Flush the last utterance. Without this the final speaker of every
    # document would be dropped, because pairs are otherwise only yielded
    # when the *next* speaker is found. If the input ends in the middle of
    # a speaker name, `text` still belongs to the previous (already
    # yielded) speaker, so nothing is emitted in that case.
    if speaker and not record_speaker:
        yield "".join(speaker), clean_text("".join(text))

def clean_text(text):
    '''
    Normalizes whitespace and re-joins words that were split at a line end.

    Runs of spaces are collapsed to a single space first. Then a hyphen
    (optionally followed by spaces) that stands between two lowercase
    letters is removed together with those spaces.
    '''
    single_spaced = re.sub(' +', ' ', text)
    rejoined = re.sub(r"([a-zßäöü])-[ ]*([a-zßäöü])", '\\1\\2', single_spaced)
    return rejoined


# Collect the (speaker, text) pairs of all xml files, in file name order.
protocol = []
for filename in sorted(xml_folder.glob('*.xml')):

    document = ET.parse(filename).getroot()

    # Discard headers and footers: drop the top and bottom 7 percent of every page
    document = crop_bottom_and_top(document, 7)

    # Flatten the pages into one sequence of text characters
    characters = [element for page in document for element in page if element.tag == "text"]

    # Parse content of xmls, keeping only pairs with a non-blank speaker
    protocol.extend(
        pair for pair in find_next_speaker_and_text(characters) if pair[0].strip()
    )

In [None]:
# Save extracted data to disk.
# The encoding is given explicitly: the texts contain umlauts, and the
# platform default encoding is not guaranteed to be able to store them.

with open(results_folder / 'parsed_dialog.txt', 'w', encoding='utf-8') as output:
    for speaker, text in protocol:
        output.write(f'{speaker.strip()} {text.strip()}\n\n')

with open(results_folder / 'only_witnesses_text.txt', 'w', encoding='utf-8') as output:
    for speaker, text in protocol:
        if 'zeug' in speaker.lower():
            output.write(text + '. ')

# Use a context manager so the pickle file is flushed and closed right away
# instead of leaving the handle open.
with open(results_folder / 'parsed_dialog.p', 'wb') as output:
    pickle.dump(protocol, output)

In [None]:
# You can download the parsed_dialog file like this:
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running elsewhere. The cells that download the
# fasttext and bert results use `files` from this import as well.
from google.colab import files
files.download(results_folder / 'parsed_dialog.txt')


# 3 Semantic Matching

Now let's try to identify instances in which the inability to remember is expressed.

In [None]:
# Working folder for the matching experiments, next to the results folder:
sem_folder = results_folder.parent / '03_semantic_matching'
sem_folder.mkdir(exist_ok=True)

## 3.1 With a simple regular expression

Expressions of remembering really come down to a few words in German: a few nouns like memory (Erinnerung, Gedächtnis), a few verbs like remembering (sich erinnern, sich entsinnen), and some rather rare adjectives (erinnerlich), etc. So let's try to match those in combination with a negation using a simple regular expression. We have to be careful with the scope of the regular expression, as we do not want to pick up negating words from other parts of the sentence. As a simple window, we can just try to match on half sentences.

In [None]:
def cant_remember_matcher(input_text):
    '''
    A simple regular expression that will match instances of not remembering.

    Returns the list of all half sentences in input_text that match one of
    the patterns below. Each returned string still includes the punctuation
    character that delimits it at both ends. Matching is case-insensitive.
    '''
    wildcards = r'[äöüßa-zÄÖÜA-Z- ]*'
    # we only want to match relevant half sentences here to avoid confusions
    # with negations in other parts of the same sentence:
    borders = r'[,.;:?!]'
    bw = borders + wildcards
    wb = wildcards + borders

    sem_matcher = [f'{bw}keine{wildcards}Erinnerung{wb}',
                   f'{bw}nicht{wildcards}Erinnerung{wb}',
                   f'{bw}Erinnerung{wildcards}nicht{wb}',
                   f'{bw}weiß{wildcards}nicht{wildcards}mehr{wb}',
                   f'{bw}nicht{wildcards}erinner{wb}',
                   f'{bw}nicht{wildcards}entsinne{wb}',
                   f'{bw}nicht{wildcards}Gedächtnis{wb}',
                   f'{bw}Gedächtnis{wildcards}nicht{wb}',
                   f'{bw}nicht{wildcards}mehr {wildcards}sagen{wb}',
                   f'{bw}fällt{wildcards}nicht{wildcards}ein{wb}',
                   f'{bw}nicht{wildcards}mehr{wildcards}rekonstruieren{wb}',
                   f'{bw}nicht{wildcards}gegenwärtig{wb}',
                   f'{bw}nicht{wildcards}präsent{wb}',
                   f'{bw}nicht{wildcards}im Kopf{wb}',
                   f'{bw}nicht{wildcards}im Hinterkopf{wb}']

    # Search the argument itself. Reading a global named `text` here would
    # only work while the caller happens to use that exact variable name.
    return re.findall("|".join(sem_matcher), input_text, re.IGNORECASE)
                      
# Collect every matched half sentence of the witnesses (speakers whose
# name contains "zeug"). The first and the last character of a match are
# its delimiting punctuation and are cut off.
erinnerungsluecken = []
for speaker, text in protocol:
    if "zeug" not in speaker.lower():
        continue
    witness = speaker.strip()
    erinnerungsluecken.extend(
        f'{witness} ...{match[1:-1].strip()}... ' for match in cant_remember_matcher(text)
    )

print(len(erinnerungsluecken))


In [None]:
# Save results to file. The encoding is given explicitly because the
# matches contain umlauts and the platform default may not be UTF-8.
with open(results_folder / "erinnerungsluecken.txt", "w", encoding="utf-8") as output:
    for x in erinnerungsluecken:
        output.write(x + '\n')


A regular expression will only match the rules we defined, so it only takes us so far. How could we generalize our semantic matcher so that other expressions with the same meaning but different words are matched as well?

## 3.2 Text classification

One promising approach is to move from words to word embeddings. Word embeddings encode words as vectors in an n-dimensional vector space. Ideally, words with similar meaning share a similar location in the vector space, so by 'widening' the space around one known word vector we could achieve some generalization.

Further, we want to transform our task into a classification task. We could simply think of it as a 'spam' vs 'not spam' identification task, swapping in 'expression of not remembering' vs 'anything else'. In order to do so, we first need to train a model. Fasttext is a simple tool that lets us do all that fairly easily and really fast. Let's preprocess our data and see how well it performs.

### 3.2.1 Data Preprocessing

In [None]:
# First, let's collect everything the witnesses said in one long text.
# Every utterance gets a trailing dot and the utterances are joined by spaces:
long_text = " ".join(
    text + "." for speaker, text in protocol if "zeug" in speaker.lower()
)

In [None]:
def remove_punctuation(text):
    '''
    Returns text without the characters listed in string.punctuation.
    '''
    return text.translate(str.maketrans('', '', string.punctuation))

def sentence_tokenizer(text):
    '''
    Very simple sentence tokenizer using the sequence '. ' to split.
    However, as this sequence also appears in other contexts
    (abbreviations, dates, abbreviated names), we will try to remove
    those dots first.

    Returns the list of sentences. The separating '. ' is removed, so only
    the last sentence can still end with a dot.
    '''
    mapping = {'bzw.':'bzw',
               'Bzw.':'bzw',
               'etc. pp.': 'etc pp',
               'etc.':'etc',
               'usw.':'usw',
               'usf.':'usf',
               'z.B.':'zum Beispiel',
               'z. b.':'zum Beispiel',
               'a. D.':'außer Dienst',
               'Abs.':'abs',
               'u. a.':'unter anderem',
               'evtl.':'eventuell',
               'ggf.':'gegebenenfalls',
               'f.':'f',
               'o. g.':'oben genannten',
               'ca.':'ca',
               'ff.':'ff'}

    # Abbreviations are only replaced where they start a word. A plain
    # substring replacement would also hit word endings ('Anruf.' contains
    # 'f.') and thereby delete real sentence boundaries.
    for k, v in mapping.items():
        text = re.sub(r'(?<!\w)' + re.escape(k), lambda match, v=v: v, text)

    # remove dots in numbers specifically dates
    # '21.3.' and '5. September' but not '1998.'
    # (one or two digits that follow a dot or a space):
    text = re.sub(r'([\. ]\d{1,2})\. ', '\\1 ', text)

    # Some names are anonymized with a dot: (P., R., W. etc.)
    # \b restricts this to single letters, so that acronyms at the end of
    # a sentence ('NSU. ') keep their sentence boundary.
    text = re.sub(r'\b([A-Z])\. ', '\\1 ', text)

    return text.split('. ')

# Now let's split into sentences:
data = sentence_tokenizer(long_text)

# Drop very short sentences: only sentences with more than 4
# space-separated tokens are kept.
# Disabled variant that additionally removes punctuation and lowercases:
#data = [remove_punctuation(x.lower()) for x in data if len(x.split(' ')) > 4]
data = [x for x in data if len(x.split(' '))> 4]

print(len(data))

In [None]:
# Now we need to label a bit of the data to create a training set for our text classification model:
# Let's use our regular expression from before:
    
def cant_remember_matcher(text):
    '''
    Tells whether text contains an expression of not remembering.

    Returns True if one of the keyword sequences below occurs in text
    (case-insensitive), with only letters, spaces and hyphens between the
    keywords, otherwise False.

    NOTE(review): the '.' inside the two border alternations is not
    escaped, so it matches any character and not only a full stop.
    Escaping it would change which sentences are labelled - confirm the
    intent before changing it.
    '''
    wildcards = r'[äöüßa-zÄÖÜA-Z- ]*'
    # we only want to match relevant half sentences here to avoid confusions
    # with negations in other parts of the same sentence:
    halfsentence_start = r'(,|.|;|:|\?|!|^)'
    halfsentence_end = r'(,|.|;|:|\?|!|$)'

    # Keywords that have to follow each other within one half sentence:
    keyword_sequences = [
        ('keine', 'Erinnerung'),
        ('nicht', 'Erinnerung'),
        ('Erinnerung', 'nicht'),
        ('weiß', 'nicht', 'mehr'),
        ('nicht', 'erinner'),
        ('erinner', 'nicht'),
        ('nicht', 'entsinne'),
        ('nicht', 'Gedächtnis'),
        ('Gedächtnis', 'nicht'),
        ('nicht', 'mehr ', 'sagen'),
        ('fällt', 'nicht', 'ein'),
        ('nicht', 'mehr', 'rekonstruieren'),
        ('nicht', 'gegenwärtig'),
        ('nicht', 'präsent'),
        ('nicht', 'im Kopf'),
        ('nicht', 'im Hinterkopf'),
    ]
    sem_matcher = '|'.join(
        halfsentence_start + wildcards
        + wildcards.join(keywords)
        + wildcards + halfsentence_end
        for keywords in keyword_sequences
    )
    return re.search(sem_matcher, text, re.IGNORECASE) is not None

def build_dataset(data, n_negative_candidates=2000):
    '''
    Labels the sentences in data and returns them as a DataFrame with the
    columns 'label' and 'text' (one row per sentence, in input order).

    label is
      '1' if cant_remember_matcher matches the sentence,
      '0' if it does not match and the sentence is among the first
          n_negative_candidates sentences of data,
      ''  (unlabeled) otherwise.
    '''
    rows = []
    for idx, text in enumerate(data):
        if cant_remember_matcher(text):
            label = '1'
        elif idx < n_negative_candidates:
            label = '0'
        else:
            label = ''

        rows.append({'label': label, 'text': text})

    # Name the columns explicitly so that an empty input still yields a
    # frame that has the 'label' and 'text' columns.
    return pd.DataFrame(rows, columns=['label', 'text'])


# Label all sentences and print (number of rows, number of columns):
df = build_dataset(data)      
print(df.shape)

In [None]:
def split_dataframe_by_ratio(df, ratio=0.8):
    '''
    Splits df by row position into a leading and a trailing part.

    The index of df is first replaced in place by a fresh 0..n-1 index.
    The first int(n * ratio) rows form the first part, the remaining rows
    the second part. Returns both parts as a tuple.
    '''
    df.reset_index(drop=True, inplace=True)
    cut = int(len(df) * ratio)
    return df.iloc[:cut], df.iloc[cut:]

# Rows that carry a label ("1" or "0"), in random order:
df_labeled = df[df['label'].isin(["1", "0"])].sample(frac=1)

# Split the labeled rows into
#   train (the first 80% of the shuffled labeled rows) and
#   eval  (the remaining 20%):
train_df, eval_df = split_dataframe_by_ratio(df_labeled, 0.8)

# predict (all the unlabeled rows), renumbered from 0
predict_df = df[df['label'] == ''].reset_index(drop=True)

for frame in (train_df, eval_df, predict_df):
    print(frame.shape)

### 3.2.2 Fasttext

In [None]:
# Working folder for the fasttext experiment:
fasttext_folder = sem_folder / '02_fasttext'
fasttext_folder.mkdir(exist_ok=True)

In [None]:
# Let's save our datasets to disk:
# I strongly recommend going through the train and eval datasets to check that everything has been
# set up correctly. We naively assumed that everything that does not match our simple regular
# expressions could be labeled as 'anything else', so we definitely have some false negatives in
# that dataset.
# The files are written as UTF-8 explicitly, because the sentences contain umlauts and the
# platform default encoding may differ.

with open(fasttext_folder / 'train.txt', 'w', encoding='utf-8') as outfile:
    for _, row in train_df.iterrows():
        outfile.write(f'__label__{row["label"]} {row["text"]}\n')

with open(fasttext_folder / 'eval.txt', 'w', encoding='utf-8') as outfile:
    for _, row in eval_df.iterrows():
        outfile.write(f'__label__{row["label"]} {row["text"]}\n')

with open(fasttext_folder / 'predict.txt', 'w', encoding='utf-8') as outfile:
    for _, row in predict_df.iterrows():
        outfile.write(f'{row["text"]}\n')

In [None]:
# Let's download, unzip and compile fasttext:
os.chdir(fasttext_folder)
!wget https://github.com/facebookresearch/fastText/archive/v0.2.0.zip
!unzip v0.2.0.zip
os.chdir('./fastText-0.2.0')
# You will need a working C++ compiler to successfully build it. 
# See here for more info: https://fasttext.cc/docs/en/support.html
!make


In [None]:
# Let's train the model:
!./fasttext supervised -input ../train.txt -output model -epoch 25 -lr 0.5 

In [None]:
# Lets test the model:
!./fasttext test model.bin ../eval.txt

It seems like the model performs well. However, our labeled data set is rather small and was labeled with the help of a regular expression, so the different expressions that we want to find here are not represented in our labeled data.

In [None]:
!./fasttext predict-prob model.bin ../predict.txt > ../predict_result.txt

In [None]:
# Let's change back to our project directory.
# fasttext_folder is an absolute path two levels below the project
# directory, so this does not depend on the current working directory
# (a relative "../../.." only works from inside the fastText build folder).
os.chdir(fasttext_folder.parent.parent)

In [None]:
# Let's look at the results: column 0 holds the predicted label,
# column 1 its probability.
result = pd.read_csv(fasttext_folder / 'predict_result.txt', sep=" ", header=None)
result['text'] = predict_df['text']

# Keep the rows predicted as label 1, highest probability first:
is_label_1 = result[0] == '__label__1'
result = result[is_label_1].sort_values(by=1, ascending=False)

fasttext_csv = results_folder / 'fasttext_result.csv'
result.to_csv(fasttext_csv, sep="\t", index=False, encoding='utf-8')
files.download(fasttext_csv)


The model performs poorly: we have a lot of false positives. It seems like it does not understand the negation in relation to remembering; it identifies many sentences that just contain negations, particularly the word 'nicht'. So at best, this model is only useful to prefilter the data. Yet it was able to capture some expressions using words that were not reflected in the training data, for example: 

*   'Mir ist der Name Haydt auch heute nicht mehr geläufig'
*   'Darüber habe ich keine Kenntnis mehr'

### 3.2.3 BERT

In [None]:
!git clone https://github.com/google-research/bert.git 03_semantic_matching/03_bert

In [None]:
# Let's download the pretrained multilingual model:
!wget -P 03_semantic_matching/03_bert https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip 03_semantic_matching/03_bert/multi_cased_L-12_H-768_A-12.zip -d 03_semantic_matching/03_bert/

In [None]:
bert_folder = sem_folder / '03_bert'
data_folder = bert_folder / 'data'
output_folder = bert_folder / 'bert_output'
for folder in (bert_folder, data_folder, output_folder):
    folder.mkdir(exist_ok=True)

# Prepare data: train and dev files are tab-separated without header row,
# with the columns row index, label, a constant filler column and text.
column_order = ['label', 'alpha', 'text']

train_df = train_df.assign(alpha='a').reindex(column_order, axis=1)
train_df.index.names = ['user_id']
train_df.to_csv(data_folder / 'train.tsv', sep='\t', index=True, header=False)

eval_df = eval_df.assign(alpha='a').reindex(column_order, axis=1)
eval_df.index.names = ['user_id']
eval_df.to_csv(data_folder / 'dev.tsv', sep="\t", index=True, header=False)

# The prediction set has no label column and is written with a header row:
predict_df = predict_df.drop(columns='label')
predict_df.to_csv(data_folder / 'test.tsv', sep='\t', index=True, header=True)

In [None]:
%tensorflow_version 1.x

In [None]:
# Let's fine-tune the bert model. This will take some time (41mins for me)
%%time
!python 03_semantic_matching/03_bert/run_classifier.py \
--task_name=cola \
--do_train=true \
--do_eval=true \
--do_predict=true \
--data_dir=03_semantic_matching/03_bert/data/ \
--vocab_file=03_semantic_matching/03_bert/multi_cased_L-12_H-768_A-12/vocab.txt \
--bert_config_file=03_semantic_matching/03_bert/multi_cased_L-12_H-768_A-12/bert_config.json \
--init_checkpoint=03_semantic_matching/03_bert/multi_cased_L-12_H-768_A-12/bert_model.ckpt \
--max_seq_length=400 \
--train_batch_size=6 \
--learning_rate=2e-5 \
--num_train_epochs=3.0 \
--output_dir=03_semantic_matching/03_bert/bert_output/ \
--do_lower_case=False

In [None]:
!cat 03_semantic_matching/03_bert/bert_output/eval_results.txt

In [None]:
# Let's look at the results, with the sentence each prediction belongs to:
result = pd.read_csv('03_semantic_matching/03_bert/bert_output/test_results.tsv', sep="\t", header=None)
result['text'] = predict_df['text']

# Highest value in column 1 (label 1) first:
result = result.sort_values(by=1, ascending=False)

# Shape of the subset with a value above 0.9 in column 1:
above_threshold = result[1] > 0.9
print(result.loc[above_threshold].shape)

bert_tsv = results_folder / 'bert_result.tsv'
result.to_csv(bert_tsv, sep="\t", index=False, encoding='utf-8')
files.download(bert_tsv)