# Preprocessing with BERT models

## Installing missing libraries

In [None]:
!pip install transformers

In [None]:
# general libraries
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# bert related libraries
import torch
import tensorflow as tf
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, AutoModelWithLMHead

## Reading data from Google Drive

In [None]:
# Root folder on the mounted Google Drive; the data files are read from and
# written to its 'data' sub-folder.
baseDir = '/content/gdrive/My Drive/Colab Notebooks/AA'

In [None]:
# Mount Google Drive so that the paths under baseDir become readable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the combined dataset (one JSON record per document).
# NOTE(review): the file is named *.zip but is read as gzip — presumably it
# was written with compression='gzip'; confirm before renaming anything.
df = pd.read_json(baseDir+'/data/AllDS.json.zip', orient='records', compression='gzip')

In [None]:
# Preview the first three rows to sanity-check the load.
df.head(3)

Unnamed: 0,filename,label,language,problem,set,text,dataset
0,known00001.txt,candidate00001,en,problem00001,known,"graceful ones.\n\n""One more,"" Marvelous said, ...",pan18_train
1,known00002.txt,candidate00001,en,problem00001,known,"before. If he can, he’ll remember a classmate ...",pan18_train
2,known00003.txt,candidate00001,en,problem00001,known,she thought - he was in Team Baron only becaus...,pan18_train


In [None]:
# A positional row id lets computed results be merged back onto the rows
# they were derived from.
df['row_index'] = np.arange(df.shape[0])
df.head(10)

Unnamed: 0,filename,label,language,problem,set,text,dataset,row_index
0,known00001.txt,candidate00001,en,problem00001,known,"graceful ones.\n\n""One more,"" Marvelous said, ...",pan18_train,0
1,known00002.txt,candidate00001,en,problem00001,known,"before. If he can, he’ll remember a classmate ...",pan18_train,1
2,known00003.txt,candidate00001,en,problem00001,known,she thought - he was in Team Baron only becaus...,pan18_train,2
3,known00007.txt,candidate00001,en,problem00001,known,"As far as she remembers, she's always hated pr...",pan18_train,3
4,known00006.txt,candidate00001,en,problem00001,known,"“Wait for me, please!”\n\nShe glanced towards ...",pan18_train,4
5,known00004.txt,candidate00001,en,problem00001,known,Zawame City’s no longer ‘home’ as it once was....,pan18_train,5
6,known00005.txt,candidate00001,en,problem00001,known,1. He showered her with smiles the first time ...,pan18_train,6
7,known00001.txt,candidate00002,en,problem00001,known,"love?"" The words should have been scalding, bu...",pan18_train,7
8,known00002.txt,candidate00002,en,problem00001,known,"'Asha Belarl'an', just as you are?""\n\n""Kalli,...",pan18_train,8
9,known00003.txt,candidate00002,en,problem00001,known,"EinsOne night, in a sleazy bar on Coridan, an ...",pan18_train,9


## BERT code

In [None]:
# Pretrained checkpoint to load for each language code found in df['language'].
# The 'all' entry names a multilingual checkpoint.
bert_config = {
    "en": "bert-base-cased",
    "pt": "neuralmind/bert-large-portuguese-cased",
    "fr": "dbmdz/bert-base-french-europeana-cased",
    "sp": "dccuchile/bert-base-spanish-wwm-cased",
    "it": "dbmdz/bert-base-italian-cased",
    "pl": "dkleczek/bert-base-polish-cased-v1",
    "all": "bert-base-multilingual-cased",
}

## Generating token statistics to check whether 512 tokens are enough

In [None]:
# For each language, tokenize every text with that language's tokenizer and
# summarise the sequence lengths (the leading '[CLS]' is included in the count).
for lang in df['language'].unique():
    model_name = bert_config[lang]
    print("processing %s  - %s" % (lang, model_name))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    lang_texts = df.loc[df['language'] == lang, 'text']
    token_lengths = [1 + len(tokenizer.tokenize(text)) for text in lang_texts]

    length_stats = pd.DataFrame({'token_len': token_lengths}).describe(
        percentiles=[0.25, 0.5, 0.75, 0.8, 0.9, 0.99, 1])
    display(length_stats.T)
    

processing en  - bert-base-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,99%,100%,max
token_len,24657.0,141.943059,214.447691,5.0,33.0,77.0,153.0,164.0,302.4,1117.0,2359.0,2359.0


processing fr  - dbmdz/bert-base-french-europeana-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=420.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227141.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=83.0, style=ProgressStyle(description_w…




Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,99%,100%,max
token_len,833.0,1117.414166,78.090262,727.0,1104.0,1125.0,1153.0,1164.6,1186.0,1270.0,1315.0,1315.0


processing it  - dbmdz/bert-base-italian-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=235127.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=59.0, style=ProgressStyle(description_w…




Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,99%,100%,max
token_len,888.0,1124.912162,120.515658,671.0,1122.0,1157.0,1193.0,1201.0,1221.0,1268.0,1296.0,1296.0


processing pl  - dkleczek/bert-base-polish-cased-v1


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=459.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=489360.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=30.0, style=ProgressStyle(description_w…




Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,99%,100%,max
token_len,943.0,1171.708378,110.708655,703.0,1160.0,1196.0,1232.0,1242.0,1263.0,1317.0,1337.0,1337.0


processing sp  - dccuchile/bert-base-spanish-wwm-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242120.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,99%,100%,max
token_len,1037.0,1131.565092,117.715861,647.0,1119.0,1153.0,1185.0,1193.0,1220.0,1314.28,1503.0,1503.0


processing pt  - neuralmind/bert-large-portuguese-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=648.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=209528.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=155.0, style=ProgressStyle(description_…




Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,99%,100%,max
token_len,2608.0,297.72661,167.024843,140.0,215.0,260.0,327.0,346.6,419.6,989.24,2324.0,2324.0


In [None]:
def processDF(df, textToIdFunction):
    """
    Compute one BERT vector per tokenized text, language by language.

    For every distinct value of ``df['language']`` the tokenizer and model
    named in ``bert_config`` are loaded, the texts of that language are turned
    into token sequences by ``textToIdFunction``, padded/truncated to 512 ids
    and run through the model in batches of 32. For each sequence the last
    layer's hidden state at position 0 is kept (the '[CLS]' position when
    ``textToIdFunction`` prepends that token, as ``textToIdCrop`` does).

    df: DataFrame with 'language', 'text' and 'row_index' columns.
    textToIdFunction: callable ``(tokenizer, texts, row_index, max_tokens)``
        returning ``(tokens, row_index)`` where ``tokens`` is a list of token
        lists and ``row_index`` identifies the source row of each token list.

    returns: dict with the keys 'row_index', 'lang' and 'vectors'; each value
        is a list holding one entry per processed language.
    """
    MAX_TOKENS = 512
    BATCH_SIZE = 32
    # Fall back to the CPU instead of crashing when no GPU is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    bert_results = {
        'row_index': [],
        'lang': [],
        'vectors': [],
    }
    for lang in df['language'].unique():
        print(f"processing {lang}  - {bert_config[lang]}")

        tokenizer = AutoTokenizer.from_pretrained(bert_config[lang])
        model = AutoModel.from_pretrained(bert_config[lang]).to(device)
        model.eval()

        subset_df = df[df['language'] == lang]
        print(" %d documents found." % len(subset_df))
        texts = subset_df['text']
        row_index = subset_df['row_index']

        # tokenizing
        tokens, row_index = textToIdFunction(tokenizer, texts, row_index, MAX_TOKENS)
        tokens_ids = list(map(tokenizer.convert_tokens_to_ids, tokens))

        # Real (non-padding) length of every sequence after truncation; used
        # to build the attention mask so padding is ignored by the model.
        lengths = torch.tensor(
            [min(len(ids), MAX_TOKENS) for ids in tokens_ids], dtype=torch.long)

        # Id 0 is not guaranteed to be the padding token in every vocabulary,
        # so ask the tokenizer and only fall back to 0 when it defines none.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0
        padded_ids = tf.keras.preprocessing.sequence.pad_sequences(
            tokens_ids, maxlen=MAX_TOKENS, truncating='post', padding='post',
            dtype='int64', value=pad_id)

        input_ids = torch.tensor(padded_ids)
        attention_mask = (
            torch.arange(MAX_TOKENS)[None, :] < lengths[:, None]).long()
        dataloader = DataLoader(
            TensorDataset(input_ids, attention_mask), batch_size=BATCH_SIZE)

        results = []
        with torch.no_grad():
            for batch_ids, batch_mask in tqdm(dataloader):
                output = model(batch_ids.to(device),
                               attention_mask=batch_mask.to(device))
                # output[0] is the last hidden state. Position 0 is the first
                # token of every sequence; position -1 would be a padding
                # slot for any text shorter than MAX_TOKENS.
                results.append(output[0][:, 0, :].cpu())
        results = torch.cat(results).numpy()

        bert_results['row_index'].append(row_index)
        bert_results['lang'].append([lang] * len(row_index))
        bert_results['vectors'].append(results)

        # Release this language's model before the next one is loaded.
        del model
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    return bert_results

In [None]:
def textToIdCrop(tokenizer, texts, row_index,max_tokens=512):
    """
    Tokenize each text and keep only its beginning.

    Every text becomes '[CLS]' followed by at most ``max_tokens - 1`` tokens
    produced by ``tokenizer.tokenize``. ``row_index`` is returned unchanged,
    since each text yields exactly one token list.
    """
    keep = max_tokens - 1
    cropped = []
    for text in texts:
        pieces = tokenizer.tokenize(text)
        cropped.append(['[CLS]'] + pieces[:keep])
    return cropped, row_index

## Computing BERT vectors from the first 512 tokens of each text

In [None]:
# Run every document through its language's model, using textToIdCrop so that
# only the beginning of each text (up to 512 tokens including '[CLS]') is used.
bert_results = processDF(df,textToIdCrop)

processing en  - bert-base-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…


 24657 documents found.


100%|██████████| 771/771 [14:31<00:00,  1.13s/it]


processing fr  - dbmdz/bert-base-french-europeana-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445018784.0, style=ProgressStyle(descri…


 833 documents found.


100%|██████████| 27/27 [00:30<00:00,  1.11s/it]


processing it  - dbmdz/bert-base-italian-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442256004.0, style=ProgressStyle(descri…


 888 documents found.


100%|██████████| 28/28 [00:32<00:00,  1.16s/it]


processing pl  - dkleczek/bert-base-polish-cased-v1


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=531146786.0, style=ProgressStyle(descri…


 943 documents found.


100%|██████████| 30/30 [00:34<00:00,  1.15s/it]


processing sp  - dccuchile/bert-base-spanish-wwm-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=441944381.0, style=ProgressStyle(descri…


 1037 documents found.


100%|██████████| 33/33 [00:38<00:00,  1.15s/it]


processing pt  - neuralmind/bert-large-portuguese-cased


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1342014951.0, style=ProgressStyle(descr…


 2608 documents found.


100%|██████████| 82/82 [04:58<00:00,  3.64s/it]


In [None]:
# Flatten the per-language lists into one entry per embedded text.
flat_row_index = [idx for lang_rows in bert_results['row_index'] for idx in lang_rows]
flat_vectors = [vec for lang_vectors in bert_results['vectors'] for vec in lang_vectors]
bert_results2 = pd.DataFrame({'row_index': flat_row_index, 'BERT_vector': flat_vectors})

# Attach each vector to its source row, then drop the helper join key.
df_with_bert = df.merge(bert_results2, on='row_index').drop(columns=['row_index'])

# Persist the enriched dataset (gzip-compressed JSON records) and show a sample.
df_with_bert.to_json(baseDir+'/data/AllDS_BERT.json.zip', orient='records', compression='gzip')
print(len(df_with_bert))
df_with_bert.head(100)

30966


Unnamed: 0,filename,label,language,problem,set,text,dataset,BERT_vector
0,known00001.txt,candidate00001,en,problem00001,known,"graceful ones.\n\n""One more,"" Marvelous said, ...",pan18_train,"[0.4001088, -0.98967713, -0.28495646, -0.51626..."
1,known00002.txt,candidate00001,en,problem00001,known,"before. If he can, he’ll remember a classmate ...",pan18_train,"[-0.10366619, -0.21127939, -0.17623094, -0.178..."
2,known00003.txt,candidate00001,en,problem00001,known,she thought - he was in Team Baron only becaus...,pan18_train,"[0.15406814, -0.31223914, -0.023384072, -0.407..."
3,known00007.txt,candidate00001,en,problem00001,known,"As far as she remembers, she's always hated pr...",pan18_train,"[-0.013924662, -0.40364963, -0.30195427, 0.300..."
4,known00006.txt,candidate00001,en,problem00001,known,"“Wait for me, please!”\n\nShe glanced towards ...",pan18_train,"[-0.16476764, -0.25846422, 0.10318206, 0.09672..."
...,...,...,...,...,...,...,...,...
95,known00006.txt,candidate00014,en,problem00001,known,"it,"" he said. ""Polyurethane. I'm allergic to l...",pan18_train,"[-0.19633922, -0.7409366, -0.14413436, -0.4283..."
96,known00004.txt,candidate00014,en,problem00001,known,"word for it; getting drunk here, surrounded by...",pan18_train,"[-0.026008733, -0.32210076, 0.34522638, -0.637..."
97,known00005.txt,candidate00014,en,problem00001,known,you don't get it. I don't want it 'cleared up....,pan18_train,"[-0.08583247, -0.743303, -0.35682654, 0.018472..."
98,known00001.txt,candidate00015,en,problem00001,known,"the small reading table, and join Joscelin at ...",pan18_train,"[0.7618558, -0.528057, -0.5821774, 0.07313047,..."
