# Clean the data after correcting

## 1. Train a tokenizer with the cleaned corpus

The output folder will be the same, so the previous tokenizer will be overwritten.

In [1]:
import pandas as pd
from transformers import AutoTokenizer
import json
import nltk
# Fetch the "punkt" data package for NLTK; sentence splitting with nltk.sent_tokenize
# further down presumably depends on it (no-op when the package is already up to date).
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Tokenizer training configuration.
VOCAB_SIZE = 52000            # target vocabulary size of the retrained tokenizer
TRAINING_BATCH_SIZE = 1000    # number of texts handed to the trainer per batch
HF_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-cased"

# The pretrained tokenizer supplies the algorithm/normalisation; only the vocabulary is relearned.
pretrained_tokenizer = AutoTokenizer.from_pretrained(HF_CHECKPOINT)
df = pd.read_csv("../data/old-spanish-corpus-cleaned.tsv", sep='\t')

# Lazily yield the corpus in batches of TRAINING_BATCH_SIZE texts.
# Slicing is positional (.iloc, end-exclusive) on purpose: a label slice with
# .loc includes BOTH endpoints, which would produce batches of
# TRAINING_BATCH_SIZE + 1 rows and feed every boundary row to the trainer twice.
corpus_texts = df["text"].astype(str)
training_corpus = (
    corpus_texts.iloc[i:i + TRAINING_BATCH_SIZE]
    for i in range(0, len(corpus_texts), TRAINING_BATCH_SIZE)
)

tokenizer = pretrained_tokenizer.train_new_from_iterator(training_corpus, VOCAB_SIZE)

  df = pd.read_csv("../data/old-spanish-corpus-cleaned.tsv", sep='\t')


In [3]:
# Tokenize one sample text with both tokenizers to eyeball the difference,
# then persist the retrained tokenizer (the returned file paths are the cell output).
example = df.loc[5, "text"]
for label, tok in (("BEFORE:", pretrained_tokenizer), ("AFTER:", tokenizer)):
    print(label, tok.tokenize(example)[:20])
tokenizer.save_pretrained("../output/tokenizer")

BEFORE: ['CON', '##QU', '##ISTA', '4', '*', '*', 'cu', '##tar', 'lo', 'que', 'fuese', 'de', 'su', 'mayor', 'agrado', ',', 'sin', 'dis', '"', 'curr']
AFTER: ['CONQUISTA', '4', '*', '*', 'cu', '##tar', 'lo', 'que', 'fuese', 'de', 'su', 'mayor', 'agrado', ',', 'sin', 'dis', '"', 'cur', '##rir', 'en']


('../output/tokenizer\\tokenizer_config.json',
 '../output/tokenizer\\special_tokens_map.json',
 '../output/tokenizer\\vocab.txt',
 '../output/tokenizer\\added_tokens.json',
 '../output/tokenizer\\tokenizer.json')

## 2. Chunk the texts that have more than 512 tokens

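512 is the maximum number of tokens that can be processed by BERT-like models. To leave headroom for other tokenizers, the texts are actually split at half of that limit (256 tokens).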

### 1. Chunk all the texts that have more than 512 tokens

In [4]:
MAX_TOKENS_LENGTH = 256 # 512/2, a number much below 512 so that it will still fit for different tokenizers

def chunk(text):
    """Split ``text`` into consecutive chunks of whole sentences.

    Sentences are accumulated greedily: a sentence is appended to the current
    chunk as long as the tokenized length of the joined text stays within
    ``MAX_TOKENS_LENGTH - 2``; otherwise the current chunk is closed and the
    sentence starts a new one. Sentences are never split, so a single sentence
    that is longer than the budget ends up as an over-long chunk of its own.

    Args:
        text: The text to split.

    Returns:
        The list of chunk strings in original order. An empty list is returned
        when no sentence can be extracted from ``text`` (e.g. empty or
        whitespace-only input).
    """
    sentences = nltk.sent_tokenize(text) # text.split('.'), but enhanced
    if not sentences:
        # Nothing to chunk; indexing sentences[0] below would raise IndexError.
        return []

    chunks = []
    current_chunk = sentences[0]
    for sentence in sentences[1:]:
        # Re-tokenize the joined candidate rather than summing per-sentence counts.
        # NOTE(review): input_ids presumably already include the special tokens,
        # so the "- 2" below acts as extra headroom - confirm.
        new_chunk_tks = len(tokenizer(f"{current_chunk} {sentence}")['input_ids'])
        if (new_chunk_tks) > (MAX_TOKENS_LENGTH-2):
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk = f"{current_chunk} {sentence}"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [5]:
# Map the position of every over-long text to the list of chunks that replaces it.
# Texts within the MAX_TOKENS_LENGTH budget are left out of the mapping.
idx_max = {}
for i, sentence in enumerate(df['text'].astype(str)):
    if len(tokenizer(sentence)['input_ids']) > MAX_TOKENS_LENGTH:
        idx_max[i] = chunk(sentence)
    # Progress report every 100k texts.
    if not i % 100000:
        print(f"Chunked {i}/{len(df)} ({i/len(df):.2%})")

print("Finished 100%")

Chunked 0/1233487 (0.00%)


Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors


Chunked 100000/1233487 (8.11%)
Chunked 200000/1233487 (16.21%)
Chunked 300000/1233487 (24.32%)
Chunked 400000/1233487 (32.43%)
Chunked 500000/1233487 (40.54%)
Chunked 600000/1233487 (48.64%)
Chunked 700000/1233487 (56.75%)
Chunked 800000/1233487 (64.86%)
Chunked 900000/1233487 (72.96%)
Chunked 1000000/1233487 (81.07%)
Chunked 1100000/1233487 (89.18%)
Chunked 1200000/1233487 (97.29%)
Finished 100%


In [6]:
# Sanity check: the first chunk of the first chunked text must start exactly like the original text.
first_chunked_idx = list(idx_max)[0]
original_prefix = df.loc[first_chunked_idx, "text"][:10]
chunk_prefix = idx_max[first_chunked_idx][0][:10]
assert original_prefix == chunk_prefix, "Texts do not coincide"

In [7]:
# Persist the {row position: chunks} mapping so the chunking step does not have to be re-run.
# JSON object keys are always strings, so the integer keys come back as str when reloaded.
with open("../data/chunkedTexts.json", "w") as chunks_out:
    json.dump(idx_max, fp=chunks_out, indent=4)

### 2. Update the original DataFrame with the chunked texts

In [8]:
# Reload the saved mapping and turn its string keys back into integer row positions.
with open("../data/chunkedTexts.json", "r") as chunks_in:
    loaded_chunks = json.load(chunks_in)

idx_max = {}
for key, pieces in loaded_chunks.items():
    idx_max[int(key)] = pieces

In [9]:
# A text split into k chunks turns one row into k rows, i.e. adds k - 1 rows.
original_length = len(df)
extra_rows = sum(len(pieces) - 1 for pieces in idx_max.values())
updated_length = original_length + extra_rows
print(f"\nAdded {updated_length - original_length} chunks ({updated_length / original_length - 1:.2%})")


Added 485510 chunks (39.36%)


In [10]:
# Rebuild the corpus row by row: a chunked text contributes one row per chunk
# (numbered from 0 via chunk_id); every other row is kept as is with chunk_id 0.
finaldf = []
for idx, row in df.iterrows():
    pieces = idx_max.get(idx)
    if pieces is None:
        row['chunk_id'] = 0
        finaldf.append(row)
    else:
        for cI, t in enumerate(pieces):
            # Copy so each chunk gets its own row carrying the original metadata.
            new_row = row.copy()
            new_row['text'] = t
            new_row['chunk_id'] = cI
            finaldf.append(new_row)
    # Progress report every 100k source rows.
    if not idx % 100000:
        print(f"Added {len(finaldf)}/{updated_length} ({len(finaldf)/updated_length:.2%})")

print("Finished 100%")
assert len(finaldf) == updated_length, f"Lengths must match, expected length to be {updated_length}, but was {len(finaldf)}"

Added 1/1718997 (0.00%)
Added 282168/1718997 (16.41%)
Added 528719/1718997 (30.76%)
Added 741572/1718997 (43.14%)
Added 844939/1718997 (49.15%)
Added 946264/1718997 (55.05%)
Added 1053346/1718997 (61.28%)
Added 1155481/1718997 (67.22%)
Added 1259787/1718997 (73.29%)
Added 1368137/1718997 (79.59%)
Added 1474160/1718997 (85.76%)
Added 1579286/1718997 (91.87%)
Added 1684382/1718997 (97.99%)
Finished 100%


In [11]:
# Assemble the collected rows into a DataFrame, placing the new "chunk_id"
# column at position 3 of the original column order, and renumber the index.
newcols = df.columns.tolist()
newcols.insert(3, "chunk_id")
finaldf = pd.DataFrame(finaldf, columns=newcols).reset_index(drop=True)

finaldf

Unnamed: 0,source,source_id,source_text_id,chunk_id,title,date,place,text
0,The British Library,3436138,9,0,"Historia de la conquista de México, etc [With ...",1809,London,"HISTORIA DE LA CONQUISTA PE MÉXICO, POBLACIÓN ..."
1,The British Library,3436138,10,0,"Historia de la conquista de México, etc [With ...",1809,London,"This Work, as well as LAS FÁBULAS LITERARIAS, ..."
2,The British Library,3436138,11,0,"Historia de la conquista de México, etc [With ...",1809,London,"HISTORIA De la Conquista, población y Progreso..."
3,The British Library,3436138,12,0,"Historia de la conquista de México, etc [With ...",1809,London,"CONQUISTA quartel observando la batalla, y rec..."
4,The British Library,3436138,12,1,"Historia de la conquista de México, etc [With ...",1809,London,Pon deró con afectada seguridad el atrevimient...
...,...,...,...,...,...,...,...,...
1718992,Project Gutenberg,53294,916,0,Recuerdos Del Tiempo Viejo,1817-1893,?,Pero amo á Barcelona por tiranía de ley in...
1718993,Project Gutenberg,53294,917,0,Recuerdos Del Tiempo Viejo,1817-1893,?,Barcelona trabaja... y á su existencia el ...
1718994,Project Gutenberg,53294,919,0,Recuerdos Del Tiempo Viejo,1817-1893,?,Olvidaba que entre ambas hay diferencia: n...
1718995,Project Gutenberg,53294,920,0,Recuerdos Del Tiempo Viejo,1817-1893,?,La diferencia es esta: pero es preciso que...


In [12]:
# Write the chunked corpus as a tab-separated file, without the positional index.
finaldf.to_csv(
    "../data/old-spanish-corpus-chunked.tsv",
    sep="\t",
    index=False,
)

In [13]:
# Corpus-wide totals: tokens according to the retrained tokenizer, words by whitespace splitting.
final_texts = finaldf['text']
token_counts = final_texts.apply(lambda x: len(tokenizer.tokenize(x)))
word_counts = final_texts.apply(lambda x: len(x.split()))
total_tokens = token_counts.sum()
total_words = word_counts.sum()

In [14]:
# Summarize the chunked dataset: total tokens, total words and number of texts (thousands-separated).
print(f"New dataset has:\n\ttokens: {total_tokens:,}\n\twords: {total_words:,}\n\ttexts: {len(finaldf):,}")

New dataset has:
	tokens: 193,828,170
	words: 149,299,085
	texts: 1,718,997
