# Cleaning and Filtering step

In [1]:
import pandas as pd
from transformers import AutoTokenizer
import re

  from .autonotebook import tqdm as notebook_tqdm


## 0. Load the original data

In [2]:
df = pd.read_csv("../data/old-spanish-corpus.tsv", sep="\t")
current_len, original_len = len(df), len(df)
df

  df = pd.read_csv("../data/old-spanish-corpus.tsv", sep="\t")


Unnamed: 0,source,source_id,source_text_id,title,date,place,text
0,The British Library,3436138,9,"Historia de la conquista de México, etc [With ...",1809,London,"HISTORIA DE LA CONQUISTA PE MÉXICO, POBLACIÓN ..."
1,The British Library,3436138,10,"Historia de la conquista de México, etc [With ...",1809,London,"This Work, as well as LAS FÁBULAS LITERARIAS, ..."
2,The British Library,3436138,11,"Historia de la conquista de México, etc [With ...",1809,London,"HISTORIA De la Conquista, población y Progreso..."
3,The British Library,3436138,12,"Historia de la conquista de México, etc [With ...",1809,London,"CONQUISTA quartel observando la batalla, y rec..."
4,The British Library,3436138,13,"Historia de la conquista de México, etc [With ...",1809,London,"DE NUEVA ESPAÑA. 3 "" pretexto á los sediciosos..."
...,...,...,...,...,...,...,...
715423,Project Gutenberg,53294,917,Recuerdos Del Tiempo Viejo,1817-1893,?,Barcelona trabaja... y á su existencia el ...
715424,Project Gutenberg,53294,918,Recuerdos Del Tiempo Viejo,1817-1893,?,VIII.
715425,Project Gutenberg,53294,919,Recuerdos Del Tiempo Viejo,1817-1893,?,Olvidaba que entre ambas hay diferencia: n...
715426,Project Gutenberg,53294,920,Recuerdos Del Tiempo Viejo,1817-1893,?,La diferencia es esta: pero es preciso que...


## 1. Remove duplicates and empty texts

In [3]:
to_remove = df[df.duplicated(subset=["text"], keep=False)]
df = df.drop_duplicates(subset=["text"], keep="first")
df = df.reset_index(drop=True)
print(f"Removed {current_len - len(df)} rows [{(current_len - len(df))/original_len:.2%}]")
current_len = len(df)
to_remove.tail(5)

Removed 24045 rows [3.36%]


Unnamed: 0,source,source_id,source_text_id,title,date,place,text
715401,Project Gutenberg,53294,895,Recuerdos Del Tiempo Viejo,1817-1893,?,IV.
715409,Project Gutenberg,53294,903,Recuerdos Del Tiempo Viejo,1817-1893,?,V.
715418,Project Gutenberg,53294,912,Recuerdos Del Tiempo Viejo,1817-1893,?,VI.
715420,Project Gutenberg,53294,914,Recuerdos Del Tiempo Viejo,1817-1893,?,VII.
715424,Project Gutenberg,53294,918,Recuerdos Del Tiempo Viejo,1817-1893,?,VIII.


In [4]:
to_remove = df[df['text'].astype(str).str.len() == 0]
df = df[df['text'].astype(str).str.len() > 0]
df = df.reset_index(drop=True)
print(f"Removed {current_len - len(df)} rows [{(current_len - len(df))/original_len:.2%}]")
current_len = len(df)
to_remove.tail(5)

Removed 0 rows [0.00%]


Unnamed: 0,source,source_id,source_text_id,title,date,place,text


## 2. Remove rows where 50%+ of the characters are not letters

In [5]:
def useful_chars(string):
    return re.sub(r'[^a-zA-ZÀ-ÿ]', '', string)

In [6]:
idx_min = []
for i, sentence in enumerate(df['text'].astype(str)):
    sentence = re.sub(r'\s+', ' ', sentence.strip())
    size = len(sentence)
    letters_pctg = (size-len(useful_chars(sentence)))/size
    if letters_pctg > 0.5:
        idx_min.append(i)

print("\n".join([df.loc[i, "text"] for i in idx_min]))

205 APÉNDICE I* ÍLSTÁDO GENERAL DE LA FOBLACION DEL REYNO DE GALICIA EN EL AÑO DE 1797. POBLACIONES, Ciuda' Villas. Feligre- Alde- Luga-Cotos re- Corregí- Casas Qasas arrui- des. sias. as. res. donaos, mietitos. útiles, nadas. ■1 > 3. ... 14. ... 61 1 43, . . 9 8. . . 46244. . . 6806. » d^nba~ \ 25 610 68. . . 38. . . 39803. . . 5762. » Seño- - rio I Eclesi- í 4 20. . . . 560 108. . . 14 r . . 48996. . . 9185. « astieo. I ' , s. 46. . . . 297. . . 7. . . 152. . . 44. 59367* • • 77I{^ ,f De Or- 1 denes. ( ' T 27 * r •- • 6499 1057. » Total. ". 106. 2105. 7. 371. 116. 8. 200909. 30520. *•» ■■!■ ¡ y^^m^mmt „ .. 1 BS5BB5C55K ' .'- - *"*T y *""T — SSSSSCj ' 8SCBC3 ' ™— "T"TWCBS^^^3 OFICINAS PÚBLICAS. Consistoriales. Cárceles. Pósitos. Juegos. Teatros. Pesos Reales. 79- 3^r- 19- 2^ $. 6. OFICINAS PARTICULARES. Mata- Carni- pesca- Taber- Casas de Boti- Posa- Moli* Bata- Teñe- Molinos deros. celias, derias. ñas. Comercio. Herías, das. nos. nes. rias. de papel. 24. 182. 2i. 3829 33 3 ¿95 8278 1

In [7]:
df.drop(idx_min, inplace=True)
df = df.reset_index(drop=True)
print(f"Removed {current_len - len(df)} rows [{(current_len - len(df))/original_len:.2%}]")
current_len = len(df)

Removed 7164 rows [1.00%]


## 3. Remove the rows that have too few tokens

The paremeter chosen was `6` tokens, including the [CLS] and [SEP] tokens, so the rows to remove really have less than `4` tokens

### 3.1. Train a new tokenizer, based on the original dataset

This is particularly useful for then removing the rows with very few tokens, which are likely to be noise.

In [8]:
VOCAB_SIZE = 52000
TRAINING_BATCH_SIZE = 1000
HF_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-cased"

pretrained_tokenizer = AutoTokenizer.from_pretrained(HF_CHECKPOINT)

training_corpus = (
    df.loc[i:i+TRAINING_BATCH_SIZE, "text"].astype(str) 
    for i in range(0, len(df), TRAINING_BATCH_SIZE)
)

tokenizer = pretrained_tokenizer.train_new_from_iterator(training_corpus, VOCAB_SIZE)

In [9]:
example = df.loc[5, "text"]
print("BEFORE:", pretrained_tokenizer.tokenize(example)[:20])
print("AFTER:", tokenizer.tokenize(example)[:20])
tokenizer.save_pretrained("../output/tokenizer")

BEFORE: ['CON', '##QU', '##ISTA', '4', '*', '*', 'cu', '##tar', 'lo', 'que', 'fuese', 'de', 'su', 'mayor', 'agrado', ',', 'sin', 'dis', '"', 'curr']
AFTER: ['CONQUISTA', '4', '*', '*', 'cu', '##tar', 'lo', 'que', 'fuese', 'de', 'su', 'mayor', 'agrado', ',', 'sin', 'dis', '"', 'currir', 'en', 'los']


('../output/tokenizer\\tokenizer_config.json',
 '../output/tokenizer\\special_tokens_map.json',
 '../output/tokenizer\\vocab.txt',
 '../output/tokenizer\\added_tokens.json',
 '../output/tokenizer\\tokenizer.json')

### 3.2. Actually find and remove the rows

In [10]:
MIN_TOKENS_LENGTH = 6 # 4 tokens: <[CLS] token1 token2 token3 token4 [SEP]>

idx_min = []
for i, sentence in enumerate(df['text'].astype(str)):
    num_tokens = len(tokenizer(sentence)['input_ids'])
    if num_tokens < MIN_TOKENS_LENGTH:
        print(f"Removed '{sentence}' [{i}, {num_tokens} tokens]")
        idx_min.append(i)

Token indices sequence length is longer than the specified maximum sequence length for this model (692 > 512). Running this sequence through the model will result in indexing errors


Removed 'APÉNDICE' [8956, 3 tokens]
Removed 'I' [11108, 3 tokens]
Removed 'DE' [11510, 3 tokens]
Removed 'APÉNDICE.' [15593, 4 tokens]
Removed 'SAN FELIPE.' [42263, 5 tokens]
Removed 'MONTEVIDEO.' [43229, 4 tokens]
Removed 'Un tribunal civil' [50283, 5 tokens]
Removed 'DOCUMENTOS.' [62063, 4 tokens]
Removed 'MÉJICO' [62966, 3 tokens]
Removed 'SOLÓN' [63294, 4 tokens]
Removed 'HOMERO' [63351, 5 tokens]
Removed 'ITALIA.' [63930, 4 tokens]
Removed 'PARTE SEGUNDA.' [68124, 5 tokens]
Removed 'v*' [71102, 4 tokens]
Removed 'HISTÓRICO.' [72856, 4 tokens]
Removed 'Vista de Ferrol' [77442, 5 tokens]
Removed 'del autor.' [84004, 5 tokens]
Removed 'El Emperador Maximiliano' [84344, 5 tokens]
Removed 'Benito Juárez.' [84905, 5 tokens]
Removed 'Pió IX' [85021, 4 tokens]
Removed 'LEÓN.' [87323, 5 tokens]
Removed 'CONQUISTA ESPAÑOLA,' [88172, 5 tokens]
Removed 'DOMINACIÓN ESPAÑOLA,' [88255, 5 tokens]
Removed 'INDEPENDENCIA.' [88600, 4 tokens]
Removed 'geografía física' [89053, 4 tokens]
Removed 'geog

In [11]:
df.drop(idx_min, inplace=True)
df = df.reset_index(drop=True)
print(f"Removed {current_len - len(df)} rows [{(current_len - len(df))/original_len:.2%}]")
current_len = len(df)

Removed 3272 rows [0.46%]


In [12]:
print(f"Final rows: {len(df)}")
df

Final rows: 680947


Unnamed: 0,source,source_id,source_text_id,title,date,place,text
0,The British Library,3436138,9,"Historia de la conquista de México, etc [With ...",1809,London,"HISTORIA DE LA CONQUISTA PE MÉXICO, POBLACIÓN ..."
1,The British Library,3436138,10,"Historia de la conquista de México, etc [With ...",1809,London,"This Work, as well as LAS FÁBULAS LITERARIAS, ..."
2,The British Library,3436138,11,"Historia de la conquista de México, etc [With ...",1809,London,"HISTORIA De la Conquista, población y Progreso..."
3,The British Library,3436138,12,"Historia de la conquista de México, etc [With ...",1809,London,"CONQUISTA quartel observando la batalla, y rec..."
4,The British Library,3436138,13,"Historia de la conquista de México, etc [With ...",1809,London,"DE NUEVA ESPAÑA. 3 "" pretexto á los sediciosos..."
...,...,...,...,...,...,...,...
680942,Project Gutenberg,53294,916,Recuerdos Del Tiempo Viejo,1817-1893,?,Pero amo á Barcelona por tiranía de ley in...
680943,Project Gutenberg,53294,917,Recuerdos Del Tiempo Viejo,1817-1893,?,Barcelona trabaja... y á su existencia el ...
680944,Project Gutenberg,53294,919,Recuerdos Del Tiempo Viejo,1817-1893,?,Olvidaba que entre ambas hay diferencia: n...
680945,Project Gutenberg,53294,920,Recuerdos Del Tiempo Viejo,1817-1893,?,La diferencia es esta: pero es preciso que...


In [13]:
df.to_csv("../data/old-spanish-corpus-cleaned.tsv", sep="\t", index=False)