# Collect all the Dataset in a single file

In [1]:
from datasets import load_dataset
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

  from .autonotebook import tqdm as notebook_tqdm


## Dataset 1: The British Library

In [2]:
dataset = load_dataset("TheBritishLibrary/blbooks", "1800_1899", trust_remote_code=True)["train"]
dataset = dataset.filter(lambda x: x["Language_1"] == "Spanish")
dataset = dataset.filter(lambda x: x["mean_wc_ocr"] > 0.5)
df_blbooks = pd.DataFrame(dataset)

In [3]:
df_blbooks.to_csv('../data/old-spanish-blbooks.tsv', sep='\t', index=False)

## Dataset 2: 19th century Latam Newspapers

In [5]:
df_latamnp = pd.read_csv('../data/old-spanish-latamnp.tsv', sep='\t')

## Dataset 3: Project Gutenberg

#### 1. Get the dates and available URLs to extract and chunk the text

In [11]:
def parse_date(text):
    year_match = re.search(r'(?:(?P<start>\d{4})(?:-(?P<end>\d{4}))?|-(?P<end_only>\d{4}))', text)
    if year_match:
        start = year_match.group('start')
        end = (year_match.group('end') if year_match.group('end') else year_match.group('end_only')) or start
        if start is None: start = end
        return int(start.replace('?', '')), int(end.replace('?', ''))
    else:
        bce_match = re.search(r'(?P<start>\d{1,4}\??) BCE-(?P<end>\d{1,4}\??) BCE', text)
        if bce_match:
            start = bce_match.group('start') 
            end = bce_match.group('end')
            return -int(start.replace('?', '')), -int(end.replace('?', ''))
        else:
            century_match = re.search(r'(?P<century>\d{2})th century', text)
            if century_match:
                century = century_match.group('century')
                return int(century)*100, int(century)*100+99

url = "https://www.gutenberg.org/browse/languages/es"
gut_dates = dict()

response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    book_links = soup.find_all('li', class_='pgdbetext')

    for i,link in enumerate(book_links):
        book_href = "https://www.gutenberg.org" + link.find('a')['href']
        id = (book_href).split('/')[-1]
        url = f"https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"

        response = requests.get(book_href)
        soup = BeautifulSoup(response.content, 'html.parser')
        date_text = soup.find('div', id='bibrec').find('td').text.strip()
        date = parse_date(date_text)
        if date:
            start, end = date
            date = f"{start}-{end}"
            if ((start < 1800) or (start > 1914)) and ((end < 1800) or (end > 1914)):
                print(f"{i}. Date ({date}) not in range. Skipping book {url}.")
                continue
        else:
            print(f"{i}. Missing date ({date_text}). Skipping book {url}.")
            continue

        gut_dates[id] = date

2. Date (1540-1600) not in range. Skipping book https://www.gutenberg.org/cache/epub/70219/pg70219.txt.
3. Date (1540-1600) not in range. Skipping book https://www.gutenberg.org/cache/epub/50430/pg50430.txt.
5. Date (-526--457) not in range. Skipping book https://www.gutenberg.org/cache/epub/66023/pg66023.txt.
6. Date (1600-1699) not in range. Skipping book https://www.gutenberg.org/cache/epub/39947/pg39947.txt.
7. Date (1600-1699) not in range. Skipping book https://www.gutenberg.org/cache/epub/39948/pg39948.txt.
24. Date (1404-1472) not in range. Skipping book https://www.gutenberg.org/cache/epub/57505/pg57505.txt.
41. Missing date (Alvarez y Tejero, Luis Prudencio). Skipping book https://www.gutenberg.org/cache/epub/30892/pg30892.txt.
44. Date (1535-1535) not in range. Skipping book https://www.gutenberg.org/cache/epub/25317/pg25317.txt.
48. Missing date (Anonymous). Skipping book https://www.gutenberg.org/cache/epub/16149/pg16149.txt.
49. Missing date (Anonymous). Skipping book htt

In [12]:
print(f"# of books after filtering by year: {len(gut_dates)}")

# of books after filtering by year: 642


#### 2. Actually extract the text, filter and chunk it

In [13]:
TITLE_NOT_FOUND = "?"
AUTHOR_NOT_FOUND = "N/A"
RELEASE_DATE_NOT_FOUND = "N/A"
LANGUAGE_NOT_FOUND = "?"

data_gutenberg = []

for i, id in enumerate(gut_dates):
    url = f"https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
    title=None; author=None; release_date=None; language=None
    skip=False

    response = requests.get(url)
    if response.status_code == 200:
            text = response.text.split('\n')
            startline = 0
            # Extract metadata and extract only pure text
            for j, line in enumerate(text):
                line = line.strip()
                if line.startswith("*** START OF THE PROJECT GUTENBERG EBOOK"):
                    startline = j+1
                elif line.startswith("End of Project Gutenberg's"):
                    # Filter the non-desired books
                    if not title: title = TITLE_NOT_FOUND
                    if not author: author = AUTHOR_NOT_FOUND
                    if not release_date: release_date = RELEASE_DATE_NOT_FOUND
                    if not language: language = LANGUAGE_NOT_FOUND
                    if 'spanish' not in language:
                        print(f"{i}. Language {language} not expected. Skipping book {url}.")
                        skip = True
                        break
                    text = ''.join(text[startline:j])
                    break

                if line.startswith("Title: "):
                    title = line.replace("Title: ", "").strip()
                elif line.startswith("Author: "):
                    author = line.replace("Author: ", "").strip()
                elif line.startswith("Language: "):
                    language = line.replace("Language: ", "").lower().strip()
                elif line.startswith("Release date: "):
                    release_date = line.replace("Release Date: ", "").strip()
                    release_date = release_date.split(" [")[0]
            else:
                print(f"{i}. Not found end of text (***). Skipping book {url}.")
                continue
            if skip:
                continue

            chunks = re.split(r'(\r\n|\r|\n){2,}', text)
            chunks = [c.replace('\r', ' ').strip() for c in chunks if c.strip()]

            if 'project' in chunks[0]:
                del chunks[0]

            # for each chunk, add a row to the df_gutenberg
            for j,chunk in enumerate(chunks):
                data_gutenberg.append({'id': id, 'title': title, 'author': author, 'date': gut_dates[id], 'release_date': release_date, 'chunkNo': j, 'text': chunk})
    else:
        print(f"{i}. Failed to retrieve book {url}. Status code:", response.status_code)

    print(f"{i}. OK ({len(chunks)} chunks) [{title}]")

df_gutenberg = pd.DataFrame(data_gutenberg)
print(df_gutenberg)

0. OK (1380 chunks) [Germana]
1. OK (587 chunks) [La nariz de un notario]
2. OK (172 chunks) [Místicas; poesías]
3. OK (198 chunks) [Tratado de Ortografía Valenciana Clásica]
4. OK (217 chunks) [Reseña Veridica de la Revolución Filipina]
5. OK (3160 chunks) [Los Merodeadores de Fronteras]
6. OK (4175 chunks) [Las noches mejicanas]
7. OK (2661 chunks) [Novelas Cortas]
8. OK (4701 chunks) [El Capitán Veneno]
9. OK (657 chunks) [El clavo]
10. OK (1840 chunks) [Cosas que fueron: Cuadros de costumbres]
11. OK (1507 chunks) [El Niño de la Bola: Novela]
12. OK (4602 chunks) [El sombrero de tres picos]
13. OK (1362 chunks) [Viajes por España]
14. OK (1530 chunks) [Doctor Sutilis (Cuentos)]
15. OK (5968 chunks) [La Regenta]
16. OK (955 chunks) [El Señor y los demás son Cuentos]
17. OK (1181 chunks) [Su único hijo]
18. OK (792 chunks) [Nuevas poesías y evangélicas]
19. OK (743 chunks) [El Gíbaro]
20. OK (2803 chunks) [La Navidad en las Montañas]
21. OK (522 chunks) [La transformación de las raza

In [14]:
print(f"Tamaño del dataframe: {len(df_gutenberg)}")

Tamaño del dataframe: 1063380


In [15]:
df_gutenberg.to_csv('../data/non-final/old-spanish-gutenberg.tsv', sep='\t', index=False)

### 4. Compress all datasets in a single file

In [16]:
# print the list of columns of each dataset
print("Columnas blbooks:")
print(df_blbooks.columns)
print("Columnas latamnp:")
print(df_latamnp.columns)
print("Columnas gutenberg:")
print(df_gutenberg.columns)

Columnas blbooks:
Index(['record_id', 'date', 'raw_date', 'title', 'place', 'empty_pg', 'text',
       'pg', 'mean_wc_ocr', 'std_wc_ocr', 'name', 'all_names', 'Publisher',
       'Country of publication 1', 'all Countries of publication',
       'Physical description', 'Language_1', 'Language_2', 'Language_3',
       'Language_4', 'multi_language'],
      dtype='object')
Columnas latamnp:
Index(['id', 'title', 'year', 'city', 'file', 'page', 'text_id', 'text'], dtype='object')
Columnas gutenberg:
Index(['id', 'title', 'author', 'date', 'release_date', 'chunkNo', 'text'], dtype='object')


For the compiled `tsv` file, the columns to be taken are:
- `id` for both the book/newspaper and the page/chunk of text
- `date` year(s) of the text
- `place` where the text was published, for the sources where it's available
- `title` of the book/newspaper
- `text` the actual text
- `source`: which refers to which of the 3 dataset the row belongs to

In [18]:
keys = ['source', 'source_id', 'source_text_id', 'title', 'date', 'place', 'text']

df_blbooks['source'] = 'The British Library'
df_latamnp['source'] = '19th century Latam Newspapers'
df_gutenberg['source'] = 'Project Gutenberg'

df_blbooks['source_id'] = df_blbooks['record_id']
df_latamnp['source_id'] = df_latamnp['newspaper_id']
df_gutenberg['source_id'] = df_gutenberg['id']

df_blbooks['source_text_id'] = df_blbooks['pg']
df_latamnp['source_text_id'] = df_latamnp['text_id'].astype(str)
df_gutenberg['source_text_id'] = df_gutenberg['chunkNo']

df_blbooks['date'] = df_blbooks['raw_date']
df_latamnp['date'] = df_latamnp['year']
df_gutenberg['date'] = df_gutenberg['date']

df_blbooks['place'] = df_blbooks['place']
df_latamnp['place'] = df_latamnp['city']
df_gutenberg['place'] = "?"

# 'title' and 'text' are already in all sources

df_blbooks = df_blbooks[keys]
df_latamnp = df_latamnp[keys]
df_gutenberg = df_gutenberg[keys]

df_comp = pd.concat([df_blbooks, df_latamnp, df_gutenberg])
len(df_comp)

1354198

In [19]:
df_comp.to_csv('../data/old-spanish-corpus.tsv', sep='\t', index=False)