In [5]:
import os
import glob

import pandas as pd
from datasets import Dataset, load_from_disk

In [6]:
from data.data import Delpher, Plakaatboeken, Wikipedia, DBNL

years = [1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819]
# loading cleaned versions
ds_delpher = Delpher().dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)
ds_dbnl = DBNL().dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)
ds_wiki = Wikipedia(language='nl').dataset.filter(lambda example: len(example["CleanedText"])>100)
ds_plakaatboek = Plakaatboeken().dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)


Cleaning data since cleaned version not found


Saving the dataset (0/2 shards):   0%|          | 0/486703 [00:00<?, ? examples/s]

Filter:   0%|          | 0/486703 [00:00<?, ? examples/s]

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/90725 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90725 [00:00<?, ? examples/s]

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/1772 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1772 [00:00<?, ? examples/s]

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/28487 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28487 [00:00<?, ? examples/s]

In [10]:
ds_paths = glob.glob("*-cleaned") 
ds_paths.sort()

In [11]:
ds_paths

['dbnl-subset-cleaned',
 'delpher-subset-cleaned',
 'plakaatboeken-cleaned',
 'wikipedia-subset-cleaned']

In [12]:
datasets = [load_from_disk(x) for x in ds_paths]
datasets[0] = datasets[0].rename_columns({"Title": "Filename"}) # dbnl
datasets[1] = datasets[1].rename_columns({"Newspaper": "Filename"}) # delpher
datasets[2] = datasets[2].rename_columns({"Book": "Filename"}) # plakaatboek
datasets[3] = datasets[3].rename_columns({"Topic": "Filename"}) # wikipedia

In [13]:
subset_ds = [x.select_columns(["Year", "Filename", "CleanedText"]) for x in datasets]
sources = [x.split('-')[0] for x in ds_paths]

In [14]:
new_dataset = pd.DataFrame()

for source, subset in zip(sources, subset_ds):
    df = subset.to_pandas()
    df["Source"] = [source]*len(df)
    df["ID"] = [source+str(i) for i in range(len(df))]
    print(f"On source: {source}")
    new_dataset = pd.concat([new_dataset, df], ignore_index=True)

On source: dbnl
On source: delpher
On source: plakaatboeken
On source: wikipedia


In [15]:
new_dataset["SourceCountry"] = ["NL"]*len(new_dataset)
new_dataset["SourceCountry"] = ["NL"]*len(new_dataset)
new_dataset["SourceLang"] = ["nl"]*len(new_dataset)
new_dataset["CurrentLang"] = ["nl"]*len(new_dataset)

Create two datasets: one with text and information, one with metadata

df 1 columns: source, id, cleaned_text, 
df 2 columns: id, filename, source_country, source_language, current_language, translated?, topic, year(published) 

In [16]:
new_ds = Dataset.from_pandas(new_dataset)
ds1 = new_ds.select_columns(["Source", "ID", "CleanedText"])
ds2 = new_ds.select_columns(["ID", "Filename", "SourceCountry", "SourceLang", "CurrentLang", "Year"])
ds1.save_to_disk("all-texts")
ds2.save_to_disk("all-metadata")
# TODO: add the topic column using the topic model
print(ds1, ds2)

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

Dataset({
    features: ['Source', 'ID', 'CleanedText'],
    num_rows: 607687
}) Dataset({
    features: ['ID', 'Filename', 'SourceCountry', 'SourceLang', 'CurrentLang', 'Year'],
    num_rows: 607687
})


In [17]:
ds1[2]['CleanedText']

'onjuiste paginanummer 54 is verbeterd. p. 79: Franschon  Franschen: dat hij en zijne Franschen, of zelfs. p. 84: V.  Z.: Z. Die Uitleiders veroorzaakten derhalven. p. 100: wraen  waren: V. De meesten waren voorgangers. p. 101: Z.  V.: V. Op allerlei manieren! Mannen, Vrouwen. p. 101: mogeijk  mogelijk: Hoe was dat mogelijk. aa__002tijr01_01 DBNL-TEI 1 2024 dbnl unicode exemplaar Koninklijke Bibliotheek Den Haag, signatuur: KW NOM T 24, scan van Google Books Cornelis van der Aa, De tijrannijen der Franschen in de jaaren 1747, 1795-1813, in de Nederlanden gepleegd. Wouter Brave, Amsterdam 1814 Wijze van coderen: standaard Nederlands De tijrannijen der Franschen in de jaaren 1747, 1795-1813, in de'

In [18]:
ds1 = load_from_disk("all-texts")

In [19]:
ds1["ID"]

['dbnl0',
 'dbnl1',
 'dbnl2',
 'dbnl3',
 'dbnl4',
 'dbnl5',
 'dbnl6',
 'dbnl7',
 'dbnl8',
 'dbnl9',
 'dbnl10',
 'dbnl11',
 'dbnl12',
 'dbnl13',
 'dbnl14',
 'dbnl15',
 'dbnl16',
 'dbnl17',
 'dbnl18',
 'dbnl19',
 'dbnl20',
 'dbnl21',
 'dbnl22',
 'dbnl23',
 'dbnl24',
 'dbnl25',
 'dbnl26',
 'dbnl27',
 'dbnl28',
 'dbnl29',
 'dbnl30',
 'dbnl31',
 'dbnl32',
 'dbnl33',
 'dbnl34',
 'dbnl35',
 'dbnl36',
 'dbnl37',
 'dbnl38',
 'dbnl39',
 'dbnl40',
 'dbnl41',
 'dbnl42',
 'dbnl43',
 'dbnl44',
 'dbnl45',
 'dbnl46',
 'dbnl47',
 'dbnl48',
 'dbnl49',
 'dbnl50',
 'dbnl51',
 'dbnl52',
 'dbnl53',
 'dbnl54',
 'dbnl55',
 'dbnl56',
 'dbnl57',
 'dbnl58',
 'dbnl59',
 'dbnl60',
 'dbnl61',
 'dbnl62',
 'dbnl63',
 'dbnl64',
 'dbnl65',
 'dbnl66',
 'dbnl67',
 'dbnl68',
 'dbnl69',
 'dbnl70',
 'dbnl71',
 'dbnl72',
 'dbnl73',
 'dbnl74',
 'dbnl75',
 'dbnl76',
 'dbnl77',
 'dbnl78',
 'dbnl79',
 'dbnl80',
 'dbnl81',
 'dbnl82',
 'dbnl83',
 'dbnl84',
 'dbnl85',
 'dbnl86',
 'dbnl87',
 'dbnl88',
 'dbnl89',
 'dbnl90',
 'dbnl91'

## Get topic labels for ds1

In [None]:
ds1 = load_from_disk("all-texts")

In [None]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1", model_kwargs={"torch_dtype": "float16"})
