In [1]:
import glob
import pandas as pd
from datasets import Dataset, load_from_disk
from data.data import Delpher, Plakaatboeken, Wikipedia, DBNL, AmericanStories, BLBooks

# Publication years kept for the dated sources: 1805 through 1819 inclusive.
years = list(range(1805, 1820))

## Dutch sources

In [6]:
# loading cleaned versions
def _long_enough(example):
    """Return True when the cleaned text has more than 100 characters."""
    return len(example["CleanedText"]) > 100


def _in_period_and_long_enough(example):
    """Return True when the record's Year is in `years` and its text is long enough."""
    return example["Year"] in years and _long_enough(example)


# Dated sources are restricted to `years`; Wikipedia is only filtered on text length.
ds_delpher = Delpher().dataset.filter(_in_period_and_long_enough)
ds_dbnl = DBNL().dataset.filter(_in_period_and_long_enough)
ds_wiki_nl = Wikipedia(language='nl').dataset.filter(_long_enough)
ds_plakaatboek = Plakaatboeken().dataset.filter(_in_period_and_long_enough)

Cleaning data since cleaned version not found


Saving the dataset (0/2 shards):   0%|          | 0/486703 [00:00<?, ? examples/s]

Filter:   0%|          | 0/486703 [00:00<?, ? examples/s]

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/90725 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90725 [00:00<?, ? examples/s]

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/1772 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1772 [00:00<?, ? examples/s]

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/28487 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28487 [00:00<?, ? examples/s]

In [None]:
# Dutch datasets and their labels, in matching order.
datasets = [ds_dbnl, ds_delpher, ds_plakaatboek, ds_wiki_nl]
# Each label becomes the "Source" value and the prefix of every "ID" (label + row number).
# The Wikipedia label carries the language so its IDs cannot coincide with those
# of the English Wikipedia when both languages are merged into one table.
ds_names = ['dbnl', 'delpher', 'plakaatboek', 'wiki_nl']

In [12]:
# Give every Dutch source a common "Filename" column.
# The list is rebuilt from the originally loaded datasets rather than renaming
# the entries of `datasets` in place, so this cell can be re-run safely: an
# in-place rename fails the second time because the old column name is gone.
datasets = [
    ds_dbnl.rename_columns({"Title": "Filename"}),  # dbnl
    ds_delpher.rename_columns({"Newspaper": "Filename"}),  # delpher
    ds_plakaatboek.rename_columns({"Book": "Filename"}),  # plakaatboek
    ds_wiki_nl.rename_columns({"Topic": "Filename"}),  # wikipedia
]

In [13]:
# Reduce every dataset to the three shared columns and take a copy of the labels.
subset_ds = [
    dataset.select_columns(["Year", "Filename", "CleanedText"])
    for dataset in datasets
]
sources = list(ds_names)

In [14]:
# Convert each Dutch source to a DataFrame, tag it with its source label and a
# per-source running ID, then combine all frames with a single concat.
# Collecting the frames first avoids re-copying the growing result on every
# iteration and avoids concatenating with an empty placeholder DataFrame.
frames_nl = []

for source, subset in zip(sources, subset_ds):
    df = subset.to_pandas()
    df["Source"] = source  # scalar is broadcast to every row
    df["ID"] = [f"{source}{i}" for i in range(len(df))]  # label followed by the row number
    print(f"On source: {source}")
    frames_nl.append(df)

# ignore_index gives the combined frame a fresh 0..n-1 index.
new_dataset_nl = pd.concat(frames_nl, ignore_index=True)

On source: dbnl
On source: delpher
On source: plakaatboeken
On source: wikipedia


In [15]:
# Every row in the Dutch frame gets the same country and language tag;
# a scalar assignment is broadcast over all rows.
for column in ("SourceCountry", "SourceLang"):
    new_dataset_nl[column] = "NL"

## American/English sources

In [5]:
# English-language corpora, restricted to texts of more than 100 characters.
# American Stories is requested for `years` and additionally filtered on Year.
ds_american_s = AmericanStories(year_list=years).dataset.filter(
    lambda row: row["Year"] in years and len(row["CleanedText"]) > 100
)
ds_wiki_en = Wikipedia(language='en').dataset.filter(
    lambda row: len(row["CleanedText"]) > 100
)
# NOTE(review): BLBooks is only filtered on text length, not on Year — confirm this is intended.
ds_blbooks = BLBooks().dataset.filter(
    lambda row: len(row["CleanedText"]) > 100
)

Filter:   0%|          | 0/1253543 [00:00<?, ? examples/s]

In [None]:
# English datasets and their labels, in matching order.
datasets = [ds_american_s, ds_wiki_en, ds_blbooks]
# Each label becomes the "Source" value and the prefix of every "ID" (label + row number),
# so spelling matters: the labels are persisted in the saved tables.
# The Wikipedia label carries the language so its IDs cannot coincide with those
# of the Dutch Wikipedia when both languages are merged into one table.
ds_names = ['american_stories', 'wiki_en', 'blbooks']

In [7]:
# Give the English sources a common "Filename" column.
# The list is rebuilt from the originally loaded datasets rather than renaming
# the entries of `datasets` in place, so this cell can be re-run safely: an
# in-place rename fails the second time because the old column name is gone.
datasets = [
    ds_american_s.rename_columns({"newspaper_name": "Filename"}),  # american stories
    ds_wiki_en.rename_columns({"Topic": "Filename"}),  # wikipedia
    # NOTE(review): BLBooks is passed through without a rename; the following cell
    # selects "Year", "Filename" and "CleanedText" — confirm BLBooks exposes all three.
    ds_blbooks,
]

Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'Text', 'Year', 'CleanedText'],
    num_rows: 1191951
})

In [None]:
# Reduce every English dataset to the three shared columns and copy the labels.
subset_ds = [
    dataset.select_columns(["Year", "Filename", "CleanedText"])
    for dataset in datasets
]
sources = list(ds_names)

In [None]:
# Convert each English source to a DataFrame, tag it with its source label and a
# per-source running ID, then combine all frames with a single concat.
# Collecting the frames first avoids re-copying the growing result on every
# iteration and avoids concatenating with an empty placeholder DataFrame.
frames_en = []

for source, subset in zip(sources, subset_ds):
    df = subset.to_pandas()
    df["Source"] = source  # scalar is broadcast to every row
    df["ID"] = [f"{source}{i}" for i in range(len(df))]  # label followed by the row number
    print(f"On source: {source}")
    frames_en.append(df)

# ignore_index gives the combined frame a fresh 0..n-1 index.
new_dataset_en = pd.concat(frames_en, ignore_index=True)

In [None]:
# Every row in the English frame gets the same country and language tag;
# a scalar assignment is broadcast over all rows.
# NOTE(review): "EN" is used as the country value for all English-language sources — confirm this is the intended code.
for column in ("SourceCountry", "SourceLang"):
    new_dataset_en[column] = "EN"

Create two datasets: one with text and information, one with metadata

df 1 columns: source, id, cleaned_text
df 2 columns: id, filename, source_country, source_language, current_language, translated?, topic, year (published)

In [16]:
# Merge the Dutch and English frames, then split the result into a text table
# and a metadata table that share the "ID" column.
# pd.concat only accepts pandas objects, so the DataFrames are concatenated
# first and converted to a Dataset once; ignore_index gives the combined frame
# a fresh 0..n-1 index.
combined_df = pd.concat([new_dataset_nl, new_dataset_en], ignore_index=True)
new_ds = Dataset.from_pandas(combined_df)
ds1 = new_ds.select_columns(["Source", "ID", "CleanedText"])
ds2 = new_ds.select_columns(["ID", "Filename", "SourceCountry", "SourceLang", "Year"])

# TODO: add way to add topics in ds1
ds1.save_to_disk("all-texts")
ds2.save_to_disk("all-metadata")

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

Dataset({
    features: ['Source', 'ID', 'CleanedText'],
    num_rows: 607687
}) Dataset({
    features: ['ID', 'Filename', 'SourceCountry', 'SourceLang', 'CurrentLang', 'Year'],
    num_rows: 607687
})
