In [2]:
import glob
import pandas as pd
from datasets import Dataset, load_from_disk
from data.data import Delpher, Plakaatboeken, Wikipedia, DBNL, AmericanStories, BLBooks

years = [1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819]

## Dutch sources

In [3]:
# loading cleaned versions
ds_delpher = Delpher().dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)
ds_dbnl = DBNL().dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)
ds_wiki_nl = Wikipedia(language='nl').dataset.filter(lambda example: len(example["CleanedText"])>100)
ds_plakaatboek = Plakaatboeken().dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)

Filter:   0%|          | 0/486703 [00:00<?, ? examples/s]

Filter:   0%|          | 0/90725 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1772 [00:00<?, ? examples/s]

Filter:   0%|          | 0/482 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28487 [00:00<?, ? examples/s]

In [4]:
datasets = [ds_dbnl, ds_delpher, ds_plakaatboek, ds_wiki_nl]
ds_names = ['dbnl', 'delpher', 'plakaatboek', 'wiki']

In [5]:
datasets[0] = datasets[0].rename_columns({"Title": "Filename"}) # dbnl
datasets[1] = datasets[1].rename_columns({"Newspaper": "Filename"}) # delpher
datasets[2] = datasets[2].rename_columns({"Book": "Filename"}) # plakaatboek
datasets[3] = datasets[3].rename_columns({"Topic": "Filename"}) # wikipedia

In [6]:
subset_ds = [x.select_columns(["Year", "Filename", "CleanedText"]) for x in datasets]
sources = [x for x in ds_names]

In [7]:
new_dataset_nl = pd.DataFrame()

for source, subset in zip(sources, subset_ds):
    df = subset.to_pandas()
    df["Source"] = [source]*len(df)
    df["ID"] = [source+str(i) for i in range(len(df))]
    print(f"On source: {source}")
    new_dataset_nl = pd.concat([new_dataset_nl, df], ignore_index=True)

On source: dbnl
On source: delpher
On source: plakaatboek
On source: wiki


In [8]:
new_dataset_nl["SourceCountry"] = ["NL"]*len(new_dataset_nl)
new_dataset_nl["SourceLang"] = ["NL"]*len(new_dataset_nl)

In [32]:
## delpher had french english and dutch articles in one newspaper/had specific french and english newspapers: the french ones were highly censored so need to further refine SourceLang
from lingua import Language, LanguageDetectorBuilder # fastest I could find

languages = [Language.ENGLISH, Language.DUTCH, Language.FRENCH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()
# detector.detect_language_of(x[2]).iso_code_639_1.name
def detect_lang(example):
    try:
        return detector.detect_language_of(example).iso_code_639_1.name
    except:
        return "NL"
    
new_dataset_nl['SourceLang'] = new_dataset_nl['CleanedText'].map(detect_lang)

In [34]:
ds_nl = Dataset.from_pandas(new_dataset_nl)
ds_nl.save_to_disk("all-texts-dutch")

Saving the dataset (0/1 shards):   0%|          | 0/575182 [00:00<?, ? examples/s]

## American/English sources

In [10]:
ds_american_s = AmericanStories(year_list=years).dataset.filter(lambda example: example["Year"] in years and len(example["CleanedText"])>100)
ds_wiki_en = Wikipedia(language='en').dataset.filter(lambda example: len(example["CleanedText"])>100)
ds_blbooks = BLBooks().dataset.filter(lambda example: len(example["CleanedText"])>100)

In [11]:
datasets = [ds_american_s, ds_wiki_en, ds_blbooks]
ds_names = ['amserican_stories', 'wiki', 'blboooks']

In [12]:
datasets[0] = datasets[0].rename_columns({"newspaper_name": "Filename"}) # american stories
datasets[1] = datasets[1].rename_columns({"Topic": "Filename"}) # wikipedia
datasets[2] = datasets[2].add_column("Year", [x.year for x in datasets[2]['date']]) # BL books
datasets[2] = datasets[2].rename_columns({"title": "Filename"}) # BL books

Flattening the indices:   0%|          | 0/508561 [00:00<?, ? examples/s]

In [14]:
subset_ds = [x.select_columns(["Year", "Filename", "CleanedText"]) for x in datasets]
sources = [x for x in ds_names]

In [15]:
new_dataset_en = pd.DataFrame()

for source, subset in zip(sources, subset_ds):
    df = subset.to_pandas()
    df["Source"] = [source]*len(df)
    df["ID"] = [source+str(i) for i in range(len(df))]
    print(f"On source: {source}")
    new_dataset_en = pd.concat([new_dataset_en, df], ignore_index=True)

On source: amserican_stories
On source: wiki
On source: blboooks


In [16]:
new_dataset_en["SourceCountry"] = ["EN"]*len(new_dataset_en)
new_dataset_en["SourceLang"] = ["EN"]*len(new_dataset_en)

## French sources

Create two datasets: one with text and information, one with metadata

df 1 columns: source, id, cleaned_text, 
df 2 columns: id, filename, source_country, source_language, current_language, translated?, topic, year(published) 

In [16]:
new_ds = pd.concat([Dataset.from_pandas(new_dataset_nl), Dataset.from_pandas(new_dataset_en)])
ds1 = new_ds.select_columns(["Source", "ID", "CleanedText"])
ds2 = new_ds.select_columns(["ID", "Filename", "SourceCountry", "SourceLang", "Year"])

# TODO: add way to add topics in ds1
ds1.save_to_disk("all-texts")
ds2.save_to_disk("all-metadata")

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

Dataset({
    features: ['Source', 'ID', 'CleanedText'],
    num_rows: 607687
}) Dataset({
    features: ['ID', 'Filename', 'SourceCountry', 'SourceLang', 'CurrentLang', 'Year'],
    num_rows: 607687
})
