In [1]:
from pathlib import Path
from datetime import date 

In [2]:
import numpy as np
def report_stats(dst):
    """Print the number of rows per server, then the overall row count.

    Args:
        dst: Object whose ``"server"`` entry can be passed to ``np.unique``
            (the notebook passes a ``datasets.Dataset``).
    """
    servers, occurrences = np.unique(dst["server"], return_counts=True)
    per_server = dict(zip(servers, occurrences))
    # np.unique returns the distinct values sorted, so the report is ordered.
    for server in per_server:
        print(f"{server}: {per_server[server]}")
    print(f"Total: {sum(occurrences)}")

In [3]:
# Worker-process count passed as `num_proc` to most Dataset.map / Dataset.filter calls in this notebook.
NUM_PROCESS = 4

In [4]:
from datasets import disable_caching
# Switch off the `datasets` library cache for the rest of this session.
disable_caching()

In [5]:
from preprocess_utils import load_jsonb, save_jsonb
from datasets import Dataset
from datasets import Features, Value
from urllib.parse import urlparse
from datasets import Sequence


def get_server(url):
    """Derive the short news-server name from an article URL.

    The name is the last label of the host name, or the second-to-last label
    when the host ends in ``.cz`` (``https://www.idnes.cz/...`` -> ``"idnes"``).

    The host is read through ``hostname`` instead of ``netloc`` so an explicit
    port or a user-info prefix never ends up in the label. Previously every
    ``*.cz:443`` URL was hard-wired to ``"aktualne"`` whatever its real host
    was, and any other port produced labels such as ``"cz:80"``.

    Args:
        url: Absolute article URL.

    Returns:
        The lower-cased server label, or an empty string when the URL has no host.
    """
    host = urlparse(url).hostname or ""
    labels = host.split(".")
    server = labels[-1]
    # Guard against a bare "cz" host, which has no second-level label.
    if server == "cz" and len(labels) > 1:
        server = labels[-2]
    return server


# Schema used to read the raw dump. String columns are read as-is here;
# they are post-processed and cast by later cells.
raw_features = Features(
    {
        "url": Value("string"),
        "author": Sequence(Value("string")),
        "headline": Value("string"),
        "brief": Value("string"),
        "publication_date": Value("string"),
        "keywords": Sequence(Value("string")),
        "category": Value("string"),
        "content": Value("large_string"),
        "comments_num": Value("int32"),
    }
)
dataset = Dataset.from_json("final.jsonb", features=raw_features)


def _with_server(batch):
    """Compute the ``server`` column for a batch from its URLs."""
    return {"server": [get_server(url) for url in batch["url"]]}


def _not_ihned(batch):
    """Keep-mask for a batch: False for rows whose server is ``ihned``."""
    return [server != "ihned" for server in batch["server"]]


dataset = dataset.map(_with_server, batched=True, num_proc=NUM_PROCESS)
dataset = dataset.filter(_not_ihned, batched=True, num_proc=NUM_PROCESS)




Downloading and preparing dataset json/default to /home/kydliceh/.cache/huggingface/datasets/json/default-778ef7b35e31a789/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/kydliceh/.cache/huggingface/datasets/json/default-778ef7b35e31a789/0.0.0. Subsequent calls will reuse this data.


Map (num_proc=8):   0%|          | 0/3193720 [00:00<?, ? examples/s]

Filter (num_proc=8):   0%|          | 0/3193720 [00:00<?, ? examples/s]

In [14]:
# Per-server row counts after the `ihned` rows were filtered out.
report_stats(dataset)

aktualne: 165799
denik: 1636639
idnes: 530855
irozhlas: 201681
novinky: 549661
seznamzpravy: 77442
Total: 3162077


In [6]:
# Groups of raw category names (as scraped) and the unified category each
# group is translated to. Several raw names intentionally keep the spelling
# found in the data (e.g. missing spaces).
#
# NOTE(review): the region names are capitalised while every other key is
# lower-case, and the lookup is later built with
# create_translate(translate_cat, lower=True). If that helper lower-cases the
# incoming category before the lookup, the capitalised keys can never match
# — confirm against postprocessing_utils.
translate_groups = [
    (["zprávy z domova", "z domova", "domov"], "domácí"),
    (["zprávy ze světa", "svět", "zahraničí", "ze světa"], "zahraniční"),
    (
        ["tipy deníku", "rádce", "rady učitele", "eko rady a tipy"],
        "tipy",
    ),
    (
        [
            "nepoužívat - věda",
            "věda a školy",
            "věda a technologie",
            "věda a technika",
            "věda a vesmír",
            "věda a příroda",
            "věda & vesmír",
            "vesmír",
        ],
        "věda",
    ),
    (
        [
            "technet",
            "tech & trendy",
            "tech",
            "technika",
            "hardware",
            "software",
            "internet",
            "web",
            "internet a pc",
            "aplikace",
            "bonusweb",
            "plné hry",
        ],
        "technologie",
    ),
    (
        [
            "osobní finance",
            "hypotéky a půjčky",
            "banky a spoření",
            "pojištení",
            "investice",
            "daně",
            "penze",
        ],
        "finance",
    ),
    (
        [
            "nová auta",
            "automoto",
            "motorismus",
            "auta",
            "moje auto",
            "auta a motorky",
            "motocykly",
            "autohistorie",
            "ojetá auta",
            "život řidiče",
            "motorky",
        ],
        "auto",
    ),
    (
        ["televize", "film", "tv", "film a tv"],
        "kultura",
    ),
    (["lidé"], "člověk"),
    (
        ["na kole po česku", "cyklorady", "cyklotrasy"],
        "kolo",
    ),
    (["stavba", "bydlení", "koupelna", "kuchyně"], "bydlení"),
    (
        ["cestovánía dovolená", "po česku", "cestujeme"],
        "cestování",
    ),
    (
        [
            "životní styl a společnost",
            "ona",
            "žena",
            "ženy",
            # A missing comma used to fuse the next two keys into "pro ženystyl".
            "pro ženy",
            "styl",
            "životnístyl",
            "nakupování",
            "nákupy",
            "móda",
            "vztahy",
            "vztahy a sex",
            "sex",
        ],
        "životní styl",
    ),
    (["zdraví", "zdraví a fitness", "fit", "krása"], "životní styl"),
    (["jídelníček a recepty", "recepty"], "životní styl"),
    (
        ["nepouzivat - kultura", "tipy na kulturu", "tipyna kulturu", "divadlo"],
        "kultura",
    ),
    (["česká ekonomika", "světová ekonomika"], "ekonomika"),
    (["práce a podnikání", "podniky", "byznys", "aktuálně o eet"], "byznys"),
    (
        ["hokej", "ms v hokeji", "ms hokej", "vancouver - hokej", "nhl", "extraliga"],
        "sport",
    ),
    (
        [
            "kraje",
            "regiony",
            "z okolí",
            "praha",
            "brněnsko",
            "blanensko",
            "Českobudějovicko",
            "chebsko",
            "berounsko",
            "děčínsko",
            "bruntálsko",
            "Boleslavsko",
            "Hradecko",
            "Českolipsko",
            "Havlíčkobrodsko",
            "Břeclavsko",
            "Domažlicko",
            "Chrudimsko",
            "Litoměřicko",
            "Ústecko",
            "Karlovarsko",
            "Benešovsko",
            "Chomutovsko",
            "Liberecko",
            "Frýdecko-místecko",
            "Českokrumlovsko",
            "Hranicko",
            "Žďársko",
            "Olomoucko",
            "Hodonínsko",
            "Prachaticko",
            "Krkonoše",
            "Jindřichohradecko",
            "Jihlavsko",
            "Jablonecko",
            "Ostravsko",
            "Plzeňsko",
            "Jihomoravský kraj",
            "Jičínsko",
            "Rakovnicko",
            "Slovácko",
            "Orlicko",
            "Zlínsko",
            "Nymbursko",
            "Žatecko a lounsko",
            "Prostějovsko",
            "Ústecký kraj",
            "Vyškovsko",
            "Moravskoslezský kraj",
            "Středočeský kraj",
            "Písecko",
            "Strakonicko",
            "Tachovsko",
            "Táborsko",
            "Rokycansko",
            "Klatovsko",
            "Novojičínsko",
            "Kladensko",
            "Znojemsko",
            "Kutnohorsko",
            "Teplicko",
            "Svitavsko",
            "Kroměřížsko",
            "Třebíčsko",
            "Mostecko",
            "Opavsko",
            "Pardubicko",
            "Přerovsko",
            "Šumpersko",
            "Kolínsko",
            "Zlínský kraj",
            "Sokolovsko",
            "Příbramsko",
            "Olomoucký kraj",
            "Královéhradecký kraj",
            "Liberecký kraj",
            "Karvinsko",
            "Valašsko",
            "Pelhřimovsko",
            "Pardubický kraj",
            "Plzeňský kraj",
            "Jihočeský kraj",
            "Rychnovsko",
            "Náchodsko",
            "Kraj vysočina",
            "Mělnicko",
            "Karlovarský kraj",
        ],
        "domácí",
    ),
    (
        [
            "ms ve fotbale",
            "česká liga",
            "liga mistrů",
            "evropská liga",
            "fotbal",
            "euro 2016",
            "euro",
        ],
        "sport",
    ),
    (
        [
            "klasická hudba",
            "hudba",
            "mff kv 2010",
            "mff kv 2011",
            "mff kv 2012",
            "mff kv 2014",
            "mff kv 2015",
            "mff kv 2016",
            "mff kv 2017",
            "mff kv 2018",
            "mff kv 2019",
            "mff kv",
        ],
        "kultura",
    ),
    (["basket", "basketbal"], "sport"),
    (["mobily", "telefony", "mobil"], "technologie"),
    (["názory"], "komentáře"),
    (
        [
            "loh 2020",
            "loh 2016 rio de janeiro",
            "tokio 2021",
            "tokio 2020",
            "loh 2012 londýn",
            "peking 2022",
            "vancouver - olympijské deníky",
        ],
        "sport",
    ),
    (
        [
            "tenis",
            "ostatní sporty",
            "zahraniční ligy",
            "atletika",
            "reprezentace",
            "formule 1",
            "zimní sporty",
            "lyže",
            "tour de france",
            "rallye",
            "biatlon",
        ],
        "sport",
    ),
    (["černá kronika"], "krimi"),
    # Mapped to "domácí" because all other politics are in "domácí".
    (["volby", "politika"], "domácí"),
    (["koktejl", "kuriozity", "zajímavosti", "rady a zajímavosti"], "koktejl"),
    (["revue", "modelky", "celebrity"], "revue"),
    (["příběhy a rozhovory", "rozhovory"], "rozhovory"),
]


# Flatten the groups into a raw-name -> unified-category lookup. Later groups
# win on duplicate keys, exactly as with dict() over the flattened pair list.
translate_cat = {
    raw_name: target
    for raw_names, target in translate_groups
    for raw_name in raw_names
}

# Set of accepted category names, one per line of categories.txt.
# Decoded explicitly as UTF-8 so the result does not depend on the machine's
# locale (assumes the file is UTF-8 — TODO confirm).
with open("categories.txt", encoding="utf-8") as f:
    cats = set(f.read().splitlines())


In [10]:
from postprocessing_utils import Gender
# author name -> Gender, read from "name,gender" lines of authors.txt.
# Any gender value other than "male" is recorded as Gender.WOMAN.
authors = {}
# Explicit UTF-8 keeps decoding locale-independent (assumes the file is
# UTF-8 — TODO confirm).
with open("authors.txt", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Blank lines would otherwise fail the two-value unpacking below.
            continue
        # Split on the last comma only, so a comma inside the name is kept.
        author, gender = line.rsplit(",", 1)
        authors[author] = Gender.MAN if gender.strip() == "male" else Gender.WOMAN
    

In [8]:
# Headline prefixes used by the headline filter, one per line.
# Explicit UTF-8 keeps decoding locale-independent (assumes the file is
# UTF-8 — TODO confirm). Empty lines are dropped because the prefixes are
# matched with str.startswith, and "" is a prefix of every headline.
with open("headline_filter.txt", encoding="utf-8") as f:
    headline_filters = {line for line in f.read().splitlines() if line}

In [15]:
# IPython autoreload in mode 2: re-import edited modules automatically before running each cell.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from typing import Callable, List
from filtering import (
    create_filter_by_stats,
    create_filter_by_cz_lang,
    create_config,
    create_tokenized_filter,
    create_filter,
    is_cz,
)
from preprocess_utils import load_jsonb, save_jsonb
from postprocessing_utils import (
    postprocess_authors,
    add_day,
    filter_author,
    create_translate,
    create_filter_by_set,
    postprocess_brief,
    postprocess_category,
    postprocess_headline,
    postprocess_date,
    postprocess_content,
    as_Article,
    JSONArticleEncoder,
    add_cum_gender,
    create_add_gender,
    postprocess_keywords,
    create_none_to_x,
)




In [15]:
# Custom config for each server
from filtering import between
# Per-server override dictionaries; all of them are currently empty.
configs = {
    server: {}
    for server in ("idnes", "denik", "aktualne", "irozhlas", "seznamzpravy", "novinky")
}

# Thresholds applied when no override exists. `between(lo, hi)` comes from
# the project's `filtering` module; None presumably leaves that side
# unbounded — confirm there.
default = dict(
    article_length=between(400, None),
    avg_word_length=between(4.0, None),
    num_words_ratio=between(0.11, 0.22),
    headline_length=between(20, None),
    brief_length=between(40, None),
    non_alpha_ratio=between(None, 0.045),
    date=between(date(2000, 1, 1), date(2022, 8, 31)),
)

In [16]:
cz_ratio = 0.85

# Per-column predicates, evaluated on one column value at a time.
batch_filters = {
    "content": [create_filter_by_cz_lang(ratio=cz_ratio)],
    "headline": [create_filter(headline_filters, lambda head, f: head.startswith(f))],
}
# Whole-row predicates built from the default statistics thresholds.
unbatched_filters = [create_filter_by_stats(create_config({}, default))]

dataset_filtered = dataset

# Apply the column predicates one after another, reporting counts after each.
column_checks = [
    (column, check) for column, checks in batch_filters.items() for check in checks
]
for col, fc in column_checks:
    dataset_filtered = dataset_filtered.filter(
        lambda batch: list(map(fc, batch[col])),
        batched=True,
        num_proc=NUM_PROCESS,
    )
    report_stats(dataset_filtered)

# Then the row-level predicates.
for fc in unbatched_filters:
    dataset_filtered = dataset_filtered.filter(fc, num_proc=NUM_PROCESS)
    report_stats(dataset_filtered)



Filter (num_proc=8):   0%|          | 0/3162077 [00:00<?, ? examples/s]

aktualne: 157713
denik: 1518244
idnes: 477690
irozhlas: 197183
novinky: 496186
seznamzpravy: 76209
Total: 2923225


Filter (num_proc=8):   0%|          | 0/2923225 [00:00<?, ? examples/s]

aktualne: 154603
denik: 1466683
idnes: 464481
irozhlas: 196200
novinky: 492023
seznamzpravy: 75294
Total: 2849284


Filter (num_proc=8):   0%|          | 0/2849284 [00:00<?, ? examples/s]

aktualne: 144014
denik: 1282138
idnes: 446690
irozhlas: 181956
novinky: 415122
seznamzpravy: 72195
Total: 2542115


In [17]:
from functools import reduce
from datetime import datetime


def to_date(date_str):
    """Turn an ISO-format date/datetime string into a ``datetime.date``.

    Args:
        date_str: ISO-format string, or ``None``.

    Returns:
        The calendar date of the parsed value, or ``None`` for a ``None`` input.
    """
    if date_str is not None:
        return datetime.fromisoformat(date_str).date()
    return None


# Column name -> functions applied in order to every value of that column.
postprocessing = {
    "category": [create_translate(translate_cat, lower=True), postprocess_category, create_filter_by_set(cats, lower=True)],
    # Fixed: this entry called the undefined name `create_non_to_x`; the
    # imported helper is `create_none_to_x`.
    "authors": [postprocess_authors, create_filter_by_set(set(authors), lower=True), create_none_to_x([])],
    "brief": [postprocess_brief],
    "headline": [postprocess_headline],
    "content": [postprocess_content],
    "publication_date": [postprocess_date],
    "keywords": [postprocess_keywords, create_none_to_x([])],
}
dataset_post = dataset_filtered
# Back up the raw categories before they are translated.
dataset_post = dataset_post.map(lambda batch: {"category_unclean": batch["category"]}, batched=True, num_proc=NUM_PROCESS)

# The raw dump calls the column "author"; the rest of the pipeline uses "authors".
dataset_post = dataset_post.rename_column("author", "authors")

# Thread every value of the column through that column's function chain.
# The lambda reads `col` and `funcs` at call time; this is safe because
# Dataset.map runs before the loop moves on to the next column.
for col, funcs in postprocessing.items():
    dataset_post = dataset_post.map(
        lambda batch: {col: reduce(lambda arts, f: [f(art) for art in arts], funcs, batch[col])},
        batched=True,
        num_proc=NUM_PROCESS,
    )

# Replace the publication_date string column with a `date` column.
dataset_post = dataset_post.map(
    lambda batch: {"date": [to_date(dtm) for dtm in batch["publication_date"]]},
    batched=True,
    batch_size=None,
    num_proc=NUM_PROCESS,
    remove_columns=["publication_date"],
)

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

In [18]:
# Checkpoint the post-processed dataset; the next cell reloads it from this directory.
dataset_post.save_to_disk("dataset_post")

Saving the dataset (0/18 shards):   0%|          | 0/2542115 [00:00<?, ? examples/s]

In [8]:
from datasets import load_from_disk
# Resume from the "dataset_post" checkpoint written by the previous cell.
dataset_post = load_from_disk("dataset_post")

In [14]:
# New column name -> function that computes its value from a single row.
augmentations = dict(
    authors_gender=create_add_gender(authors),
    authors_cum_gender=add_cum_gender,
    day_of_week=add_day,
)

dataset_augmented = dataset_post

# Add the derived columns one at a time, row by row (batched=False).
for col, func in augmentations.items():
    dataset_augmented = dataset_augmented.map(
        lambda row: {col: func(row)},
        batched=False,
        num_proc=NUM_PROCESS,
    )


Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

In [15]:
# Keep compatibility
from datasets import load_dataset
# Test split of the already-published dataset; its feature definitions are reused when casting columns below.
dst = load_dataset("hynky/czech_news_dataset", split="test")
# First class name of the reference `category` feature; substituted for missing values by NoneToNoneLabel.
NONE_LABEL = dst.features["category"].names[0]

def NoneToNoneLabel(dst, column):
    """Return ``dst`` with every ``None`` in ``column`` replaced by ``NONE_LABEL``.

    Args:
        dst: Dataset to transform (its ``map`` method is called in batched mode).
        column: Name of the column to patch.
    """

    def fill_missing(batch):
        return {column: [x if x is not None else NONE_LABEL for x in batch[column]]}

    return dst.map(fill_missing, batched=True, num_proc=NUM_PROCESS)


Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7e1b6d781523c0de/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [21]:
from datasets import ClassLabel, Sequence
casted = dataset_augmented

# These columns may hold None: substitute the placeholder label first, then
# cast to the feature type of the reference dataset.
cast_columns = ["category", "server"]
for col in cast_columns:
    casted = NoneToNoneLabel(casted, col).cast_column(col, dst.features[col])

# These columns are cast directly to the reference feature types.
cast_columns = ["authors_gender", "authors_cum_gender", "day_of_week"]
for col in cast_columns:
    reference_feature = dst.features[col]
    casted = casted.cast_column(col, reference_feature)

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2542115 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2542115 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2542115 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2542115 [00:00<?, ? examples/s]

KeyError: 'day'

In [26]:
# Cast day_of_week to the reference feature type.
# NOTE(review): the second loop of the previous cell already performs this cast; this cell looks like a leftover retry.
casted = casted.cast_column("day_of_week", dst.features["day_of_week"])

Casting the dataset:   0%|          | 0/2542115 [00:00<?, ? examples/s]

In [4]:
# Checkpoint the cast dataset; the next cell reloads it from this directory.
casted.save_to_disk("casted")

KeyboardInterrupt: 

In [1]:
from datasets import load_from_disk
# Resume from the "casted" checkpoint.
casted = load_from_disk("casted")
# Rebinds the worker count (it was 4 at the top of the notebook).
NUM_PROCESS = 12

In [6]:

def create_uniq_filter():
    """Build a stateful predicate that accepts each distinct value only once.

    Returns:
        A one-argument function that returns True the first time it is called
        with a given (hashable) value and False on every later call with an
        equal value. Each call to ``create_uniq_filter`` gets its own memory.
    """
    seen = set()

    def is_first_occurrence(item):
        first_time = item not in seen
        seen.add(item)
        return first_time

    return is_first_occurrence

# One independent "first occurrence only" predicate per de-duplicated column.
# NOTE(review): None is treated like any other value, so at most one row with
# a missing headline/brief/content survives — confirm that this is intended.
filters = {
    "headline": create_uniq_filter(),
    "brief": create_uniq_filter(),
    "content": create_uniq_filter(),
}

uniq_dataset = casted
# Temporary ranking columns: number of populated fields and content length.
uniq_dataset = uniq_dataset.map(
    lambda item: {
        # `is not None` replaces the original `!= None` comparison.
        "not_null": sum(val is not None for val in item.values()),
        "article_length": len(item["content"]),
    },
    batched=False,
    num_proc=NUM_PROCESS,
)


# Ensures the article with the most data survives: richest rows come first,
# so they are the ones the predicates see first.
uniq_dataset = uniq_dataset.sort(["not_null", "article_length"], reverse=True)
for col, fc in filters.items():
    # Single process only: the predicate keeps its "seen" set in memory, and
    # that state would not be shared between worker processes.
    uniq_dataset = uniq_dataset.filter(
        lambda batch: [fc(item) for item in batch[col]],
        batched=True,
        num_proc=1,
    )
    report_stats(uniq_dataset)

uniq_dataset = uniq_dataset.remove_columns(["not_null", "article_length"])

Map (num_proc=12):   0%|          | 0/2542115 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2542115 [00:00<?, ? examples/s]

1: 72134
2: 436691
3: 143752
4: 367258
5: 762246
6: 181753
Total: 1963834


Filter:   0%|          | 0/1963834 [00:00<?, ? examples/s]

1: 71272
2: 433646
3: 142164
4: 365895
5: 743493
6: 180396
Total: 1936866


Filter:   0%|          | 0/1936866 [00:00<?, ? examples/s]

1: 71261
2: 433573
3: 141961
4: 365813
5: 738288
6: 180248
Total: 1931144


In [7]:
# Checkpoint the de-duplicated dataset; the next cell reloads it from this directory.
uniq_dataset.save_to_disk("uniq_dataset")

Flattening the indices:   0%|          | 0/1931144 [00:00<?, ? examples/s]

Saving the dataset (0/14 shards):   0%|          | 0/1931144 [00:00<?, ? examples/s]

In [7]:
from datasets import load_from_disk
# Resume from the "uniq_dataset" checkpoint.
uniq_dataset = load_from_disk("uniq_dataset")

In [8]:
# Create splits 85/7.5/7.5 by date
from datetime import date
import random


def _date_missing(batch):
    """Keep-mask: True for rows whose ``date`` is None."""
    return [value is None for value in batch["date"]]


def _date_present(batch):
    """Keep-mask: True for rows that have a ``date``."""
    return [value is not None for value in batch["date"]]


# Partition the articles by whether a date is known.
without_date = uniq_dataset.filter(_date_missing, batched=True, num_proc=NUM_PROCESS)
with_date = uniq_dataset.filter(_date_present, batched=True, num_proc=NUM_PROCESS)



Filter (num_proc=4):   0%|          | 0/1931144 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/1931144 [00:00<?, ? examples/s]

In [9]:
# Checkpoint the rows that have no date.
without_date.save_to_disk("without_date")

Flattening the indices:   0%|          | 0/1310 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1310 [00:00<?, ? examples/s]

In [12]:
# Checkpoint the rows that have a date.
with_date.save_to_disk("with_date")

Flattening the indices:   0%|          | 0/1929834 [00:00<?, ? examples/s]

Saving the dataset (0/13 shards):   0%|          | 0/1929834 [00:00<?, ? examples/s]

In [13]:
# Resume from the two checkpoints written above.
with_date = load_from_disk("with_date")
without_date = load_from_disk("without_date")

In [15]:
# Materialise any indices mapping into a contiguous table before sorting.
with_date = with_date.flatten_indices(num_proc=NUM_PROCESS)

Flattening the indices (num_proc=4):   0%|          | 0/1929834 [00:00<?, ? examples/s]

In [16]:

# Order the dated articles by date, so the ratio-based slices taken for the splits follow that order.
with_date = with_date.sort("date")

In [21]:
# Materialise the indices mapping left by the sort into a contiguous table.
with_date = with_date.flatten_indices(num_proc=NUM_PROCESS)

Flattening the indices (num_proc=4):   0%|          | 0/1929834 [00:00<?, ? examples/s]

In [22]:
# Undated rows cannot be ordered by date, so shuffle them with a fixed seed and flatten the result.
without_date = without_date.shuffle(seed=42).flatten_indices(num_proc=NUM_PROCESS)

Flattening the indices (num_proc=4):   0%|          | 0/1310 [00:00<?, ? examples/s]

In [None]:
from datasets import concatenate_datasets

# Cumulative upper bound of each split: train = [0, 0.85),
# validation = [0.85, 0.925), test = [0.925, 1.0].
splits = {
    "train": 0.85,
    "validation": 0.925,
    "test": 1.0,
}

prev_ratio = 0.0
final = {}
for split, ratio in splits.items():
    # Cut the same relative slice from the date-sorted rows and from the
    # shuffled undated rows, then join the two pieces.
    # Fixes over the previous version of this cell:
    #   - `lend(d)` was a typo for `len(d)`;
    #   - Dataset.select takes no `num_proc` argument;
    #   - `concatenate_datasets` was never imported;
    #   - `prev_ratio` was never advanced, so the splits would have overlapped;
    #   - the loop no longer rebinds `dst`, which holds the reference dataset.
    parts = [
        part.select(range(int(prev_ratio * len(part)), int(ratio * len(part))))
        for part in (with_date, without_date)
    ]
    final[split] = concatenate_datasets(parts)
    prev_ratio = ratio


In [78]:
# Fixed: the module name was misspelled as `datsets`.
from datasets import DatasetDict

# Publish the train/validation/test splits to the Hub.
DatasetDict(final).push_to_hub("hynky/czech_news_datasetv_2", use_auth_token=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/8 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/8 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/123 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/123 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Display whatever is currently bound to `dst`; an earlier cell must have defined it in this kernel session.
dst

NameError: name 'dst' is not defined

In [30]:
# Category class names in label-id order, passed to ClassLabel(names=...)
# for the `category` column.
cat_names = [
    "None",
    "Zahraniční",
    "Domácí",
    "Sport",
    "Kultura",
    "Revue",
    "Koktejl",
    "Ekonomika",
    "Krimi",
    "Podnikání",
    "Auto",
    "Věda",
    "Komentáře",
    "Cestování",
    "Finance",
    "Technologie",
    "Bydlení",
    "Koronavirus",
    "Byznys",
    "Rozhovory",
    "Podcasty",
    "Životní styl",
    "Literatura",
    "Vánoce",
    "Výtvarné umění",
    "Kolo",
]

In [32]:
# Number of category classes.
len(cat_names)

26

In [36]:
from datasets import load_dataset, Features, Value, Sequence, ClassLabel
# Explicit schema used to re-read the published dataset.
# NOTE(review): `server` is read as a plain string here, although earlier
# cells cast it to the reference dataset's feature type — confirm that this
# is intended.
gender_label = ClassLabel(names=["None", "Man", "Woman"])
cum_gender_label = ClassLabel(names=["None", "Man", "Woman", "Mixed"])
weekday_label = ClassLabel(
    names=["None", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
)
hub_features = Features(
    {
        "url": Value("string"),
        "server": Value("string"),
        "brief": Value("string"),
        "headline": Value("string"),
        "content": Value("large_string"),
        "authors_gender": Sequence(gender_label),
        "authors_cum_gender": cum_gender_label,
        "authors": Sequence(Value("string")),
        "category": ClassLabel(names=cat_names),
        "day_of_week": weekday_label,
        "date": Value("timestamp[us]"),
    }
)
dst = load_dataset("hynky/czech_news_dataset", features=hub_features)


Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-c5e76f2bb6b5ac40/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/6 [00:00<?, ?it/s]

In [39]:
# Upload the re-typed dataset back to the same Hub repository.
dst.push_to_hub("hynky/czech_news_dataset")

['None',
 'Zahraniční',
 'Domácí',
 'Sport',
 'Kultura',
 'Revue',
 'Koktejl',
 'Ekonomika',
 'Krimi',
 'Podnikání',
 'Auto',
 'Věda',
 'Komentáře',
 'Cestování',
 'Finance',
 'Technologie',
 'Bydlení',
 'Koronavirus',
 'Byznys',
 'Rozhovory',
 'Podcasty',
 'Životní styl',
 'Literatura',
 'Vánoce',
 'Výtvarné umění',
 'Kolo']

In [37]:
# Inspect the first training row as a sanity check.
dst["train"][0]

{'url': 'https://www.idnes.cz/fotbal/prvni-liga/fotbaliste-zacali-pripravu-na-jaro.A_000103_182303_fotbal_vas',
 'server': '2',
 'brief': 'Do 19. února, kdy začne jarní část fotbalové ligy, je daleko, ale včera se začalo naplno trénovat téměř ve všech týmech. Ty, které se drží v tabulce nahoře, udělaly několik změn',
 'headline': 'Fotbalisté začali přípravu na jaro',
 'content': 'Slavia: konečně s Kozlem Ligový půlmistr začal vstup do historického roku bez výrazné posily, přesto vyztužený čtyřmi jmény: poprvé od květnové operace kolena maká naplno reprezentant Kozel, už od konce podzimu s týmem trénují bývalí hráči Krištofík a Hyský a z Rychnova nad Kněžnou přišel devatenáctiletý talent Jan Plašil, jehož bratr hraje v Monaku. "Třeba Krištofík ukázal výkony za béčko, že by nám měl pomoci v důležitých zápasech," ocenil trenér František Cipro. Slávisté se budou připravovat nejen na ligu, ale i Pohár UEFA proti Udine. Oba zápasy byly přesunuty ze čtvrtků na úterý, a to 29. února na Strahov

In [8]:
# Day label names in "Capitalised" form, taken from the train split's
# day_of_week feature and kept in the same order.
day_mapping = [
    name.lower().capitalize()
    for name in dst["train"].features["day_of_week"].names
]

# Internal server key -> display name.
server_mapping = dict(
    [
        ("seznamzpravy", "SeznamZprávy.cz"),
        ("idnes", "iDNES.cz"),
        ("aktualne", "Aktuálně.cz"),
        ("novinky", "Novinky.cz"),
        ("denik", "Deník.cz"),
        ("irozhlas", "iRozhlas.cz"),
        ("None", "None"),
    ]
)

# Upper-case gender key -> display name.
gender_mapping = {label.upper(): label for label in ("Man", "Woman", "Mixed")}
gender_mapping["None"] = "None"

# Column name -> replacement label names for that column.
mapping = dict(
    day_of_week=day_mapping,
    server=server_mapping,
    authors_cum_gender=gender_mapping,
)

In [11]:
from datasets import ClassLabel

# Rename the class labels of selected columns in every split.
#
# The previous version assigned `mapping[f]` straight to `feature.names`.
# For `server` and `authors_cum_gender` that value is a dict, whereas
# ClassLabel.names must be a list in label-id order. Mutating `names` in
# place also leaves the feature's lookup tables stale. The labels are now
# translated in id order and applied with `cast_column`, which keeps the
# stored integer ids and only swaps the names.
new_dataset = {}
for split, split_dataset in dst.items():
    for column in ["server", "authors_cum_gender", "day_of_week"]:
        feature = split_dataset.features[column]
        if not isinstance(feature, ClassLabel):
            # Plain Value columns have no label names to rename.
            continue
        renames = mapping[column]
        if isinstance(renames, dict):
            # old name -> new name; unknown names are kept unchanged.
            new_names = [renames.get(name, name) for name in feature.names]
        else:
            # Already a full list of names in label-id order.
            new_names = list(renames)
        split_dataset = split_dataset.cast_column(column, ClassLabel(names=new_names))
    new_dataset[split] = split_dataset

In [13]:
# Check the day_of_week label names of the train split after the renaming.
new_dataset["train"].features["day_of_week"].names

['None',
 'Monday',
 'Tuesday',
 'Wednesday',
 'Thursday',
 'Friday',
 'Saturday',
 'Sunday']

In [15]:
from datasets import DatasetDict
# Publish the splits with the renamed labels to the Hub repository.
DatasetDict(new_dataset).push_to_hub("hynky/czech_news_dataset")

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/8 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/173 [00:00<?, ?ba/s]