In [18]:
import json
from pathlib import Path
from datetime import date 

NUM_PROCESS = 4
from datasets import Dataset
from datasets import Features, Value
from urllib.parse import urlparse
from datasets import Sequence


def get_server(url):
    parsed = urlparse(url).netloc.split(".")
    server = parsed[-1]
    if server == "cz":
        server = parsed[-2]

    if server == "cz:443":
        server = "aktualne"

    return server

records = []
with open("../records_with_add.jsonl") as f:
    for line in f:
        records.append(json.loads(line))

print(len(records))
print(records[0].keys())

dataset = Dataset.from_list(
    records,
    features=Features(
        {
           "url": Value("string"),
           "author": Sequence(Value("string")),
           "headline": Value("string"),
           "brief": Value("string"),
           "publication_date": Value("string"),
           "keywords": Sequence(Value("string")),
           "category": Value("string"),
           "content": Value("large_string"),  
           "comments_num": Value("int32"),
           "domain_record": {
               "digest": Value("string"),
               "encoding": Value("string"),
               "filename": Value("string"),
               "length": Value("int32"),
               "offset": Value("int32"),
               "timestamp": Value("string"),
               "url": Value("string"),
           },
           "additional_info": {
               "sets": Sequence(Value("string")),
           }
        }
    )
)
# We will postprocess string values later and cast it manually
dataset = dataset.map(lambda batch: {"server": [get_server(url) for url in batch["url"]]}, num_proc=NUM_PROCESS, batched=True)

1627267
dict_keys(['author', 'brief', 'category', 'comments_num', 'content', 'domain_record', 'headline', 'keywords', 'publication_date', 'url', 'additional_info'])


Map (num_proc=4):   0%|          | 0/1627267 [00:00<?, ? examples/s]

FileNotFoundError: [Errno 2] No such file or directory: 'headline_filter.txt'

In [21]:
with open("headline_filter.txt") as f:
    headline_filters = set(f.read().splitlines())

In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [117]:
from postprocessing_utils import (
    postprocess_authors,
    add_day,
    filter_author,
    create_translate,
    create_filter_by_set,
    postprocess_brief,
    postprocess_category,
    postprocess_headline,
    postprocess_date,
    postprocess_content,
    as_Article,
    JSONArticleEncoder,
    add_cum_gender,
    create_add_gender,
    postprocess_keywords,
    create_none_to_x,
)

from postprocessing.authors_utils import auts 
from categories_utils import translate_cat_cz_en, translate_cat

postprocessing = {
    "category": [create_translate(translate_cat, lower=True), postprocess_category, create_filter_by_set(set(translate_cat_cz_en.keys()), lower=True),
     create_translate(translate_cat_cz_en)],
    "authors": [postprocess_authors, create_filter_by_set(set(auts.keys()), lower=True), create_none_to_x([])],
    "brief": [postprocess_brief],
    "headline": [postprocess_headline],
    "content": [postprocess_content],
    "publication_date": [],
    "keywords": [postprocess_keywords, create_none_to_x([])]
}

In [118]:
def to_date(date_str):
    if date_str is None:
        return None
    return datetime.fromisoformat(date_str).date()

from functools import reduce
dataset_post = dataset.select(range(100))
dataset_post = dataset_post.map(lambda batch: {"category_unclean": batch["category"]}, batched=True, num_proc=NUM_PROCESS)


dataset_post = dataset_post.rename_column("author", "authors")

for col, funcs in postprocessing.items():
    dataset_post = dataset_post.map(lambda batch: {col: reduce(lambda arts, f:[ f(art) for art in arts], funcs, batch[col])}, batched=True, num_proc=NUM_PROCESS)



# Convert to date
dataset_post = dataset_post.map(
    lambda batch: { "date": [to_date(dtm) for dtm in batch["publication_date"]] }
    , batched=True, batch_size=None, num_proc=NUM_PROCESS, remove_columns=["publication_date"]
)

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [120]:
augmentations = {
    "authors_gender": create_add_gender(auts),
    "authors_cum_gender": add_cum_gender,
    "day_of_week": add_day,
}



dataset_augmented = dataset_post

for col, func in augmentations.items():
    dataset_augmented = dataset_augmented.map(lambda batch: {col: func(batch)}, batched=False, num_proc=NUM_PROCESS)


Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [64]:
NONE_LABEL = "None"

def NoneToNoneLabel(dst, column):
    return dst.map(lambda batch: {column: [NONE_LABEL if x is None else x for x in batch[column]]}, batched=True, num_proc=NUM_PROCESS)


# %%
from datasets import ClassLabel, Sequence
features = {
    "category": ClassLabel(names=[NONE_LABEL] + list(translate_cat_cz_en.values())),
    "authors_cum_gender": ClassLabel(names=[NONE_LABEL, "Man", "Woman", "Mixed"]),
    "authors_gender": Sequence(ClassLabel(names=[NONE_LABEL ,"Man", "Woman"])),
    "day_of_week": ClassLabel(names=[NONE_LABEL ,"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]),
    "server": ClassLabel(names=[NONE_LABEL, "idnes", "denik", "aktualne", "irozhlas", "seznamzpravy", "novinky"]),
}




# %%
from datasets import ClassLabel, Sequence
casted = dataset_augmented
cast_columns = ["category", "server", "day_of_week"]
for col in cast_columns:
    casted = NoneToNoneLabel(casted, col)
    casted = casted.cast_column(col, features[col])

cast_columns = ["authors_gender", "authors_cum_gender", ]
for col in cast_columns:
    casted = casted.cast_column(col, features[col])


Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

In [89]:
from datasets import DatasetDict
splits = {
    "human": [],
    "train": [],
    "test": [],
    "validation": [],
    "train_small": [],
    "test_small": [],
}

for i,row in enumerate(casted):
    for split in row["additional_info"]["sets"]:
        splits[split].append(i)

splits = {k: casted.select(v) for k,v in splits.items()}
final = DatasetDict(splits)

In [92]:
final.remove_columns(["additional_info", "domain_record"])

DatasetDict({
    human: Dataset({
        features: ['authors', 'brief', 'category', 'comments_num', 'content', 'headline', 'keywords', 'url', 'server', 'category_unclean', 'date', 'authors_gender', 'authors_cum_gender', 'day_of_week'],
        num_rows: 0
    })
    train: Dataset({
        features: ['authors', 'brief', 'category', 'comments_num', 'content', 'headline', 'keywords', 'url', 'server', 'category_unclean', 'date', 'authors_gender', 'authors_cum_gender', 'day_of_week'],
        num_rows: 88
    })
    test: Dataset({
        features: ['authors', 'brief', 'category', 'comments_num', 'content', 'headline', 'keywords', 'url', 'server', 'category_unclean', 'date', 'authors_gender', 'authors_cum_gender', 'day_of_week'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['authors', 'brief', 'category', 'comments_num', 'content', 'headline', 'keywords', 'url', 'server', 'category_unclean', 'date', 'authors_gender', 'authors_cum_gender', 'day_of_week'],
    

In [94]:
final["train"].features

{'authors': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'brief': Value(dtype='string', id=None),
 'category': ClassLabel(names=['None', 'Krimi', 'Výtvarné umění', 'Kultura', 'Bydlení', 'Technologie', 'Podcasty', 'Ekonomika', 'Auto', 'Byznys', 'Kolo', 'Věda', 'Koktejl', 'Komentáře', 'Literatura', 'Cestování', 'Podnikání', 'Revue', 'Finance', 'Rozhovory', 'Sport', 'Vánoce', 'Koronavirus', 'Životní styl', 'Domácí', 'Zahraniční'], id=None),
 'comments_num': Value(dtype='int32', id=None),
 'content': Value(dtype='large_string', id=None),
 'domain_record': {'digest': Value(dtype='string', id=None),
  'encoding': Value(dtype='string', id=None),
  'filename': Value(dtype='string', id=None),
  'length': Value(dtype='int32', id=None),
  'offset': Value(dtype='int32', id=None),
  'timestamp': Value(dtype='string', id=None),
  'url': Value(dtype='string', id=None)},
 'headline': Value(dtype='string', id=None),
 'keywords': Sequence(feature=Value(dtype='string', id=None),

In [91]:
final.save_to_disk("final_dataset")

Saving the dataset (0/1 shards): 0 examples [00:00, ? examples/s]

Flattening the indices:   0%|          | 0/88 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/88 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/7 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/2 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]