In [None]:
import tensorflow as tf
import keras
import keras_nlp
print("Tensorflow 2.16.2 is expected. The running version is", tf.__version__)
print("Keras 3.4.1 is expected. The running version is", keras.__version__)
print("KerasNLP 0.12.1 is expected. The running version is", keras_nlp.__version__)

import numpy as np
import tensorflow_datasets as tfds
# SageMaker cannot use @keras.saving
from keras import saving
import datetime

Tensorflow 2.16.2 is expected. The running version is 2.16.2
Keras 3.4.1 is expected. The running version is 3.4.1
KerasNLP 0.12.1 is expected. The running version is 0.12.1


In [None]:
import datasets
import tensorflow as tf

# This works!
# @see https://huggingface.co/datasets/ccdv/cnn_dailymail
# However, this dataset is for huggingface.
# Hence it should be converted to the Tensorflow format.
# How to convert
# @see https://huggingface.co/docs/datasets/v1.3.0/torch_tensorflow.html#setting-the-format
def convert_hf2tf(dataset: datasets.DatasetDict, split: list[str]):
    dataset.set_format(
        type='tensorflow',
        columns=['article', 'highlights', 'id',]
    )
    l = []
    for s in split:
        d = dataset[s]
        features = {x: d[x] for x in ['article', 'highlights', 'id',]}
        # .batch(32) is not used to show a simple sampled data below with take(1)
        tf_dataset = tf.data.Dataset.from_tensor_slices(features)
        l.append(tf_dataset)
    return tuple(l)
"""
dataset = datasets.load_dataset(
    "ccdv/cnn_dailymail",
    version="3.0.0",
    trust_remote_code=False
)
"""

***
# CNN/Daily Mail

In [None]:
train_ds, validation_ds, test_ds = tfds.load(
    'huggingface:ccdv__cnn_dailymail/3.0.0',
    split=['train', 'validation', 'test'],
    builder_kwargs={
        'trust_remote_code': True,
    },
)

In [None]:
df = tfds.as_dataframe(train_ds)

2024-07-13 01:44:12.758019: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
df.head(5)

Unnamed: 0,article,highlights,id,highlights_count,article_count
0,b'It\'s official: U.S. President Barack Obama ...,b'Syrian official: Obama climbed to the top of...,b'0001d1afc246a7964130f43ae940af6bc6c57f01',55,1541
1,b'(CNN) -- Usain Bolt rounded off the world ch...,"b""Usain Bolt wins third gold of world champion...",b'0002095e55fcbd3a2f366d9bf92a95433dc305ef',32,522
2,"b'Kansas City, Missouri (CNN) -- The General S...",b'The employee in agency\'s Kansas City office...,b'00027e965c8264c35cc1bc55556db388da82b07f',41,1013
3,b'Los Angeles (CNN) -- A medical doctor in Van...,b'NEW: A Canadian doctor says she was part of ...,b'0002c17436637c4fe1837c935c04de47adb18e9a',53,932
4,b'(CNN) -- Police arrested another teen Thursd...,b'Another arrest made in gang rape outside Cal...,b'0003ad6ef0c37534f80b55b4235108024b407f0b',36,288


In [None]:
"""
The first code takes seconds to simply count words.
On the other hand, the latter code that can precisely tokenize takes more than 7 minutes.
"""
df['highlights_count'] = df['highlights'].apply(lambda x: len(x.split()))
df['article_count'] = df['article'].apply(lambda x: len(x.split()))

# import nltk

# def count_words(s):
#     s = s.decode('utf-8')
#     return len(nltk.word_tokenize(s))

# df['highlights_count'] = df['highlights'].map(count_words)
# df['article_count'] = df['article'].map(count_words)

'\nThe above code takes \n'

In [None]:
df.describe()

Unnamed: 0,highlights_count,article_count
count,287113.0,287113.0
mean,51.478118,691.626074
std,21.18601,336.493356
min,4.0,8.0
25%,38.0,443.0
50%,48.0,632.0
75%,60.0,877.0
max,1296.0,2347.0


***
# BillSum

In [None]:
ds = datasets.load_dataset(
    "FiscalNote/billsum",
    trust_remote_code=False
)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [None]:
ds.set_format("pandas")
df = ds['train'][:]

In [None]:
df.head(5)

Unnamed: 0,text,summary,title
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...,A bill to limit the civil liability of busines...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...,Human Rights Information Act
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...,Jackie Robinson Commemorative Coin Act
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...,To amend the Internal Revenue Code to provide ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...,Native American Energy Act


In [None]:
df['title_count'] = df['title'].apply(lambda x: len(x.split()))
df['summary_count'] = df['summary'].apply(lambda x: len(x.split()))
df['text_count'] = df['text'].apply(lambda x: len(x.split()))


In [None]:
df.describe()

Unnamed: 0,title_count,summary_count,text_count
count,18949.0,18949.0,18949.0
mean,18.461977,179.119268,1289.393055
std,14.413841,115.694566,518.105838
min,1.0,8.0,194.0
25%,7.0,92.0,857.0
50%,14.0,157.0,1166.0
75%,27.0,240.0,1644.0
max,137.0,808.0,3055.0


***
# aeslc

In [None]:
ds = datasets.load_dataset(
    "Yale-LILY/aeslc",
    trust_remote_code=False
)

In [None]:
ds.set_format("pandas")
df = ds['train'][:]
df.head(5)

Unnamed: 0,email_body,subject_line
0,"Greg/Phillip, Attached is the Grande Communic...",Service Agreement
1,Phillip & Keith Attached is the first draw re...,Bishops Corner
2,Your Internet Banking accounts are now setup a...,Internet Banking
3,To our IBS Customers that are still hanging in...,Internet Banking
4,Phillip Good Morning!\nI hope you had a wonder...,SMEs for expert stories


In [None]:
df['email_body_count'] = df['email_body'].apply(lambda x: len(x.split()))
df['subject_line_count'] = df['subject_line'].apply(lambda x: len(x.split()))

In [None]:
df.describe()

Unnamed: 0,email_body_count,subject_line_count
count,14436.0,14436.0
mean,118.231366,3.975686
std,148.9686,2.553487
min,25.0,1.0
25%,45.0,2.0
50%,74.0,3.0
75%,131.0,5.0
max,3136.0,15.0


***
# booksum

In [None]:
ds = datasets.load_dataset(
    "kmfoda/booksum",
    trust_remote_code=False
)
ds.set_format("pandas")
df = ds['train'][:]
df.head(5)

Unnamed: 0,bid,is_aggregate,source,chapter_path,summary_path,book_id,summary_id,content,summary,chapter,chapter_length,summary_name,summary_url,summary_text,summary_analysis,summary_length,analysis_length
0,27681,True,cliffnotes,all_chapterized_books/27681-chapters/chapters_...,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapters 1-2,chapters 1-2,,"{""name"": ""Chapters 1-2"", ""url"": ""https://web.a...","\n ""Mine ear is open, and my heart prepared:\...",6471.0,Chapters 1-2,https://web.archive.org/web/20201101053205/htt...,"Before any characters appear, the time and geo...",These two chapters introduce the reader to the...,388.0,473.0
1,27681,False,cliffnotes,all_chapterized_books/27681-chapters/03.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 3,chapter 3,,"{""name"": ""Chapter 3"", ""url"": ""https://web.arch...","\n ""Before these fields were shorn and tilled...",3132.0,Chapter 3,https://web.archive.org/web/20201101053205/htt...,In another part of the forest by the river a f...,This chapter introduces the other three main a...,198.0,149.0
2,27681,False,cliffnotes,all_chapterized_books/27681-chapters/04.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 4,chapter 4,,"{""name"": ""Chapter 4"", ""url"": ""https://web.arch...","\n ""Well, go thy way: thou shalt not from thi...",3075.0,Chapter 4,https://web.archive.org/web/20201101053205/htt...,When the mounted party from Fort Howard approa...,Since this chapter is mostly one of surface ac...,319.0,75.0
3,27681,False,cliffnotes,all_chapterized_books/27681-chapters/05.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 5,chapter 5,,"{""name"": ""Chapter 5"", ""url"": ""https://web.arch...","\n ""In such a night\n Di...",3268.0,Chapter 5,https://web.archive.org/web/20201101053205/htt...,"The pursuit of Magua is unsuccessful, but Hawk...",Here the reader encounters the first bloodshed...,329.0,156.0
4,27681,False,cliffnotes,all_chapterized_books/27681-chapters/06.txt,finished_summaries/cliffnotes/The Last of the ...,The Last of the Mohicans.chapter 6,chapter 6,,"{""name"": ""Chapter 6"", ""url"": ""https://web.arch...","\n ""Those strains that once did sweet in Zion...",3873.0,Chapter 6,https://web.archive.org/web/20201101053205/htt...,Heyward and the girls are uneasy and Gamut is ...,This chapter shows Cooper in his most inventiv...,321.0,128.0


In [None]:
df['summary_analysis_count'] = df['summary_analysis'].apply(lambda x: 0 if x is None else len(x.split()))
df['summary_text_count'] = df['summary_text'].apply(lambda x: 0 if x is None else len(x.split()))

In [None]:
df.describe()

Unnamed: 0,bid,content,chapter_length,summary_length,analysis_length,summary_analysis_count,summary_text_count
count,9600.0,0.0,9600.0,9600.0,9600.0,9600.0,9600.0
mean,5314.669375,,3897.230625,376.896354,274.324063,272.387187,376.80375
std,10504.196539,,4203.548176,331.915025,385.446081,383.108941,331.87908
min,11.0,,42.0,2.0,1.0,0.0,2.0
25%,345.0,,1674.0,171.0,1.0,0.0,171.0
50%,1254.0,,2779.0,283.0,133.0,132.0,283.0
75%,2641.0,,4571.0,467.0,466.0,465.0,467.0
max,45631.0,,114226.0,4852.0,5761.0,5761.0,4852.0


***
# newsroom

In [None]:
ds = datasets.load_dataset(
    "newsroom",
    trust_remote_code=True
)
ds.set_format("pandas")
df = ds['train'][:]
df.head(5)

Downloading builder script:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

ManualDownloadError:                   The dataset newsroom with config default requires manual data.
                  Please follow the manual download instructions:
                     You should download the dataset from https://lil.nlp.cornell.edu/newsroom/download/index.html
The webpage requires registration.
To unzip the .tar file run `tar -zxvf complete.tar`. To unzip the .gz files
run `gunzip train.json.gz` , ...
After downloading, please put the files under the following names
dev.jsonl, test.jsonl and train.jsonl in a dir of your choice,
which will be used as a manual_dir, e.g. `~/.manual_dirs/newsroom`
Newsroom can then be loaded via:
`datasets.load_dataset("newsroom", data_dir="~/.manual_dirs/newsroom")`.

                  Manual data can be loaded with:
                   datasets.load_dataset("newsroom", data_dir="<path/to/manual/data>")

***
# reddit

**Downloading and preparing dataset 2.93 GiB (download: 2.93 GiB, generated: 18.09 GiB, total: 21.01 GiB) to**

In [None]:
ds = tfds.load('reddit')

***
# reddit_tifu

In [74]:
ds = datasets.load_dataset(
    "ctr4si/reddit_tifu", 'short',
    trust_remote_code=True
)
ds.set_format("pandas")
df = ds['train'][:]
df.head(5)

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/79740 [00:00<?, ? examples/s]

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title
0,50.0,13.0,0.77,50.0,i was on skype on my tablet as i went to the t...,,forgetting to pull my underwear down before i ...
1,115.0,23.0,0.88,115.0,this actually happened a couple of years ago. ...,confuse a 5th grade girl for a boy in front of...,gender-stereotyping
2,14.0,11.0,0.78,14.0,i tend to leave half-drank beers in my garage....,,drinking a beer
3,16.0,12.0,0.79,16.0,"it was last october, but i'm feeling the fall-...","i found my estranged dad, thought i loved him ...",telling my dad that i love him.
4,0.0,2.0,0.42,0.0,"flashback to the past, almost exactly one year...",,how hard can you fail at something?


In [77]:
df['title_count'] = df['title'].apply(lambda x: 0 if x is None else len(x.split()))
df['documents_count'] = df['documents'].apply(lambda x: 0 if x is None else len(x.split()))

In [78]:
df.describe()

Unnamed: 0,ups,num_comments,upvote_ratio,score,title_count,documents_count
count,79740.0,79740.0,79740.0,79740.0,79740.0,79740.0
mean,254.228363,37.300388,0.763873,254.228363,6.781703,308.035892
std,1677.370361,155.337738,0.178532,1677.370361,3.709242,259.630296
min,0.0,0.0,0.0,0.0,1.0,1.0
25%,2.0,3.0,0.67,2.0,4.0,148.0
50%,9.0,7.0,0.81,9.0,6.0,242.0
75%,40.0,18.0,0.9,40.0,8.0,387.0
max,96587.0,4506.0,1.0,96587.0,58.0,6019.0


***
# samsum

In [82]:
%pip install py7zr

Collecting py7zr
  Downloading py7zr-0.21.1-py3-none-any.whl.metadata (17 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-macosx_10_9_universal2.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.7 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.0 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.0 kB)
Collecting brotli>=1.1.0 (from py7zr)
  Downloading Brotli-1.1.0-cp310

In [83]:
ds = datasets.load_dataset(
    'Samsung/samsum',
    trust_remote_code=True
)
ds.set_format("pandas")
df = ds['train'][:]
df.head(5)

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [84]:
df['summary_count'] = df['summary'].apply(lambda x: 0 if x is None else len(x.split()))
df['dialogue_count'] = df['dialogue'].apply(lambda x: 0 if x is None else len(x.split()))

In [86]:
len(df)

14732

In [85]:
df.describe()

Unnamed: 0,summary_count,dialogue_count
count,14732.0,14732.0
mean,20.317472,93.786383
std,11.153815,74.033457
min,1.0,0.0
25%,12.0,39.0
50%,18.0,73.0
75%,27.0,128.0
max,64.0,803.0


***
# scientific_papers

In [None]:
ds = tfds.load('scientific_papers')

**Downloading and preparing dataset 4.20 GiB (download: 4.20 GiB, generated: 7.07 GiB, total: 11.27 GiB) to**