In [None]:
!pip install -U transformers
!pip install -U datasets



## News dataset

In [None]:
import json
import random
from datasets import Dataset, load_dataset
import pandas as pd

# Seed the stdlib `random` generator so the random.sample() draws used for
# article sampling are repeatable when the notebook is run top to bottom.
random.seed(42)

In [None]:
def load_json_file(year, data_dir='/content/drive/MyDrive/Colab Notebooks/news_agent/news_dataset'):
  """Load the processed news dump for a single year.

  Args:
    year: Two-digit year suffix (e.g. ``17`` for 2017). It is appended to
      ``"20"`` to form the file name ``20<year>_processed.json``.
    data_dir: Directory that holds the yearly JSON files. Defaults to the
      Google Drive folder this notebook was written against, so existing
      ``load_json_file(year)`` calls keep reading the same files.

  Returns:
    The parsed JSON content of the file.

  Raises:
    FileNotFoundError: If there is no file for the requested year.
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  path = f'{data_dir}/20{year}_processed.json'
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's locale default encoding.
  with open(path, "r", encoding="utf-8") as f:
    return json.load(f)

In [None]:
# Load every yearly dump into memory, keyed by its two-digit year
# (17 -> 2017, ..., 23 -> 2023).
data = {}

for year in range(17,24):  # range end is exclusive: covers 17 through 23
  print(f'Loading year 20{year} data...')
  data[year] = load_json_file(year)

print('Finished')

Loading year 2017 data...
Loading year 2018 data...
Loading year 2019 data...
Loading year 2020 data...
Loading year 2021 data...
Loading year 2022 data...
Loading year 2023 data...
Finished


In [None]:
# Print the number of articles per year, in the order the years were loaded
# (dicts preserve insertion order, so 2017 comes first).
for year in data.keys():
  print(len(data[year]))


403
407
1588
1855
20706
21204
24811


In [None]:
def sample_articles(data, n):
    """Return a random sample of ``data`` without replacement.

    At most ``n`` items are drawn; when ``data`` holds fewer than ``n``
    items, the sample size is capped at ``len(data)``. The draw uses the
    module-level ``random`` generator, so it honours ``random.seed``.
    """
    population_size = len(data)
    sample_size = population_size if population_size < n else n
    return random.sample(data, sample_size)

In [None]:
# Assemble the pool of candidate training articles.
train_articles = []

# 2017-2019: keep every article from these years.
for year in (17, 18, 19):
  train_articles.extend(data[year])

# 2021-2023: keep a random sample of at most 1000 articles per year.
# Year 2020 is deliberately skipped: it is used for news API simulation and
# is kept out of training to prevent data leakage.
for year in (21, 22, 23):
  train_articles.extend(sample_articles(data[year], 1000))


In [None]:
# Size of the pooled set; in the recorded run this is
# 403 + 407 + 1588 (2017-2019, all kept) + 3 * 1000 (sampled) = 5398.
len(train_articles)

5398

In [None]:
# Put article bodies and their summaries side by side in a DataFrame, then add
# a whitespace-delimited word count per article to inspect the length
# distribution.
d = {
    'Articles': [a['maintext'] for a in train_articles],
    'Summaries': [a['description'] for a in train_articles],
}

df = pd.DataFrame(d)
df['len'] = df['Articles'].apply(lambda text: len(text.split()))
df.describe()

Unnamed: 0,len
count,5398.0
mean,955.890886
std,1218.315055
min,19.0
25%,387.0
50%,577.5
75%,823.0
max,10544.0


In [None]:
# Check dtypes and non-null counts per column; in the recorded run only
# 5378 of the 5398 rows have a non-null summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Articles   5398 non-null   object
 1   Summaries  5378 non-null   object
 2   len        5398 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 126.6+ KB


In [None]:
def filter_by_token_length(data, key, max_tokens):
    """Keep the items of ``data`` whose text under ``key`` is short enough.

    An item is kept only if it contains ``key``, the value stored there is a
    ``str``, and that string splits on whitespace into at most ``max_tokens``
    pieces. "Tokens" here are whitespace-separated words, not model tokens.
    The kept items are returned in a new list, in their original order.
    """
    kept = []
    for record in data:
        if key not in record:
            continue
        text = record[key]
        if isinstance(text, str) and len(text.split()) <= max_tokens:
            kept.append(record)
    return kept

In [None]:
# Keep only articles of at most 577 whitespace-separated words. 577 is the
# median article length from the describe() table above (50% = 577.5), so in
# the recorded run this keeps exactly half of the pool (2699 of 5398).
train_articles = filter_by_token_length(train_articles, "maintext", 577)
len(train_articles)

2699

In [None]:
# Convert the raw articles into training records:
# * keep only two keys: "article" (the body text) and "summary" (the description)
# * strip surrounding whitespace from both texts
# * prepend "summarize: " to every article (presumably the task prefix the
#   model being fine-tuned expects - TODO confirm)
# * drop items whose description is None
#
# NOTE(review): this rebinds train_articles to records that no longer have the
# "maintext"/"description" keys, so re-running this cell on its own raises
# KeyError; re-run from the cell that builds train_articles instead.

train_articles = [
    {
        "article": "summarize: " + item["maintext"].strip(),
        "summary": item["description"].strip()
    }
    for item in train_articles
    if item["description"] is not None
]

In [None]:
# Number of training records left once articles without a description are dropped.
len(train_articles)

2680

In [None]:
# Wrap the list of {"article", "summary"} records in a Hugging Face Dataset.
train_dataset = Dataset.from_list(train_articles)

In [None]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['article', 'summary'],
    num_rows: 2680
})

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget so the upload that follows
# runs with an authenticated account.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Publish the dataset to the Hugging Face Hub. The repo id is spelled out in
# full (namespace included) so the upload always targets the same repo that
# the load_dataset() call later in this notebook reads back, instead of
# whichever namespace the logged-in account happens to resolve to.
train_dataset.push_to_hub("Kallia/stock-news-summaries")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Kallia/stock-news-summaries/commit/1870f44dc12ef59fc6a3200e9a30b81832fff268', commit_message='Upload dataset', commit_description='', oid='1870f44dc12ef59fc6a3200e9a30b81832fff268', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Kallia/stock-news-summaries', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Kallia/stock-news-summaries'), pr_revision=None, pr_num=None)

In [None]:
# Read the published train split back from the Hub to check that the upload round-trips.
remote_dataset = load_dataset("Kallia/stock-news-summaries", split="train")


README.md:   0%|          | 0.00/313 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.34M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2680 [00:00<?, ? examples/s]

In [None]:
# Display the downloaded dataset; in the recorded run it has the same
# features and row count (2680) as train_dataset.
remote_dataset

Dataset({
    features: ['article', 'summary'],
    num_rows: 2680
})