In [1]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("multi_news",trust_remote_code=True)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})


In [5]:
dataset["train"][0]

{'document': 'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. \n \n Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you’re here, why don’t you sign up to follow us on Twitter. \n \n Enjoy the show. ||||| Employers pulled back sharply on hiring last month, a reminder that the U.S. economy 

In [7]:
dataset["test"][0]

{'document': 'GOP Eyes Gains As Voters In 11 States Pick Governors \n \n Enlarge this image toggle caption Jim Cole/AP Jim Cole/AP \n \n Voters in 11 states will pick their governors tonight, and Republicans appear on track to increase their numbers by at least one, with the potential to extend their hold to more than two-thirds of the nation\'s top state offices. \n \n Eight of the gubernatorial seats up for grabs are now held by Democrats; three are in Republican hands. Republicans currently hold 29 governorships, Democrats have 20, and Rhode Island\'s Gov. Lincoln Chafee is an Independent. \n \n Polls and race analysts suggest that only three of tonight\'s contests are considered competitive, all in states where incumbent Democratic governors aren\'t running again: Montana, New Hampshire and Washington. \n \n While those state races remain too close to call, Republicans are expected to wrest the North Carolina governorship from Democratic control, and to easily win GOP-held seats in

In [9]:
articles = [x for x in dataset["train"]["document"]]

In [11]:
article_count = [len(y.split('|||||')) for y in articles]

In [13]:
for i in set(article_count):
    print(i,':',article_count.count(i))

1 : 504
2 : 23743
3 : 12577
4 : 4921
5 : 1845
6 : 707
7 : 371
8 : 194
9 : 81
10 : 29


# Article Splitting and Text Cleaning

In [16]:
import pandas as pd

In [18]:
news_df = pd.DataFrame(dataset["train"])

In [20]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44972 entries, 0 to 44971
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  44972 non-null  object
 1   summary   44972 non-null  object
dtypes: object(2)
memory usage: 702.8+ KB


In [22]:
import string

In [24]:
import re

In [26]:
punct_to_remove = string.punctuation

In [28]:
# Convert the content to lowercase
news_df["document"] = news_df["document"].str.lower()
# Remove the punctuations
news_df["document"] = news_df["document"].str.translate(str.maketrans('','',punct_to_remove))
# Replace White Spaces
news_df["document"] = news_df["document"].str.replace(" ","")
# Remove Special Characters
news_df["document"] = news_df["document"].apply(lambda x : re.sub(r'[^a-zA-Z0-9]',"",x))

In [30]:
# Convert to lowercase
news_df["summary"] = news_df["summary"].str.lower()
# Remove the punctuations
news_df["summary"] = news_df["summary"].str.translate(str.maketrans('','',punct_to_remove))
# Replace White Spaces
news_df["summary"] = news_df["summary"].str.replace(" ","")
# Remove Special Characters
news_df["summary"] = news_df["summary"].apply(lambda x : re.sub(r'[^a-zA-Z0-9]',"",x))

In [32]:
news_df["articles"] = news_df["document"].str.split(r"\s*\|\|\|\|\|\s*")

In [34]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44972 entries, 0 to 44971
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  44972 non-null  object
 1   summary   44972 non-null  object
 2   articles  44972 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


# Tokenization

In [37]:
from transformers import AutoTokenizer

In [39]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [41]:
news_df["tokenized_article"] = news_df["articles"].apply(
    lambda x : tokenizer.batch_encode_plus(x, truncation=True, padding="max_length", max_length=1024)
                            )

In [43]:
news_df["tokenized_article"][0]

{'input_ids': [[0, 11535, 45974, 10932, 405, 620, 16001, 4235, 23201, 22588, 2258, 405, 620, 700, 9502, 12997, 21746, 20168, 700, 2151, 14746, 1990, 1264, 6294, 2527, 428, 24062, 27363, 1342, 627, 15979, 2527, 506, 12764, 20521, 605, 40886, 463, 17894, 20521, 1322, 1250, 36967, 261, 39677, 8223, 41207, 2001, 23053, 45061, 2533, 700, 3698, 119, 27877, 2963, 405, 9762, 1990, 9193, 16705, 9591, 620, 16152, 7948, 415, 35781, 424, 4651, 219, 9657, 958, 1529, 2961, 1264, 20168, 700, 7877, 41975, 3277, 282, 1115, 10393, 1478, 9178, 627, 28310, 219, 506, 6537, 37460, 627, 5234, 24963, 7861, 43726, 13771, 1635, 1322, 1990, 25534, 151, 4651, 30056, 620, 2413, 3204, 34381, 39388, 560, 28310, 1952, 23605, 196, 1409, 417, 1722, 267, 6909, 4651, 4184, 7948, 11828, 6537, 560, 29240, 151, 41207, 26551, 9433, 3209, 48540, 627, 879, 37011, 7954, 1496, 1178, 23088, 90, 2678, 279, 16118, 219, 415, 6361, 700, 12353, 2989, 1610, 2681, 1343, 3056, 1610, 1529, 2961, 21685, 44951, 1766, 23033, 5219, 23782, 90,

In [45]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44972 entries, 0 to 44971
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   document           44972 non-null  object
 1   summary            44972 non-null  object
 2   articles           44972 non-null  object
 3   tokenized_article  44972 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


In [None]:
news_df["token_length"] = news_df["document"].apply(lambda x: len(tokenizer.encode(x, truncation=False)))
print(news_df["token_length"].describe())