In [1]:
import pandas as pd
import numpy as np
import transformers
import nltk
import spacy
import datasets
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
ds = load_dataset('multi_news')

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [4]:
ds['train'][0]

{'document': 'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. \n \n Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you’re here, why don’t you sign up to follow us on Twitter. \n \n Enjoy the show. ||||| Employers pulled back sharply on hiring last month, a reminder that the U.S. economy 

In [5]:
ds['train'].features

{'document': Value(dtype='string', id=None),
 'summary': Value(dtype='string', id=None)}

In [6]:
ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])
ds_validation = pd.DataFrame(ds['validation'])

In [7]:
# remove punctuation, capitalization and stop words from ds_train, ds_test, ds_validation in place
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

ds_train['document'] = ds_train['document'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
ds_test['document'] = ds_test['document'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
ds_validation['document'] = ds_validation['document'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kiddstudio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# convert all uppercase to lower case
ds_train['document'] = ds_train['document'].str.lower()
ds_test['document'] = ds_test['document'].str.lower()

# remove special characters 
ds_train['document'] = ds_train['document'].str.replace('[^a-zA-Z0-9\s]','', regex=True)
ds_test['document'] = ds_test['document'].str.replace('[^a-zA-Z0-9\s]','', regex=True)

In [9]:
#remove string "(ap)"  from all rows in ds_train and ds_test
ds_train['document'] = ds_train['document'].str.replace(r'\(ap\)','', regex=True).fillna('')
ds_test['document'] = ds_test['document'].str.replace(r'\(ap\)','', regex=True).fillna('')



In [10]:
ds_train['document']

0        national archives yes its time again folks its...
1        los angeles ap  in first interview since nba b...
2        gaithersburg md ap  a small private jet crashe...
3        tucker carlson exposes his own sexism twitter ...
4        a man accused removing another mans testicle m...
                               ...                        
44967    more 670000 copies pearls selfpublished book c...
44968    seeking costconscious consumers gravitated tow...
44969    click email friend opens new window click shar...
44970    barrington ri ap  women clad yoga pants plan p...
44971    based real story hit john travolta film saturd...
Name: document, Length: 44972, dtype: object

In [11]:
# Convert dataframes into Dataset objects
# assign the splits
train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)

# reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)

In [12]:
new_ds

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [13]:
new_ds['train']

Dataset({
    features: ['document', 'summary'],
    num_rows: 44972
})

In [14]:
new_ds['train']['document'][0]

'national archives yes its time again folks its first friday month one eversobrief moment interests wall street washington main street aligned one thing jobs a fresh update us employment situation january hits wires 830 am new york time offering one important snapshots economy fared previous month expectations 203000 new jobs created according economists polled dow jones newswires compared 227000 jobs added february the unemployment rate expected hold steady 83 here marketbeat hq well offering color commentary data crosses wires feel free weighin yourself via comments section and youre here dont sign follow us twitter enjoy show  employers pulled back sharply hiring last month reminder us economy may growing fast enough sustain robust job growth the unemployment rate dipped mostly americans stopped looking work the labor department says economy added 120000 jobs march 200000 previous three months the unemployment rate fell 82 percent lowest since january 2009 the rate dropped fewer peo

In [15]:
# Convert DatasetDict to DataFrames
train_df = pd.DataFrame(new_ds['train'])
test_df = pd.DataFrame(new_ds['test'])

# Export DataFrames to CSV
train_df.to_csv('../data/processed/new_ds_train_dataset.csv', index=False)
test_df.to_csv('../data/processed/new_ds_test_dataset.csv', index=False)


In [16]:
ds_validation.to_csv('../data/validation_set.csv', index=False)