In [None]:
# Download and untar the XSUM dataset 
! wget http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz
! tar -xf XSUM-EMNLP18-Summary-Data-Original.tar.gz

In [3]:
import os, io, boto3, sagemaker
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

s3_resource = boto3.resource('s3')

session = sagemaker.Session()
session_bucket = session.default_bucket()

In [4]:
# Extract every summary and text body from the downloaded files 
summaries = []
bodies = []

for filename in tqdm(os.listdir('./bbc-summary-data')):
    with open(f'./bbc-summary-data/{filename}') as h: 
        data = h.readlines()
        end_of_file = len(data)
        
        i = 0        
        while i < end_of_file:
            
            if data[i] == '[SN]FIRST-SENTENCE[SN]\n':
                i += 1
                new_summary = ''
                while data[i] != '\n':
                    new_summary += f'{data[i].strip()} '
                    i += 1
                summaries.append(new_summary)
                
            elif data[i] == '[SN]RESTBODY[SN]\n':
                i += 1
                new_body = ''
                while i != end_of_file - 1:
                    new_body += f'{data[i].strip()} '
                    i += 1
                bodies.append(new_body)
            
            i += 1

100%|██████████| 237018/237018 [01:40<00:00, 2352.69it/s]


In [5]:
# Clean all missing text bodies / summaries 
n_nulls = 0
for i,e in reversed(list(enumerate(bodies))):
    if bodies[i] == '' or summaries[i] == '':
        del bodies[i], summaries[i]
        n_nulls += 1

print(f'Deleted {str(n_nulls)} empty samples')

Deleted 395 empty samples


In [6]:
# Convert data to Pandas Dataframe, split into train/validation
train_df, validation_df = train_test_split(pd.DataFrame({'text':bodies,'summary':summaries}), test_size=0.15, random_state=7)

train_df.name = 'train'
validation_df.name = 'validation'

In [7]:
# Take a look at the layout of the DataFrame 
train_df.head()

Unnamed: 0,text,summary
76491,"Rail, Maritime and Transport union (RMT) membe...",Possible strike action could disrupt Caledonia...
231752,The vote in favour - by 46 out of 81 MPs - pav...,Montenegro's parliament has ratified the count...
18064,While National Museum Wales can thank the rema...,The richness of an art gallery's collection is...
186141,Media playback is unsupported on your device 3...,A video showing SNP councillors burning a copy...
83710,23 September 2016 Last updated at 14:12 BST It...,Scientists are trying to help save coral reefs...


In [56]:
# Write dataframe to a buffer in CSV format, and upload it to S3 
for df in [train_df, validation_df]:
    buffer = io.StringIO()
    df.to_csv(buffer, index=False)
    s3_resource.Object(session_bucket,f'xsum-dataset/{df.name}.csv').put(Body=buffer.getvalue())