In [None]:
# Download and untar the XSUM dataset 
! wget http://bollin.inf.ed.ac.uk/public/direct/XSUM-EMNLP18-Summary-Data-Original.tar.gz
! tar -xf XSUM-EMNLP18-Summary-Data-Original.tar.gz

In [1]:
import os, io, boto3, sagemaker
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

s3_resource = boto3.resource('s3')

session = sagemaker.Session()
session_bucket = session.default_bucket()

In [5]:
# Extract every summary and text body from the downloaded files 
summaries = []
bodies = []

for filename in tqdm(os.listdir('./bbc-summary-data')):
    with open(f'./bbc-summary-data/{filename}') as h: 
        data = h.readlines()
        end_of_file = len(data)
        
        i = 0        
        while i < end_of_file:
            
            if data[i] == '[SN]FIRST-SENTENCE[SN]\n':
                i += 1
                new_summary = ''
                while data[i] != '\n':
                    new_summary += f'{data[i].strip()} '
                    i += 1
                summaries.append(new_summary)
                
            elif data[i] == '[SN]RESTBODY[SN]\n':
                i += 1
                new_body = ''
                while i != end_of_file - 1:
                    new_body += f'{data[i].strip()} '
                    i += 1
                bodies.append(new_body)
            
            i += 1

100%|██████████| 237018/237018 [01:33<00:00, 2524.89it/s]


In [7]:
# Create DataFrame, clean all missing text bodies / summaries
df = pd.DataFrame({'text':bodies,'summary':summaries})
raw_size = len(df.index)

df.replace("",float("NaN"),inplace=True)
df.dropna(inplace=True)
print(f'Deleted {raw_size-len(df.index)} empty samples')

Deleted 395 empty samples


In [8]:
# Split into train/validation datasets
train_df, validation_df = train_test_split(df,test_size=0.15,random_state=7)

train_df.name = 'train'
validation_df.name = 'validation'

In [9]:
# Take a look at the layout of the DataFrame 
train_df.head()

Unnamed: 0,text,summary
76618,"Rail, Maritime and Transport union (RMT) membe...",Possible strike action could disrupt Caledonia...
232137,The vote in favour - by 46 out of 81 MPs - pav...,Montenegro's parliament has ratified the count...
18091,While National Museum Wales can thank the rema...,The richness of an art gallery's collection is...
186450,Media playback is unsupported on your device 3...,A video showing SNP councillors burning a copy...
83850,23 September 2016 Last updated at 14:12 BST It...,Scientists are trying to help save coral reefs...


In [11]:
# Take a look at a full example
print(f'BODY: {train_df.text[10]}\n\n')
print(f'SUMMARY: {train_df.summary[10]}')

BODY: Romanian tourist Andreea Cristea, 29, was in London with partner Andrei Burnaz to celebrate his birthday, when she was hurled into the Thames. She remains unconscious in a London hospital, the Romanian ambassador, Dan Mihalache, told BBC News. He described her condition as "stable, but in a good direction". "It's a miracle she survived", he told BBC News on Friday. "She was practically thrown into the Thames." Mr Mihalache said he thought the attacker's car mounted the pavement and hit Mr Burnaz first, before pushing Ms Cristea into the Thames. "That's quite dramatic", he said. "We hope that all will be okay. In the end she survived, she was strong enough." It was previously not known whether she jumped to escape the car or was hit and hurled into the water. After being rescued from the water, Ms Cristea had an operation for a blood clot on her brain while Mr Burnaz sustained a broken foot. Her family, who are now in London, have asked for privacy as she recovers and Mr Mihalache

In [56]:
# Write dataframe to a buffer in CSV format, and upload it to S3 
for df in [train_df, validation_df]:
    buffer = io.StringIO()
    df.to_csv(buffer, index=False)
    s3_resource.Object(session_bucket,f'xsum-dataset/{df.name}.csv').put(Body=buffer.getvalue())