In [45]:
PATH = "/diskA/jethro/cnn/"
RAW = f'{PATH}/raw'

In [46]:
import pandas as pd
import os
from tqdm import tqdm_notebook

In [47]:
stories = os.listdir(RAW)
len(stories)

92579

# Processing Stories

We limit ourselves to `STORY_LENGTH` because the input sequence size has a direct correlation to the number of parameters in the model. We also pick a small number of summaries because the shorter the input `STORY_LENGTH`, the less probable it is to arrive at the later highlights. 

In [4]:
STORY_LENGTH          = 100
NUM_SUMMARIES_TO_SAVE = 1

For processing, we do a few things:

1. Remove non-ascii characters.
2. Remove the CNN story prepends, because these do not contribute to the summaries at all.
3. Replace different quotation marks with normal quotation marks

In [5]:
def is_ascii(token):
    return all(ord(c) < 138 for c in token)

In [48]:
def process_stories(stories):
    processed_stories = []
    for s in tqdm_notebook(stories):
        tt = open(f'{RAW}/{s}').readlines()
        tt = [t.replace("-LRB- CNN Student News -RRB- --", "")
              .replace("-LRB- CNN -RRB- --", "")
              .replace("-LRB-", "")
              .replace("-RRB-", "")
              .replace("''", '"')
              .replace("``", '"')
              .replace("NEW :", "")
              .strip() for t in tt if t != "\n"]
        paragraph = []
        summary = []
        flag = True
        for i, t in enumerate(tt):
            if t == "@highlight":
                flag = False
                summary.append(tt[i+1])
            elif flag:
                paragraph.append(t)
        paragraph_tokens = " ".join(paragraph).split()
        summary_tokens = " ".join(summary[:NUM_SUMMARIES_TO_SAVE]).split()
        paragraph_tokens = [token for token in paragraph_tokens if is_ascii(token) and not token.isspace()]
        summary_tokens = [token for token in summary_tokens if is_ascii(token) and not token.isspace()]
        if paragraph_tokens and summary_tokens:
            paragraph = " ".join(paragraph_tokens[:STORY_LENGTH])
            summary = " ".join(summary_tokens)
            processed_stories.append({
                "story": paragraph,
                "summary": summary
            })
    return processed_stories

In [None]:
processed_stories = process_stories(stories)

We then dump everything into a Pandas Dataframe for data exploration.

In [15]:
df = pd.DataFrame(processed_stories)

In [16]:
df.head()

Unnamed: 0,story,summary
0,"Fort Hood , Texas The shooting at Fort Hood wa...",NEW : Pfc. Lance Aviles says he was ordered to...
1,WikiLeaks founder Julian Assange is trying to ...,Julian Assange is seeking to avoid extradition...
2,The news that Scotland has rejected independen...,Hammond : UK allies will breathe sigh of relie...
3,BP plans to continue using a controversial sub...,"EPA says it "" will continue to work over the n..."
4,Barcelona may be licking their wounds after a ...,Barcelona 's Eric Abidal is given the all clea...


In [17]:
df.describe()

Unnamed: 0,story,summary
count,92465,92465
unique,89255,88318
top,"Leszek Balcerowicz , Poland 's former finance ...",This page includes the show Transcript
freq,4,148


In [23]:
df.drop_duplicates(subset=["story", "summary"], inplace=True)

In [24]:
df.dropna(inplace=True)

In [25]:
df.describe()

Unnamed: 0,story,summary
count,89287,89287
unique,89255,88318
top,Congressman Jared Polis D Colorado : District ...,This page includes the show Transcript
freq,2,148


In [26]:
df.iloc(0)[0][0]

'Fort Hood , Texas The shooting at Fort Hood was captured on video by a soldier using his cell phone camera as he hid from the shooter , but he was ordered to erase it , the soldier said Friday . Pfc. Lance Aviles spoke of the video as he testified on the third day of the Article 32 military hearing for Maj. Nidal Hasan , who is accused of killing 13 people and wounding 32 in the November 2009 shooting . Aviles said he was told by a non-commissioned officer , who Aviles said was acting on the orders'

In [28]:
df = df.reset_index()

We save the stories to feather format so it is quick and easy to load later on.

In [29]:
df.to_feather(f'{PATH}/stories.feather')

In [30]:
df = pd.read_feather(f'{PATH}/stories.feather')

# Splitting the Dataset

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
train, test = train_test_split(df, test_size=0.2)
test, valid = train_test_split(test, test_size=0.2)

In [33]:
train = train.reset_index()
train.to_feather(f'{PATH}/train.feather')

In [34]:
test = test.reset_index()
test.to_feather(f'{PATH}/test.feather')

In [35]:
valid = valid.reset_index()
valid.to_feather(f'{PATH}/valid.feather')

In [41]:
train.to_csv(f'{PATH}/train.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
test.to_csv(f'{PATH}/test.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
valid.to_csv(f'{PATH}/valid.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)

In [42]:
len(train), len(test), len(valid)

(71429, 14286, 3572)

We produce a small dataset for quicker iteration.

In [43]:
SMALL_TRAIN_SIZE = 3000
SMALL_TEST_SIZE= 1000
SMALL_VALID_SIZE= 1000

In [44]:
train[:SMALL_TRAIN_SIZE].to_csv(f'{PATH}/train_small.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
test[:SMALL_TEST_SIZE].to_csv(f'{PATH}/test_small.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
valid[:SMALL_VALID_SIZE].to_csv(f'{PATH}/valid_small.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)