In [8]:
PATH = "/diskA/jethro/cnn/"
RAW = f'{PATH}/raw'

In [14]:
import pandas as pd
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm_notebook

In [15]:
stories = os.listdir(RAW)
len(stories)

92579

# Processing Stories

We keep the first two sentences of the article, and use the first highlight to be the summary of the article.

In [16]:
NUM_SUMMARIES_TO_SAVE = 1
NUM_SENTENCES = 2

For processing, we do a few things:

1. Remove non-ascii characters.
2. Remove the CNN story prepends, because these do not contribute to the summaries at all.
3. Replace different quotation marks with normal quotation marks

In [17]:
def is_ascii(token):
    return all(ord(c) < 138 for c in token)

In [18]:
def get_sentences(text, count):
    """Returns `count` sentences from `text`."""
    tokens = sent_tokenize(text)
    return " ".join(tokens[:count])
    
get_sentences("First sentence. Second? Third.", 2)

'First sentence. Second?'

In [23]:
def process_story(story):
    tt = open(f'{RAW}/{story}').readlines()
    tt = [t.replace("-LRB- CNN Student News -RRB- --", "")
          .replace("-LRB- CNN -RRB- --", "")
          .replace("-LRB-", "")
          .replace("-RRB-", "")
          .replace("''", '"')
          .replace("``", '"')
          .replace("NEW :", "")
          .strip() for t in tt if t != "\n"]
    paragraph = []
    summary = []
    flag = True
    for i, t in enumerate(tt):
        if t == "@highlight":
            flag = False
            summary.append(tt[i+1])
        elif flag:
            paragraph.append(t)
    paragraph = " ".join(paragraph)
    summary = " ".join(summary[:NUM_SUMMARIES_TO_SAVE])
    paragraph = get_sentences(paragraph, 2)
    paragraph_tokens = word_tokenize(paragraph)
    summary_tokens = word_tokenize(summary)
    paragraph_tokens = [token for token in paragraph_tokens if is_ascii(token) and not token.isspace()]
    summary_tokens = [token for token in summary_tokens if is_ascii(token) and not token.isspace()]
    if paragraph_tokens and summary_tokens:
        paragraph = " ".join(paragraph_tokens)
        summary = " ".join(summary_tokens)
        return {
            "story": paragraph,
            "summary": summary
        }
    return None

In [26]:
processed_stories = []
for story in tqdm_notebook(stories):
    story = process_story(story)
    if story is not None:
        processed_stories.append(story)

We then dump everything into a Pandas Dataframe for data exploration.

In [27]:
df = pd.DataFrame(processed_stories)

In [28]:
df.head()

Unnamed: 0,story,summary
0,"Fort Hood , Texas The shooting at Fort Hood wa...",Pfc . Lance Aviles says he was ordered to eras...
1,WikiLeaks founder Julian Assange is trying to ...,Julian Assange is seeking to avoid extradition...
2,The news that Scotland has rejected independen...,Hammond : UK allies will breathe sigh of relie...
3,BP plans to continue using a controversial sub...,EPA says it `` will continue to work over the ...
4,Barcelona may be licking their wounds after a ...,Barcelona 's Eric Abidal is given the all clea...


In [29]:
df.describe()

Unnamed: 0,story,summary
count,92459,92459
unique,89076,88272
top,Editor 's note : A nationally syndicated colum...,This page includes the show Transcript
freq,14,148


In [30]:
df.drop_duplicates(inplace=True)

In [31]:
df.dropna(inplace=True)

In [32]:
df.describe()

Unnamed: 0,story,summary
count,89248,89248
unique,89076,88272
top,Editor 's note : The staff at CNN.com has rece...,This page includes the show Transcript
freq,14,148


In [35]:
df.iloc(0)[0][0]

'Fort Hood , Texas The shooting at Fort Hood was captured on video by a soldier using his cell phone camera as he hid from the shooter , but he was ordered to erase it , the soldier said Friday . Pfc .'

In [36]:
df = df.reset_index()

We save the stories to feather format so it is quick and easy to load later on.

In [37]:
df.to_feather(f'{PATH}/stories.feather')

In [38]:
df = pd.read_feather(f'{PATH}/stories.feather')

# Splitting the Dataset

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
train, test = train_test_split(df, test_size=0.2)
test, valid = train_test_split(test, test_size=0.2)

In [41]:
train = train.reset_index()
train.to_feather(f'{PATH}/train.feather')

In [42]:
test = test.reset_index()
test.to_feather(f'{PATH}/test.feather')

In [43]:
valid = valid.reset_index()
valid.to_feather(f'{PATH}/valid.feather')

In [44]:
train.to_csv(f'{PATH}/train.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
test.to_csv(f'{PATH}/test.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
valid.to_csv(f'{PATH}/valid.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)

In [45]:
len(train), len(test), len(valid)

(71398, 14280, 3570)

We produce a small dataset for quicker iteration.

In [46]:
SMALL_TRAIN_SIZE = 3000
SMALL_TEST_SIZE= 1000
SMALL_VALID_SIZE= 1000

In [47]:
train[:SMALL_TRAIN_SIZE].to_csv(f'{PATH}/train_small.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
test[:SMALL_TEST_SIZE].to_csv(f'{PATH}/test_small.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)
valid[:SMALL_VALID_SIZE].to_csv(f'{PATH}/valid_small.tsv', sep="\t", columns=["story", "summary"], index=False, header=False)