In [26]:
from os import listdir
import string
 
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as a string.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, encoding='utf-8') as handle:
        return handle.read()
 
# split a document into news story and highlights
def split_story(doc):
    """Split a raw document into its story text and its list of highlights.

    Everything before the first '@highlight' marker is the story; each
    segment following a marker is one highlight.

    Args:
        doc: Full document text.

    Returns:
        A ``(story, highlights)`` tuple. ``story`` is the text preceding the
        first marker, returned unstripped. ``highlights`` is a list of the
        marker-delimited segments with surrounding whitespace removed and
        empty segments dropped. When the document contains no marker, the
        whole document is the story and ``highlights`` is empty.
    """
    marker = '@highlight'
    # Splitting the whole document handles the "marker absent" case naturally:
    # the first piece is the complete text and there are no further pieces.
    # (Slicing at find()'s -1 would cut off the final character of the story
    # and report it as a highlight.)
    story, *segments = doc.split(marker)
    # Strip first, then filter, so whitespace-only segments do not survive
    # as empty-string highlights.
    stripped = [segment.strip() for segment in segments]
    highlights = [text for text in stripped if text]
    return story, highlights
 
# load all stories in a directory
def load_stories(directory):
    """Load and split every document found in a directory.

    Args:
        directory: Path of the directory whose entries are read.

    Returns:
        A list with one dict per directory entry, each holding the
        document's 'story' text and its list of 'highlights'. Entries
        appear in the order produced by ``listdir``.
    """
    def _parse(entry):
        # Read one entry and separate its body from its highlights.
        body, points = split_story(load_doc(directory + '/' + entry))
        return {'story': body, 'highlights': points}

    return [_parse(entry) for entry in listdir(directory)]
 
# clean a list of lines
def clean_lines(lines):
    """Normalise a list of text lines into lowercase alphabetic words.

    For each line: if it contains '(CNN) -- ', everything up to and
    including '(CNN)' is discarded; the remainder is split on whitespace,
    lowercased, stripped of ASCII punctuation, and reduced to purely
    alphabetic tokens, which are re-joined with single spaces.

    Args:
        lines: Iterable of strings.

    Returns:
        A list of cleaned lines; lines that end up empty are omitted.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    result = []
    for raw in lines:
        pos = raw.find('(CNN) -- ')
        if pos != -1:
            # Only '(CNN)' is skipped here; the leftover dashes are removed
            # by the punctuation and alphabetic filters below.
            raw = raw[pos + len('(CNN)'):]
        tokens = (tok.lower().translate(strip_punct) for tok in raw.split())
        text = ' '.join(tok for tok in tokens if tok.isalpha())
        if text:
            result.append(text)
    return result

In [56]:
# Directory holding the raw CNN story files (path is relative to the
# working directory).
cnn_directory = '../../../Documents/cnn/stories/'
# Parse every file in that directory into a story/highlights record.
cnn_stories = load_stories(cnn_directory)
# Report how many records were loaded.
print('Loaded Stories %d' % (len(cnn_stories),))

Loaded Stories 92579


In [14]:
import pandas as pd
# Display settings: show full cell contents and up to 999 rows/columns.
# `None` is the supported way to disable column-width truncation; the old
# sentinel -1 was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.width', 999)
# One row per loaded CNN story record.
cnn_df = pd.DataFrame.from_dict(cnn_stories)

In [19]:
# Rename by key rather than by position: assigning `cnn_df.columns` positionally
# labels the story text as 'summary' and the highlights as 'article' whenever
# the frame's columns are ordered (story, highlights), which is the dict
# insertion order used by load_stories.
cnn_df = cnn_df.rename(columns={'highlights': 'summary', 'story': 'article'})
# Pin the column order so the CSV layout is always (summary, article).
cnn_df = cnn_df[['summary', 'article']]
cnn_df.to_csv('../data/CleanedCNN.csv')

In [28]:
# Directory holding the raw Daily Mail story files (path is relative to the
# working directory).
dailymail_directory = '../../../Documents/dailymail/stories/'
# Parse every file in that directory into a story/highlights record.
dailymail_stories = load_stories(dailymail_directory)
# Report how many records were loaded.
print('Loaded Stories %d' % (len(dailymail_stories),))

Loaded Stories 219506


In [31]:
# One row per loaded Daily Mail story record.
dailymail_df = pd.DataFrame.from_dict(dailymail_stories)
# Rename by key rather than by position: a positional `columns = [...]`
# assignment labels the story text as 'summary' and the highlights as
# 'article' whenever the frame's columns are ordered (story, highlights),
# which is the dict insertion order used by load_stories.
dailymail_df = dailymail_df.rename(columns={'highlights': 'summary', 'story': 'article'})
# Pin the column order so downstream output is always (summary, article).
dailymail_df = dailymail_df[['summary', 'article']]

In [48]:
# Remove embedded newlines, then tabs, from the frame before exporting.
dailymail_df = (
    dailymail_df
    .replace({'\n': ''}, regex=True)
    .replace({'\t': ''}, regex=True)
)
dailymail_df.to_csv('../data/CleanedDailyMail.csv')