The output of this notebook is training and testing data that has been cleaned and is ready to use. Long articles (more than 500 tokens) have been omitted and the remaining ones have been truncated to 250 tokens. Additionally, the gold-standard summaries have been limited to 75 tokens.

In [0]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')
# Switch to the project folder; the file names used in later cells are relative to it.
%cd "/content/drive/My Drive/630Project"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/630Project


In [0]:
import re
import pickle
import random

In [0]:
# Load the raw article records. The context manager closes the file handle as
# soon as unpickling is done instead of leaving it open for the garbage collector.
# NOTE: pickle.load can execute arbitrary code -- only load a file you trust.
with open('all_article_info.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

Creating dictionaries of the articles to work with

In [0]:
def _drop_cnn_prefix(text):
    """Return ``text`` without its first three characters when they spell 'cnn'."""
    return text[3:] if text[:3] == 'cnn' else text


# full_* dictionaries: article text (minus a leading 'cnn') and gold summary,
# keyed by the record's position in ``data``.
full_article_dict = {
    idx: _drop_cnn_prefix(record['article']) for idx, record in enumerate(data)
}
full_summary_dict = {
    idx: record['gold_summary'] for idx, record in enumerate(data)
}

# Working dictionaries start out with exactly the same keys and values; they are
# separate dict objects, so changing one pair does not affect the other.
article_dict = dict(full_article_dict)
summary_dict = dict(full_summary_dict)

Processing the articles/summaries by adding spaces around punctuation, replacing every digit with #, and adding start and end tokens

In [0]:
def preprocess_sentence(w):
    """Normalise one article or summary string and wrap it in boundary tokens.

    Steps, in order:
      1. Put a space on each side of every ``?``, ``.``, ``!``, ``,`` and ``¿``
         so each becomes its own whitespace-separated token.
      2. Replace every run of space and/or double-quote characters with a
         single space (double quotes are therefore removed).
      3. Replace every digit with ``#``.
      4. Strip leading/trailing whitespace.

    Args:
        w: The text to normalise.

    Returns:
        The normalised text prefixed with ``'<start> '`` and suffixed with
        ``' <end>'``.
    """
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    # The character class holds a space and a double quote, so quotes are
    # collapsed away together with repeated spaces.
    w = re.sub(r'[" "]+', " ", w)
    # Raw string: '\d' inside a plain literal is an invalid escape sequence,
    # which Python warns about and plans to reject.
    w = re.sub(r"\d", "#", w)

    w = w.strip()

    return '<start> ' + w + ' <end>'

In [0]:
# Normalise every article and every summary in place. Entries are addressed by
# the integers 0..len-1, which is how both dictionaries are keyed at this point.
for store in (article_dict, summary_dict):
  for idx in range(len(store)):
    store[idx] = preprocess_sentence(store[idx])

In [0]:
# Sanity check: print the number of entries in each dictionary, articles first.
for store in (article_dict, summary_dict):
  print(len(store))

311971
311971


Here, we are cleaning up the articles a little more by dropping those that start with 'by'. These articles have a header block that adds unneeded information to the model

In [0]:
# Collect the keys of all articles whose second whitespace-separated token
# (the one right after the leading '<start>' marker) is 'by'.
by_list = [
  key for key, article in article_dict.items()
  if article.split()[1] == 'by'
]

# Remove those articles and their summaries. Deleting by the entry's own key,
# rather than by a running position counter, stays correct even when the keys
# are no longer the consecutive integers 0..n-1 (e.g. when this cell is re-run).
for key in by_list:
  del article_dict[key]
  del summary_dict[key]

We are getting rid of long articles here (> 500 tokens)

In [0]:
# Snapshot the keys first so the dictionaries can be modified while looping.
keys = list(article_dict)

# An article is "long" when it has more than 500 whitespace-separated tokens.
too_long = [key for key in keys if len(article_dict[key].split()) > 500]

# Drop each long article together with its summary.
for key in too_long:
  article_dict.pop(key)
  summary_dict.pop(key)

In [0]:
# Report how many entries are left in each dictionary, articles first.
for store in (article_dict, summary_dict):
  print(len(store))

70305
70305


Following steps close to those of See et al., I am truncating articles and summaries, under the assumption that the most important information is at the beginning of the article, and to make training less computationally expensive

In [0]:
def _first_tokens(text, limit):
  """Return the first ``limit`` whitespace-separated tokens of ``text``, joined by single spaces."""
  return ' '.join(text.split()[:limit])


keys = list(article_dict)

# Keep at most 250 tokens per article and 75 per summary; everything past the
# limit (including whatever token ended the text) is discarded.
for key in keys:
  article_dict[key] = _first_tokens(article_dict[key], 250)
  summary_dict[key] = _first_tokens(summary_dict[key], 75)

Now we are making word-count dictionaries so that we can replace uncommon words with the unk token

In [0]:
def _token_counts(texts):
    """Map every whitespace-separated token found in ``texts`` to its total number of occurrences."""
    counts = {}
    for text in texts:
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
    return counts


# Materialise the current texts as lists, in dictionary order.
articles = list(article_dict.values())
summaries = list(summary_dict.values())

# Articles and summaries get separate frequency tables.
freq_dict_articles = _token_counts(articles)
freq_dict_summaries = _token_counts(summaries)

In [0]:
# Snapshot of the article keys, in dictionary order.
keys = [*article_dict]

Now we are replacing the rare words (in this case, words that appear 30 times or fewer in the articles, or 20 times or fewer in the summaries) with the unk token

In [0]:
def _mask_rare(text, freq, threshold):
    """Return ``text`` with each token whose count in ``freq`` is not above ``threshold`` replaced by '<unk>'."""
    return " ".join(
        token if freq[token] > threshold else '<unk>'
        for token in text.split()
    )


# Article tokens must occur more than 30 times to be kept; summary tokens more than 20.
new_articles = [_mask_rare(text, freq_dict_articles, 30) for text in articles]
new_summaries = [_mask_rare(text, freq_dict_summaries, 20) for text in summaries]

# Write the masked texts back by position.
# NOTE(review): assumes ``keys`` lists the entries in the same order as
# ``articles`` / ``summaries`` -- confirm both were taken from the same dict state.
for pos, key in enumerate(keys):
  article_dict[key] = new_articles[pos]
  summary_dict[key] = new_summaries[pos]


Here we get rid of article/summary pairs that contain a lot of unknown tokens (more than 3 in the article and more than 2 in the summary)

In [0]:
print('Length of article and summary dicts before filtering:\n')
print(len(article_dict))
print(len(summary_dict))
print()

# Iterate over a fresh snapshot of the current keys instead of a key list built
# in an earlier cell: a stale list raises KeyError as soon as any of its keys
# has already been deleted (for instance when this cell is run a second time).
for x in list(article_dict):
  article = article_dict[x]
  summary = summary_dict[x]
  # A pair is dropped only when BOTH texts exceed their '<unk>' limit.
  if article.count('<unk>') > 3 and summary.count('<unk>') > 2:
    del article_dict[x]
    del summary_dict[x]

print('Length of article and summary dicts after filtering ones with a lot of rare words:\n')
print(len(article_dict))
print(len(summary_dict))

Length of article and summary dicts before filtering:

70305
70305

Length of article and summary dicts after filtering ones with a lot of rare words:

34540
34540


In [0]:
# Fixed seed so the shuffle -- and therefore the train/test split -- is the same
# on every run. A dedicated Random instance leaves the global RNG state alone.
SPLIT_SEED = 630

keys = list(article_dict.keys())
print(keys[:10])
random.Random(SPLIT_SEED).shuffle(keys)
print(keys[:10])

# Collect the texts in shuffled-key order; index i of both lists is the same pair.
filtered_articles = [article_dict[x] for x in keys]
filtered_summaries = [summary_dict[x] for x in keys]

# First 90% of the shuffled pairs form the training set, the rest the test set.
train_pct_index = int(0.9 * len(keys))
X_train, X_test = filtered_articles[:train_pct_index], filtered_articles[train_pct_index:]
y_train, y_test = filtered_summaries[:train_pct_index], filtered_summaries[train_pct_index:]

[7, 10, 13, 14, 15, 22, 29, 35, 47, 48]
[4085, 215621, 288211, 79841, 123883, 21292, 37063, 56451, 14447, 224167]


In [0]:
# Write one training article per line. The encoding is pinned to UTF-8 so the
# file does not depend on the platform's default text encoding.
with open("x_train_filtered.txt", "w", encoding="utf-8") as filehandle:
    for listitem in X_train:
        filehandle.write('%s\n' % listitem)

In [0]:
# Write one test article per line. The encoding is pinned to UTF-8 so the
# file does not depend on the platform's default text encoding.
with open("x_test_filtered.txt", "w", encoding="utf-8") as filehandle:
    for listitem in X_test:
        filehandle.write('%s\n' % listitem)

In [0]:
# Write one training summary per line. The encoding is pinned to UTF-8 so the
# file does not depend on the platform's default text encoding.
with open("y_train_filtered.txt", "w", encoding="utf-8") as filehandle:
    for listitem in y_train:
        filehandle.write('%s\n' % listitem)

In [0]:
# Write one test summary per line. The encoding is pinned to UTF-8 so the
# file does not depend on the platform's default text encoding.
with open("y_test_filtered.txt", "w", encoding="utf-8") as filehandle:
    for listitem in y_test:
        filehandle.write('%s\n' % listitem)

In [0]:
# Look up the full-length (not preprocessed, not truncated) text for every
# surviving key, in the same shuffled order as ``keys``.
filtered_articles = [full_article_dict[x] for x in keys]
filtered_summaries = [full_summary_dict[x] for x in keys]

# NOTE(review): this split is 80/20 while the filtered split above is 90/10, so
# the *_full files do not line up one-to-one with the *_filtered files --
# confirm that the different ratio is intended.
train_pct_index = int(0.8 * len(keys))
X_train_full, X_test_full = filtered_articles[:train_pct_index], filtered_articles[train_pct_index:]
y_train_full, y_test_full = filtered_summaries[:train_pct_index], filtered_summaries[train_pct_index:]

# Write each list to its own file, one entry per line. The encoding is pinned to
# UTF-8 so the files do not depend on the platform's default text encoding.
for file_name, items in (
    ("X_train_full.txt", X_train_full),
    ("X_test_full.txt", X_test_full),
    ("y_train_full.txt", y_train_full),
    ("y_test_full.txt", y_test_full),
):
    with open(file_name, "w", encoding="utf-8") as filehandle:
        for listitem in items:
            filehandle.write('%s\n' % listitem)