In [None]:
import pandas as pd
import numpy as np
from time import time
from collections import Counter
import random

# Load the raw Newsroom training export and show which columns it provides.
raw_csv_path = 'newsroom_training_data_original.csv'
df = pd.read_csv(raw_csv_path)
df.columns

In [None]:
# Number of rows before any filtering is applied.
df.shape[0]

# remove null article/summary

6 entries have no summaries because the original json was "null", read into pandas as nan. Those are removed. 1 entry has no article.

In [None]:
# Report how many rows have no summary, then keep only the rows that do.
summary_missing = df['summary'].isna()
print(len(df[summary_missing]))
df = df[~summary_missing]

In [None]:
# Report how many rows have no article text, then keep only the rows that do.
article_missing = df['article'].isna()
print(len(df[article_missing]))
df = df[~article_missing]

# remove foreign language

In [None]:
# Print one randomly chosen record whose language tag is not 'en'
# (language, summary and article, separated by a dashed rule).
foreign = df[df['language'].ne('en')]
row = random.randint(0, len(foreign)-1)
sample = foreign.iloc[row]
separator = '------------------'
print(sample['language'])
print(separator)
print(sample['summary'])
print(separator)
print(sample['article'])

2200 Non-English articles removed

In [None]:
# Count the rows not tagged 'en', then keep only the rows tagged 'en'.
is_english = df['language'].eq('en')
print(len(df[~is_english]))

df = df[is_english]

## clean up ascii character error

When splitting summaries on spaces, we observed some extreme outliers in summary length. On the lower end, 2411 summaries had a length of 1. Manual inspection showed that some of these were proper summaries whose spaces and punctuation had been left percent-encoded (e.g. `%20` in place of a space), so they contained no literal spaces. 190 examples were affected, the vast majority of them from AOL. We addressed this by replacing the encoded sequences with the characters they stand for.

In [None]:
def ascii_replacer(col):
    """Replace a fixed set of percent-escape sequences in ``col`` with plain text.

    A string that does not contain '%20' is returned untouched. Otherwise each
    escape sequence in the table below is substituted, in the order listed,
    and the resulting string is returned.
    """
    replacements = (
        ('%20', ' '), ('%27', "'"), ('%2C', ','), ('%0A', ' '),
        ('%24', '$'), ('%3A', ':'), ('%25', 'percent'),
        ('%28', '('), ('%29', ')'), ('%3F', '?'), ('%26', '&'), ('%3B', ';'),
        ('%u2019', "'"), ('%7C', '|'), ('%22', '"'), ('%21', '!'),
    )
    # '%20' acts as the marker for an affected string; without it nothing is replaced.
    if '%20' in col:
        for escaped, plain in replacements:
            col = col.replace(escaped, plain)
    return col

# Clean the escape sequences in every summary, then record how many pieces each
# summary yields when split on single spaces.
df['summary'] = df['summary'].map(ascii_replacer)
df['summary_length'] = df['summary'].map(lambda text: len(text.split(' ')))

# remove hyperlink

2159 summaries contain a hyperlink. Manual inspection showed that the majority of these summaries were long and that the URL does not contribute information to the summary (it is often a promotional link). These summaries do not match the type of summary we aim to obtain, so they are excluded.

In [None]:
# Select every summary mentioning 'http', 'html' or 'www' and print one at random.
link_mask = df['summary'].str.contains('http|html|www')
hyperlinked = df[link_mask]
row = random.randint(0, len(hyperlinked)-1)
print(hyperlinked.iloc[row]['summary'])

In [None]:
# A summary counts as hyperlinked when it mentions 'http', 'html' or 'www'.
has_link = df['summary'].str.contains('http|html|www')
hyperlinked = df[has_link]

# Keep only the summaries with none of the three markers.
df = df[~has_link]

After the previous removals, there are 2212 summaries with a length of 1. Manual inspection shows that they are largely nonsensical words that provide no information about the articles. These were removed. We repeated this for summaries of length 2-5 as well, as they do not provide a reasonable summarization. A total of 22553 summaries were removed.

# remove summary of 5 words or shorter

In [None]:
# Word count per summary, splitting on any run of whitespace.
df['summary_length'] = df['summary'].astype(str).str.split().str.len()

In [None]:
# Lower-tail percentiles of the summary word count.
low_percentiles = [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 3]
np.percentile(df['summary_length'], low_percentiles)

In [None]:
# Minimum number of words a summary must have to be kept.
num_words = 6

short = df[df['summary_length'] < num_words]

# Print one random short summary for manual inspection; skipped when there are
# none, because random.randint(0, -1) would raise.
if len(short) > 0:
    row = random.randint(0, len(short)-1)
    print(short.iloc[row]['summary'])

print(f"summaries shorter than {num_words} words: {len(short)}\n")

# Apply the removal described in this section. This line was previously
# commented out, so the short summaries were reported but never dropped.
df = df[df['summary_length'] >= num_words]

# remove duplicate summaries

While manually examining the short summaries, we noticed duplicates such as "Follow 2008 Elections & Campaigns at washingtonpost.com." that do not actually summarize the article. This is the result of poor data sanitization during scraping by the provider of the dataset. Therefore, we check for summaries with identical text and remove them. This removes 16179 records.

In [None]:
print('before deleting duplicate summaries,', len(df))

# Drop every row whose summary text occurs more than once. keep=False marks
# all members of a duplicated group, so no copy of a repeated summary survives.
# This replaces a value_counts().reset_index() lookup that relied on the
# resulting columns being named 'index' and 'summary'; those names differ
# between pandas versions, which made the filter fail or select wrong rows.
is_duplicate_summary = df['summary'].duplicated(keep=False)
df = df[~is_duplicate_summary]

print('after deleting duplicate summaries,', len(df))

# remove duplicate articles

In [None]:
# Drop every row whose article text occurs more than once. keep=False marks
# all members of a duplicated group, so no copy of a repeated article survives.
# This replaces a value_counts().reset_index() lookup that relied on the
# resulting columns being named 'index' and 'article'; those names differ
# between pandas versions.
is_duplicate_article = df['article'].duplicated(keep=False)
df = df[~is_duplicate_article]

print('after deleting duplicate articles,', len(df))

On the higher end, the distribution of summary length has a long tail, with 1% having more than 183 words, and less than 0.2% having more than 260 words. A closer look at the 99.5th percentile shows that almost all of those summaries were copied verbatim from the beginning of the article. In contrast, summaries with a length between 134 and 183 words (99th to 99.5th percentile) are mostly genuine summarizations. Therefore, we removed any summaries longer than 183 words.

# remove summaries 135 words or longer

In [None]:
import seaborn as sns

# Histogram of summary word counts, restricted to summaries under 200 words.
under_200_words = df.loc[df['summary_length'] < 200]
sns.histplot(data=under_200_words, x="summary_length")

In [None]:
# Upper-tail percentiles of the summary word count, printed to inform the cutoff.
upper_percentiles = [98, 99, 99.5, 99.8, 99.9, 99.95, 99.99]
print(np.percentile(df['summary_length'], upper_percentiles))

# Keep only summaries shorter than 137 words.
# NOTE(review): the section heading says 135 and the narrative says 183, while
# the cutoff applied here is 137 -- confirm which value is intended.
df = df.loc[df['summary_length'] < 137]

The histogram shows an interesting peak at length = 100. It is possible that this dataset was scraped from digests that limit the display to the first 100 words. This is confirmed by looking at the summaries with exactly 100 words: many end with the characters "...", indicating truncation. Summaries of this type simply display the first 100 words verbatim, which is not our goal for summarization. Therefore, any summaries that end with "..." were excluded. This excludes 49057 records.

# remove summaries with [...] at the end

In [None]:
# Flag summaries that end with a literal three-dot ellipsis. Every dot is
# escaped in a raw string: the earlier pattern escaped only the first dot, so
# the other two matched any character and summaries ending in e.g. "U.S." or
# "p.m." were dropped as well.
ends_with_ellipsis = df['summary'].str.contains(r'\.\.\.$')
print(len(df[ends_with_ellipsis]))

df = df[~ends_with_ellipsis]

# remove articles shorter than 100 words or longer than 1000 words

We need a minimum article length so that there is enough information to summarize. Therefore, articles with fewer than 100 words were removed. This removed 47299 records. On the upper end, due to limitations of our model architecture, we don't want articles to be overly long, so articles with more than 1000 words were removed as well.

In [None]:
# Word count per article, splitting on any run of whitespace.
df['article_length'] = df['article'].map(lambda text: len(text.split()))

# Report how many articles fall below 100 words and how many exceed 1000.
too_short = df['article_length'] < 100
too_long = df['article_length'] > 1000
print(len(df[too_short]))
print(len(df[too_long]))

# Keep articles whose length lies in the closed range [100, 1000].
df = df[df['article_length'].between(100, 1000)]

print(len(df))

To further weed out low-quality summaries that simply copy the first few sentences of the article, we trimmed each article down to 150 words and computed the ROUGE-2 score of the trimmed summaries against the trimmed articles. We then chose an arbitrary ROUGE-2 threshold based on manual inspection of the scores. An overly high score implies that the summary is simply copying from the first sentences of the article. Those summaries are removed.

# remove any summaries with rouge2 score > 0.15

In [None]:
# ROUGE-2 cutoff used by the filter in this section.
threshold = 0.15

rouge2 = df['reference_rouge2']
# Rows scoring strictly above the cutoff.
above_threshold = df[rouge2 > threshold]
# Rows in the open band just below the cutoff, kept aside for manual inspection.
just_below = (rouge2 > threshold - 0.05) & (rouge2 < threshold)
region_of_interest = df[just_below]
print(f'number of records to be removed: {len(above_threshold)}')

In [None]:
import random
# Print the summary and article of one random record from the band just below the cutoff.
row = random.randint(0, len(region_of_interest)-1)
picked = region_of_interest.iloc[row]
print(picked['summary'])
print('----------------------------')
print(picked['article'])

In [None]:
# Remove only the rows scoring strictly above the threshold, which is what the
# section heading and the printed removal count describe. The previous strict
# '<' also dropped rows scoring exactly at the threshold. Rows with a missing
# score still compare False and are dropped, as before.
df = df[df['reference_rouge2'] <= threshold]

# remove articles and summaries with "{" in it, those indicate bad article parse

In [None]:
print('before dropping badly parsed articles', len(df))
# The section heading covers both articles and summaries, so both columns are
# checked (previously only the article was). regex=False searches for the
# literal '{' rather than handing a lone brace to the regex engine.
has_brace = (
    df['article'].str.contains('{', regex=False)
    | df['summary'].str.contains('{', regex=False)
)
df = df[~has_brace]
print('after dropping badly parsed articles', len(df))

# remove articles with more than 3 of the same 5-gram

In [None]:
from time import time

# Start timestamp; the elapsed seconds are printed once the 5-gram scan in this cell finishes.
t = time()

def count_ngrams(text, n):
    """Count every run of ``n`` consecutive space-separated tokens in ``text``.

    The text is split on single spaces, so consecutive spaces yield empty
    tokens. Each window of ``n`` tokens is re-joined with single spaces and
    used as a key. Returns a ``Counter`` mapping each n-gram string to its
    number of occurrences; it is empty when the text has fewer than ``n`` tokens.
    """
    tokens = text.split(' ')
    window_count = len(tokens) - n + 1
    return Counter(
        ' '.join(tokens[start:start + n]) for start in range(window_count)
    )

# For each article keep its 5-gram counter when some non-blank 5-gram occurs
# more than 3 times; otherwise store None.
high_repeat = []
for article_text in df['article'].tolist():
    five_grams = count_ngrams(article_text, 5)
    has_repeat = any(
        count > 3 and len(gram.strip()) > 0
        for gram, count in five_grams.items()
    )
    high_repeat.append(five_grams if has_repeat else None)

print(time()-t)

# A non-null entry in 'special' marks a flagged article.
df['special'] = high_repeat

print(len(df[df['special'].notna()]))

In [None]:
# Drop the articles flagged in 'special' (non-null means flagged).
print('before dropping similar articles', len(df))
df = df[df['special'].isna()]
print('after dropping similar articles', len(df))

# Same check on summaries with 3-grams: keep the counter when some non-blank
# 3-gram occurs more than 3 times, otherwise store None.
high_repeat = []
for summary_text in df['summary'].tolist():
    trigrams = count_ngrams(summary_text, 3)
    has_repeat = any(
        count > 3 and len(gram.strip()) > 0
        for gram, count in trigrams.items()
    )
    high_repeat.append(trigrams if has_repeat else None)

df['special'] = high_repeat

flagged_summary = df['special'].notna()
print('number of similar summaries', len(df[flagged_summary]))

df = df[~flagged_summary]
print('after dropping similar summary', len(df))

## Other nonsense records found through manual spot checking

In [None]:
# Records in the 'topics' category (USA Today topic pages) carry no meaningful
# article or summary, so every row with that category is dropped.
print('before dropping usatoday topics', len(df))
not_topics = df['category'].ne('topics')
df = df.loc[not_topics]
print('after dropping usatoday topics', len(df))

In [None]:
# Marker strings found during manual checking that the filters above do not
# catch. Any record whose article contains one of them is dropped.
article_markers = ('rent in SF for $4000+?\n\nImage', '5, 101, 4, 0) +')
for marker in article_markers:
    # 'special' is reused as a scratch flag column: True when the marker is present.
    df['special'] = df['article'].map(lambda text: marker in text)
    df = df[~df['special']]

print(len(df))

In [None]:
# Blacklisted phrases found during manual checking that the filters above do
# not catch. Summaries containing any of them are not meaningful.
blacklist = ['ARF ARF ARF', 'ceiling is lifted', 'Oscar predictions', 'FoxNews.com', 'Schuldercnn', 'usatoday']

for phrase in blacklist:
    # regex=False matches the phrase literally; as a regex the '.' in
    # 'FoxNews.com' would match any character. The mask is computed once and
    # used for both the count and the filter.
    has_phrase = df['summary'].str.contains(phrase, regex=False)
    print(phrase, len(df[has_phrase]))
    df = df[~has_phrase]

print('after dropping badly parsed summaries', len(df))

## Save

In [None]:
# Keep only the columns written to disk; helper columns added during cleaning
# (e.g. 'summary_length', 'article_length', 'special') are dropped here.
df = df[['original_row_idx', 'url', 'density_bin', 'article', 'summary', 'source', 'category']]

print(len(df))

# Full cleaned dataset, in its current (unshuffled) order.
df.to_csv('newsroom.csv', index = False)

# Shuffle with a fixed seed so the train/test split is the same on every run;
# without random_state the split changed on each execution.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)

# First 500000 shuffled rows form the training set; the remainder is the test
# set (empty if fewer than 500000 rows survive the filters).
train = df.iloc[:500000]
test = df.iloc[500000:]

train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)