In [None]:
import json, gzip
import pandas as pd
import numpy as np
from time import time
import re
from numpy import nan as npnan
import langid

# Load the train and dev splits into a single list of record dictionaries.
#
# test.jsonl.gz is deliberately not loaded because its summaries are randomly
# copied phrases from throughout the article; it was probably created as a
# challenge for leaderboards. Add "test.jsonl.gz" to the tuple below to load it too.
data = []

for split_path in ("train.jsonl.gz", "dev.jsonl.gz"):
    with gzip.open(split_path) as f:
        # every line of the gzipped file is parsed as one JSON record
        data.extend(json.loads(ln) for ln in f)

len(data)

In [None]:
# Convert the list of loaded record dictionaries into a pandas DataFrame.
# The mapping lists every output column (in its final order) together with the
# record key it is read from; only 'article' differs from its key ('text').
column_to_key = {
    'url': 'url',
    'date': 'date',
    'title': 'title',
    'compression': 'compression',
    'compression_bin': 'compression_bin',
    'coverage': 'coverage',
    'coverage_bin': 'coverage_bin',
    'density': 'density',
    'density_bin': 'density_bin',
    'article': 'text',
    'summary': 'summary',
}

df = pd.DataFrame({column: [record[key] for record in data]
                   for column, key in column_to_key.items()})

# Keep each record's load-order position as an explicit column
df = df.reset_index().rename(columns={'index': 'original_row_idx'})

In [None]:
# Calculate a ROUGE-2 recall score between each summary and the first 150 words of its article.
# This will be used to filter out records where the reference summary copies the
# beginning of the article verbatim.

import rouge

# Keep only the first 150 whitespace-separated words of each text
summary = [' '.join(str(text).split()[:150]) for text in df['summary']]
article = [' '.join(str(text).split()[:150]) for text in df['article']]

# NOTE(review): apply_avg / apply_best receive the strings 'Avg' and 'Best' (both
# truthy) -- confirm this is what the rouge package expects for these options.
rouge_settings = dict(
    metrics=['rouge-n'],
    max_n=2,
    limit_length=True,
    length_limit=150,
    length_limit_type='words',
    apply_avg='Avg',
    apply_best='Best',
    alpha=0.5,  # Default F1_score
    weight_factor=1.2,
    stemming=True,
)
evaluator = rouge.Rouge(**rouge_settings)

reference_rouge2 = []

# NOTE(review): the summary is passed as the first argument and the article as the
# second -- confirm the 'r' (recall) value is measured in the intended direction.
for i, (summary_head, article_head) in enumerate(zip(summary, article)):
    if i % 10000 == 0:
        print(i)  # progress indicator every 10,000 records
    pair_scores = evaluator.get_scores(summary_head, article_head)
    reference_rouge2.append(pair_scores['rouge-2']['r'])

df['reference_rouge2'] = reference_rouge2

In [None]:
# Parse the source of the news out of the URL.
# A helper function processes the split URL because the root domain may not be
# in the same position after splitting, due to subdomains.
def split_full_url(url):
    """Split a URL into its host plus seven letters-only path segments.

    The URL is split on '/', and the element at index 2 (the host for URLs of
    the form 'scheme://host/...') is kept verbatim. The next seven elements are
    stripped of every character that is not an ASCII letter; a segment that
    ends up empty, or equal to 'category', is replaced with NaN.

    Parameters
    ----------
    url : str
        The URL to split.

    Returns
    -------
    pandas.Series
        Eight values: the host followed by seven cleaned segments (str or NaN).
    """
    parts = url.split('/')
    # Pad so that index 2 and the 3:10 slice always exist, even for short URLs;
    # the '0' padding is stripped to '' below and therefore becomes NaN.
    parts += ['0'] * 9
    # '[^A-Za-z]' rather than '[^A-z]': the ASCII range A-z also spans the
    # characters [ \ ] ^ _ and `, which would otherwise be kept.
    letters_only = [re.sub(r'[^A-Za-z]', '', segment) for segment in parts[3:10]]
    cleaned = [segment if segment not in ('', 'category') else npnan
               for segment in letters_only]
    return pd.Series([parts[2]] + cleaned)

t = time()

# Expand every URL into its host (u1) and seven cleaned path segments (u2..u8)
url_part_cols = ['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8']
df[url_part_cols] = df['url'].apply(split_full_url)

print(time() - t)


def _source_from_host(host):
    """Reduce a host string such as 'www.nytimes.com' to a bare source name ('nytimes').

    The listed suffix/port tokens are removed wherever they occur, in this order,
    and the last '.'-separated piece of what remains is returned.
    """
    # NOTE(review): tokens are removed anywhere in the host, not only at the end,
    # so e.g. '.go' / '.ca' also match inside longer labels -- confirm acceptable.
    for token in ('.co.uk', '.com', '.go', '.au', '.ca', ':9898'):
        host = host.replace(token, '')
    # A host of 'http:' is mapped to 'foxnews'
    # (presumably malformed Fox News URLs -- TODO confirm against the data).
    host = host.replace('http:', '.foxnews')
    return host.split('.')[-1]


df['source'] = df['u1'].apply(_source_from_host)

# These news sources have URLs that may include the news category;
# attempt to parse categories for these sources only.
parseable = ['nytimes', 'theguardian', 'foxnews', 'usatoday', 'nydailynews', 'time', 'cnn', 'telegraph',
             '9news', 'bostonglobe', 'latimes', 'bbc', 'sfgate', 'abcnews', 'nbcnews', 'nypost']

# First non-null path segment per row (back-fill across u2..u8, take the first column)
first_segment = df[['u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8']].bfill(axis=1).iloc[:, 0]

# Vectorized replacement for the former row-wise apply, which indexed each row
# Series by integer position (x[0], x[1]) despite its string labels.
# Rows whose source is not parseable get 'unparsable'; every value (including a
# missing one, which becomes 'nan') is stringified and lower-cased as before.
df['category'] = (
    first_segment
    .where(df['source'].isin(parseable), 'unparsable')
    .map(lambda value: str(value).lower())
)

# The temporary URL-part columns are no longer needed
df.drop(url_part_cols, axis=1, inplace=True)

In [None]:
# Guess each article's language from its first 150 words, in case it is not English
import langid
from time import time

# Keep only the first 150 whitespace-separated words of every article
article = [' '.join(str(text).split()[:150]) for text in df['article']]

language = []

started_at = time()
for position, snippet in enumerate(article):
    # classify() returns a sequence whose first element is kept as the language
    language.append(langid.classify(snippet)[0])
    if position % 10000 == 0:
        # progress indicator: record index and seconds elapsed so far
        print(position, time() - started_at)

df['language'] = language

In [None]:
# Write the frame to CSV, omitting the pandas index
output_csv = 'newsroom_training_data_original.csv'
df.to_csv(output_csv, index=False)

# Preview the first two rows
df.head(2)