In [1]:
import sys
import os
from time import time

# Directory holding the scraped NYT dump files. Defaults to the original
# author's location; set the NYTIMES_DATAPATH environment variable to run the
# notebook against a copy stored elsewhere.
# os.path.join(..., '') guarantees a trailing separator, which code that
# concatenates DATAPATH directly with a file name depends on.
DATAPATH = os.path.join(
    os.environ.get('NYTIMES_DATAPATH', '/Users/balthasar/Google Drive/MTL_data/nytimes/'),
    '',
)
sys.path.append(DATAPATH)
# Parent directory goes on the import path; the `from src.util import ...`
# below presumably resolves through it.
sys.path.append('../')

from src.util import count_lines_many, data_from_many

# os.listdir returns names in arbitrary order and also lists sub-directories
# and hidden entries (e.g. .DS_Store). Keep only regular, non-hidden files and
# sort them so every run processes the same files in the same order.
datafiles = sorted(
    os.path.join(DATAPATH, name)
    for name in os.listdir(DATAPATH)
    if not name.startswith('.') and os.path.isfile(os.path.join(DATAPATH, name))
)
print("Number of entries: %i" % (count_lines_many(datafiles)))

Number of entries: 145495


In [9]:
# Label & Text Extraction

from bs4 import BeautifulSoup
import json
import re

# NYT Specific
def get_paragraphs(html):
    """Extract the title and story paragraphs of an article page as plain text.

    Collects the text of every <title> tag followed by every <p> tag whose
    class attribute is 'story-body-text story-content'.

    Returns:
        A list of strings (title text first, then paragraph text), or the
        sentinel string 'NOCONTENT' when neither kind of tag is found. Callers
        must check for the sentinel before treating the result as a list.
    """
    soup = BeautifulSoup(html,'html.parser')
    # find_all is the documented bs4 name; findAll is only a legacy alias.
    tags = soup.find_all('title') + soup.find_all('p', class_='story-body-text story-content')
    paragraphs = [tag.get_text() for tag in tags]
    if paragraphs:
        return paragraphs
    return 'NOCONTENT'

# Matches everything that follows the dated prefix
# https://www.nytimes.com/YYYY/MM/DD/ of an article URL. The dots of the host
# name are escaped so that only the literal host matches (an unescaped '.'
# would accept any character in that position).
label_re = re.compile(r'(?<=https://www\.nytimes\.com/[12][0-9][0-9][0-9]/[0-1][0-9]/[0-3][0-9]/).*')
def get_label(url):
    """Derive the section label of an article from its URL.

    The label is the path between the date and the final path segment, e.g.
    '.../2017/03/05/world/europe/slug.html' gives 'world/europe'.

    Returns:
        The label string, or "NOLABEL" when the URL does not contain the dated
        nytimes.com prefix. Note that a URL with no section segment between
        the date and the final segment yields an empty string, not "NOLABEL".
    """
    match = label_re.search(url)
    if match is None:
        return "NOLABEL"
    # Drop the last path segment (the article slug) and keep the sections.
    return '/'.join(match.group().split('/')[:-1])

# General   
def unpack(item):
    """Parse a JSON object string and return its values as a tuple.

    The values appear in the order in which the keys occur in the JSON text.
    """
    record = json.loads(item)
    return (*record.values(),)

def words(x):
    """Return the number of single-space-separated tokens in the string x.

    The split is on the literal ' ' character, so consecutive spaces produce
    empty tokens that are counted, and an empty string counts as 1.
    """
    return len(x.split(' '))

def bunch_paragraphs(paragraphs,target_length=300):
    """Greedily merge consecutive paragraphs into bunches near target_length words.

    Walking left to right, the next paragraph is appended to the current bunch
    (joined by a single space) whenever doing so leaves the bunch at least as
    close to target_length as it was before; otherwise the current bunch is
    closed and the paragraph starts a new one. The result is a greedy
    approximation, not a global optimum.

    Args:
        paragraphs: iterable of paragraph strings. It is not modified.
        target_length: desired bunch length in words, as counted by words().

    Returns:
        A list of bunched strings; an empty list when paragraphs is empty.

    Raises:
        TypeError: if paragraphs is a single string (e.g. a 'NOCONTENT'
            sentinel) rather than a collection of strings.
    """
    if isinstance(paragraphs, str):
        # Iterating a bare string would silently bunch individual characters.
        raise TypeError("paragraphs must be a collection of strings, not a single string")

    bunches = []
    parts = []   # paragraphs making up the bunch currently being built
    length = 0   # word count of that bunch
    for paragraph in paragraphs:
        n_words = words(paragraph)
        if not parts:
            parts, length = [paragraph], n_words
        elif (length + n_words - target_length)**2 <= (length - target_length)**2:
            # Joining with one space adds exactly n_words tokens, so the
            # running total stays equal to words() of the joined text.
            parts.append(paragraph)
            length += n_words
        else:
            bunches.append(' '.join(parts))
            parts, length = [paragraph], n_words
    if parts:
        bunches.append(' '.join(parts))
    return bunches

def preprocess(item):
    """Turn one raw JSON record into a (label, paragraphs) pair.

    The record is unpacked into its URL and HTML; the label is derived from
    the URL and the paragraphs from the HTML.
    """
    url, html = unpack(item)
    return get_label(url), get_paragraphs(html)

In [3]:
# Class count -- count only entries that have easily identifiable content and label
from collections import Counter
from multiprocessing import Pool

# Running tallies updated by analyze_item_one.
classes, subclasses = Counter(), Counter()

# i counts every entry seen; j counts entries with both a label and content.
i = j = 0
word_count = paragraph_count = 0

def analyze_item_one(item):
    """Fold one preprocessed (label, paragraphs) pair into the global tallies.

    Every call increments i. Entries with label 'NOLABEL' or paragraphs
    'NOCONTENT' are otherwise ignored. For all other entries j, the
    paragraph and word counts and both class counters are updated, and a
    one-line progress report is printed that overwrites itself.
    """
    global i,j,word_count,paragraph_count,classes,subclasses
    i += 1
    label, paragraphs = item
    if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
        return

    j += 1
    paragraph_count += len(paragraphs)
    for text in paragraphs:
        word_count += len(text.split(' '))

    # The top-level class is the first path segment; the subclass is the full label.
    classes[label.split('/')[0]] += 1
    subclasses[label] += 1

    top_classes = str(classes.most_common(10))
    print("[%i/%i] Word Count: %i Paragraph Count: %i Top-10 classes: %s" % (j,i,word_count,paragraph_count,top_classes),end='\r')

def analyze_item_many(data):
    """Feed every element of data to analyze_item_one, consuming the iterable."""
    for entry in data:
        analyze_item_one(entry)

data = map(preprocess,data_from_many(datafiles))
analyze_item_many(data)

# Write the totals, then one tab-separated "name<TAB>count" line per class and subclass.
with open('../summaries/nytimes.txt','w') as file:
    file.write('NYTIMES\n')
    file.write(50*"#")
    file.write('\nWORD COUNT: %i\n' % word_count)
    file.write('PARAGRAPH COUNT: %i\n' % paragraph_count)
    for heading, counter in (('CLASSES: \n', classes), ('SUBCLASSES: \n', subclasses)):
        file.write(heading)
        for item in counter.items():
            file.write('%s\t%i\n' % item)
        file.write('\n\n')

[96231/145491] Word Count: 81108411 Paragraph Count: 1844644 Top-10 classes: [('world', 18555), ('sports', 14377), ('us', 13773), ('business', 8776), ('nyregion', 5306), ('books', 4786), ('arts', 4671), ('opinion', 4558), ('technology', 2374), ('science', 2288)]

In [16]:
# Histogram of words per paragraph over all entries that have a label and content
word_counts = Counter()

data = map(preprocess,data_from_many(datafiles))
i = 0
for label, paragraphs in data:
    if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
        continue
    lengths = [len(paragraph.split(' ')) for paragraph in paragraphs]
    word_counts.update(lengths)
    i += len(lengths)
    # Progress line: paragraphs seen so far and the ten most common lengths.
    print("[%i] %s" % (i,word_counts.most_common(10)),end='\r')

[1844644] [(39, 32233), (40, 32204), (36, 32087), (38, 32016), (41, 31801), (37, 31579), (42, 31510), (43, 31425), (35, 31414), (34, 31265)]

In [17]:
# Dump the paragraph-length histogram as tab-separated "length<TAB>count" lines
with open('../summaries/nytimes_paragraph_stats.txt','w') as file:
    file.writelines('%s\t%i\n' % item for item in word_counts.items())

In [18]:
# Histogram of words per bunch after merging paragraphs towards 300 words each
bunched_word_counts = Counter()

data = map(preprocess,data_from_many(datafiles))
i = 0
for label, paragraphs in data:
    if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
        continue
    paragraphs = bunch_paragraphs(paragraphs,target_length = 300)
    lengths = [len(paragraph.split(' ')) for paragraph in paragraphs]
    bunched_word_counts.update(lengths)
    i += len(lengths)
    # Progress line: bunches seen so far and the fifty most common lengths.
    print("[%i] %s" % (i,bunched_word_counts.most_common(50)),end='\r')

[322494] [(300, 5209), (301, 5126), (303, 5082), (297, 5039), (299, 5007), (302, 4933), (293, 4891), (305, 4857), (294, 4849), (298, 4849), (304, 4831), (296, 4820), (306, 4796), (295, 4724), (307, 4698), (308, 4613), (292, 4598), (310, 4424), (291, 4413), (290, 4400), (309, 4367), (289, 4261), (311, 4237), (288, 4185), (312, 4159), (287, 4009), (286, 3912), (313, 3860), (314, 3779), (285, 3764), (15, 3749), (315, 3693), (14, 3661), (284, 3617), (316, 3606), (283, 3417), (317, 3368), (282, 3251), (318, 3144), (13, 3051), (281, 3010), (319, 2988), (16, 2918), (280, 2885), (279, 2736), (320, 2663), (321, 2616), (278, 2490), (322, 2415), (277, 2371)]