In [69]:
import sys
import os
from time import time

# Directory holding the raw NY Daily News dump files.
# The original machine-specific location stays as the default so existing runs
# are unaffected; set the NYDAILYNEWS_DATAPATH environment variable to point the
# notebook at a different copy of the data without editing this cell.
DATAPATH = os.environ.get(
    'NYDAILYNEWS_DATAPATH',
    '/Users/balthasar/Google Drive/MTL_data/nydailynews/',
)
# File paths are later built by plain string concatenation (DATAPATH + name),
# so the directory must end with a separator.
if not DATAPATH.endswith('/'):
    DATAPATH += '/'

sys.path.append(DATAPATH)
# Make the repository root importable so that `src.util` resolves.
sys.path.append('../')

from src.util import count_lines_many, data_from_many

# Full path of every entry found directly inside the data directory
# (DATAPATH already ends with a separator, so plain concatenation is enough).
datafiles = list(map(lambda name: DATAPATH + name, os.listdir(DATAPATH)))

# Report how many lines the listed files contain in total.
total_entries = count_lines_many(datafiles)
print("Number of entries: %i" % total_entries)

Number of entries: 155328


In [201]:
import json 
import re
from bs4 import BeautifulSoup

# NY Daily News specific
def get_paragraphs(html):
    """Extract the text fragments of a NY Daily News page.

    Fragments are collected in this order:
      1. every ``<title>`` tag,
      2. every ``<p>`` inside the first ``<article>`` tag (if the page has one),
      3. every ``<span itemprop="caption">``,
      4. every ``<p class="g-article-html">``.

    No de-duplication is performed, so a tag matched by more than one of the
    searches contributes its text more than once.

    Args:
        html: markup of the page, as accepted by ``BeautifulSoup``.

    Returns:
        A list of strings (each stripped of leading/trailing ``\\r``, ``\\n``
        and ``\\t`` characters) when more than one fragment was found,
        otherwise the sentinel string ``'NOCONTENT'``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    tags = list(soup.find_all("title"))
    # Search for <article> once; only the first one contributes body paragraphs.
    articles = soup.find_all("article")
    if articles:
        tags.extend(articles[0].find_all('p'))
    tags.extend(soup.find_all("span", itemprop="caption"))
    tags.extend(soup.find_all("p", "g-article-html"))

    paragraphs = [tag.get_text().strip('\r\n\t') for tag in tags]

    # A page yielding at most one fragment is treated as having no usable content.
    if len(paragraphs) > 1:
        return paragraphs
    return 'NOCONTENT'
    
# Matches everything that follows the site root. The dots of the host name are
# escaped so that they only match literal dots (an unescaped '.' would accept
# any character, e.g. 'wwwXnydailynews.com').
label_re = re.compile(r'(?<=http://www\.nydailynews\.com/).*')
def get_label(url):
    """Derive a category label from a NY Daily News URL.

    The label is the URL path after ``http://www.nydailynews.com/`` with its
    last ``/``-separated segment removed, e.g.
    ``http://www.nydailynews.com/news/politics/story.html`` -> ``news/politics``.
    A URL with a single path segment (or none) yields the empty string.

    Args:
        url: the URL string to inspect.

    Returns:
        The label string, or ``"NOLABEL"`` when the URL does not contain the
        expected ``http://www.nydailynews.com/`` prefix.
    """
    label = label_re.search(url)
    if label:
        # Drop the final path segment and keep the leading directories.
        return '/'.join(label.group().split('/')[:-1])
    else:
        return "NOLABEL"
    

# General   
def unpack(item):
    """Decode a JSON object string and return its values as a tuple.

    The values appear in the order of the decoded dictionary; the keys are
    discarded.
    NOTE(review): callers destructure the result positionally (url, html), so
    this presumably relies on the key order of the stored records -- confirm.
    """
    record = json.loads(item)
    values = record.values()
    return tuple(values)

def words(x):
    """Return the number of pieces obtained by splitting ``x`` on single spaces.

    Only the space character separates pieces, and consecutive spaces produce
    empty pieces that are counted too; the empty string therefore counts as 1.
    """
    pieces = x.split(' ')
    return len(pieces)

def bunch_paragraphs(paragraphs,target_length=300):
    """Greedily merge consecutive paragraphs into bunches close to a target length.

    Paragraphs are processed in order. The next paragraph is appended to the
    current bunch (joined by a single space) whenever doing so leaves the
    bunch's word count at least as close to ``target_length`` as it was before;
    otherwise the current bunch is closed and the next paragraph starts a new
    one. The result is a greedy approximation, not a global optimum.

    Args:
        paragraphs: iterable of paragraph strings. It is not modified.
        target_length: desired number of words (as counted by ``words``) per bunch.

    Returns:
        A list of bunched paragraph strings; an empty list for empty input.
    """
    # Work on a private copy so the caller's list is left untouched.
    items = list(paragraphs)
    if not items:
        return []

    res = []
    current = items[0]
    len_current = words(current)
    for next_ in items[1:]:
        len_next_ = words(next_)
        if (len_current+len_next_-target_length)**2 <= (len_current-target_length)**2:
            current += ' '+next_
            # Joining with one space adds exactly one separator, so the
            # space-split count of the merged text is the sum of both counts.
            len_current += len_next_
        else:
            res.append(current)
            current = next_
            len_current = len_next_
    res.append(current)
    return res

def preprocess(item):
    """Turn one raw JSON record into a ``(label, paragraphs)`` pair.

    The record is decoded with ``unpack`` into a URL and an HTML document; the
    label is derived from the URL by ``get_label`` and the paragraphs from the
    HTML by ``get_paragraphs``. Either element may be the respective sentinel
    (``'NOLABEL'`` / ``'NOCONTENT'``) returned by those helpers.
    """
    url, html = unpack(item)
    return get_label(url), get_paragraphs(html)

In [None]:
# Class count -- count only entries that have easily identifiable content and label
from collections import Counter
from multiprocessing import Pool

# Tallies of accepted entries per top-level class and per full label.
classes, subclasses = Counter(), Counter()

# i: every entry seen so far; j: entries that had both a label and content.
i = j = 0
# Running totals over the accepted entries.
word_count = paragraph_count = 0

def analyze_item_one(item):
    """Fold one preprocessed ``(label, paragraphs)`` pair into the global tallies.

    Every call increments the global ``i``. Entries whose label is ``'NOLABEL'``
    or whose paragraphs are ``'NOCONTENT'`` are otherwise ignored. For accepted
    entries ``j``, ``paragraph_count``, ``word_count``, ``classes`` (first label
    segment) and ``subclasses`` (full label) are updated and a progress line is
    printed, overwriting the previous one via a carriage return.
    """
    global i,j,word_count,paragraph_count,classes,subclasses
    i += 1
    label, paragraphs = item
    if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
        return

    j += 1
    paragraph_count += len(paragraphs)
    # Same space-split word counting that is used for the paragraph statistics.
    word_count += sum(len(text.split(' ')) for text in paragraphs)

    top_level = label.split('/')[0]
    classes[top_level] += 1
    subclasses[label] += 1

    progress = (j, i, word_count, paragraph_count, str(classes.most_common(10)))
    print("[%i/%i] Word Count: %i Paragraph Count: %i Top-10 classes: %s" % progress, end='\r')

def analyze_item_many(data):
    """Feed every element of ``data`` to ``analyze_item_one``, in order.

    ``data`` may be any iterable (including a lazy one); it is consumed fully.
    Nothing is returned -- the results live in the global tallies.
    """
    for item in data:
        analyze_item_one(item)

# Lazily preprocess every raw entry and fold it into the global tallies.
data = map(preprocess,data_from_many(datafiles))
analyze_item_many(data)

# Persist the summary for this dataset. The file is named after the dataset
# being analysed (NY Daily News); the previous name 'nytimes.txt' did not match
# the header written below and would overwrite a summary of that name.
with open('../summaries/nydailynews.txt','w') as file:
    file.write('NYDAILYNEWS\n')
    file.write(50*"#")
    file.write('\nWORD COUNT: %i\n' % word_count)
    file.write('PARAGRAPH COUNT: %i\n' % paragraph_count)
    # One "<name>\t<count>" line per top-level class ...
    file.write('CLASSES: \n')
    for item in classes.items():
        file.write('%s\t%i\n' % item)
    file.write('\n\n')
    # ... and per full label.
    file.write('SUBCLASSES: \n')
    for item in subclasses.items():
        file.write('%s\t%i\n' % item)
    file.write('\n\n')

[1228/1598] Word Count: 892737 Paragraph Count: 31125 Top-10 classes: [('amp', 523), ('sports', 235), ('news', 193), ('new-york', 91), ('entertainment', 80), ('autos', 59), ('opinion', 16), ('blogs', 12), ('life-style', 10), ('', 5)]

In [None]:
# get paragraph stats
# Histogram: words per paragraph -> number of paragraphs with that many words.
word_counts = Counter()

data = map(preprocess,data_from_many(datafiles))
i = 0  # paragraphs tallied so far
for label, paragraphs in data:
    # Entries without a usable label or content do not contribute.
    if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
        continue
    for paragraph in paragraphs:
        word_counts[len(paragraph.split(' '))] += 1
        i += 1
    # Progress line, rewritten in place after each accepted entry.
    print("[%i] %s" % (i,word_counts.most_common(10)),end='\r')

In [None]:
# Dump the paragraph-length histogram as "<word count>\t<frequency>" lines.
with open('../summaries/nydailynews_paragraph_stats.txt','w') as file:
    for length, frequency in word_counts.items():
        file.write('%s\t%i\n' % (length, frequency))

In [None]:
# get word-count stats for paragraphs after bunching them towards 300 words
# Histogram: words per bunch -> number of bunches with that many words.
bunched_word_counts = Counter()

data = map(preprocess,data_from_many(datafiles))
i = 0  # bunches tallied so far
for label, paragraphs in data:
    # Entries without a usable label or content do not contribute.
    if label == 'NOLABEL' or paragraphs == 'NOCONTENT':
        continue
    paragraphs = bunch_paragraphs(paragraphs,target_length = 300)
    for paragraph in paragraphs:
        bunched_word_counts[len(paragraph.split(' '))] += 1
        i += 1
    # Progress line, rewritten in place after each accepted entry.
    print("[%i] %s" % (i,bunched_word_counts.most_common(50)),end='\r')