In [7]:
import sys
import os
from time import time

DATAPATH = '/Users/balthasar/Google Drive/MTL_data/nydailynews/'
sys.path.append(DATAPATH)
sys.path.append('../')

from src.util import count_lines_many, data_from_many

datafiles = [DATAPATH + file for file in os.listdir(DATAPATH)]
print("Number of entries: %i" % (count_lines_many(datafiles)))

Number of entries: 155328


In [2]:
import json 
import re
from bs4 import BeautifulSoup

# NY Daily News specific
def get_paragraphs(html):
    soup = BeautifulSoup(html,'html.parser')
    if soup.findAll("article"):
        paragraphs = [tag.get_text().strip('\r\n\t') for tag in 
                      soup.findAll("title")+
                      soup.findAll("article")[0].findAll('p')+
                      soup.findAll("span",itemprop="caption")+
                      soup.findAll("p","g-article-html")]
    else:
        paragraphs = [tag.get_text().strip('\r\n\t') for tag in 
              soup.findAll("title")+
              soup.findAll("span",itemprop="caption")+
              soup.findAll("p","g-article-html")]
        
    if len(paragraphs) > 1:
        return paragraphs
    else:
        return 'NOCONTENT'
    
label_re = re.compile('(?<=http://www.nydailynews.com/).*')
def get_label(url):
    label = label_re.search(url)
    if label:
        return '/'.join(label.group().split('/')[:-1])
    else:
        return "NOLABEL"
    

# General   
def unpack(item):
    return tuple(json.loads(item).values())

def words(x):
    return len(x.split(' '))

def bunch_paragraphs(paragraphs,target_length=300):
    """bunches paragraphs into lenghts as close as possible to the target length. not global optimum."""
    res = []
    current = paragraphs.pop(0)
    while len(paragraphs) > 0:
        next_ = paragraphs.pop(0)
        
        len_current = words(current)
        len_next_ = words(next_)
        if (len_current+len_next_-target_length)**2 <= (len_current-target_length)**2:
            current += ' '+next_
        else:
            res += [current]
            current = next_
    res += [current]
    return res

def preprocess(item):
    url, html = unpack(item)
    label = get_label(url)
    paragraphs = get_paragraphs(html)
    return label, paragraphs

In [3]:
# Class count -- count only entries that have easily identifiable content and label
from collections import Counter
from multiprocessing import Pool

classes = Counter()
subclasses = Counter()

i,j = (0,0)
word_count = 0
paragraph_count = 0

def analyze_item_one(item):
    global i,j,word_count,paragraph_count,classes,subclasses
    i+=1
    label,paragraphs = item
    if label != 'NOLABEL' and paragraphs != 'NOCONTENT':
        j+=1
        class_ = label.split('/')[0]
        subclass_ = label
        
        paragraph_count += len(paragraphs)
        for paragraph in paragraphs:
            word_count += len(paragraph.split(' '))

        classes.update([class_])
        subclasses.update([subclass_])
        
        print("[%i/%i] Word Count: %i Paragraph Count: %i Top-10 classes: %s" % (j,i,word_count,paragraph_count,str(classes.most_common(10))),end='\r')

def analyze_item_many(data):
    list(map(analyze_item_one,data))

data = map(preprocess,data_from_many(datafiles))
analyze_item_many(data)        
with open('../summaries/nydailynews.txt','w') as file:
    file.write('NYDAILYNEWS\n')
    file.write(50*"#")
    file.write('\nWORD COUNT: %i\n' % word_count)
    file.write('PARAGRAPH COUNT: %i\n' % paragraph_count)
    file.write('CLASSES: \n')
    for item in list(classes.items()):
        file.write('%s\t%i\n' % item)
    file.write('\n\n')
    file.write('SUBCLASSES: \n')
    for item in list(subclasses.items()):
        file.write('%s\t%i\n' % item)
    file.write('\n\n')

[116794/155327] Word Count: 79815451 Paragraph Count: 2811957 Top-10 classes: [('amp', 49817), ('news', 19426), ('sports', 17924), ('entertainment', 9659), ('new-york', 8073), ('autos', 8038), ('opinion', 1592), ('life-style', 1006), ('blogs', 466), ('', 231)] 162)]

In [4]:
# get paragraph stats
word_counts = Counter()

data = map(preprocess,data_from_many(datafiles))
i = 0
for label, paragraphs in data:
    if label != 'NOLABEL' and paragraphs != 'NOCONTENT':
        for paragraph in paragraphs:
            word_counts.update([len(paragraph.split(' '))])
            i += 1
        print("[%i] %s" % (i,word_counts.most_common(10)),end='\r')

[2811957] [(10, 101672), (12, 87387), (11, 86087), (13, 84185), (9, 79281), (14, 78094), (15, 68413), (26, 66146), (24, 65931), (25, 65838)]

In [5]:
with open('../summaries/nydailynews_paragraph_stats.txt','w') as file:
    for item in list(word_counts.items()):
        file.write('%s\t%i\n' % item)

In [6]:
# get paragraph stats
bunched_word_counts = Counter()

data = map(preprocess,data_from_many(datafiles))
i = 0
for label, paragraphs in data:
    if label != 'NOLABEL' and paragraphs != 'NOCONTENT':
        paragraphs = bunch_paragraphs(paragraphs,target_length = 300)
        for paragraph in paragraphs:
            bunched_word_counts.update([len(paragraph.split(' '))])
            i += 1
        print("[%i] %s" % (i,bunched_word_counts.most_common(50)),end='\r')

[320844] [(300, 7408), (299, 7264), (298, 7122), (301, 7109), (302, 6902), (297, 6895), (303, 6879), (304, 6700), (296, 6631), (305, 6263), (295, 6257), (306, 5894), (294, 5798), (307, 5486), (293, 5407), (308, 5255), (292, 5222), (309, 4919), (291, 4853), (290, 4716), (310, 4599), (289, 4387), (311, 4243), (288, 4096), (312, 3983), (287, 3690), (313, 3481), (286, 3270), (314, 3224), (285, 3041), (315, 2929), (284, 2750), (316, 2624), (283, 2519), (317, 2337), (282, 2243), (281, 2157), (318, 2038), (280, 1927), (279, 1810), (319, 1809), (320, 1662), (278, 1586), (321, 1577), (277, 1566), (276, 1436), (322, 1412), (275, 1322), (323, 1269), (274, 1194)]

In [9]:
# get a datasample for Friederike
import json

data = map(preprocess,data_from_many(datafiles))
i = 0
with open('nydailynews_datasample.csv','w') as file:
    for label, paragraphs in data:
        if label != 'NOLABEL' and paragraphs != 'NOCONTENT':
            text_chunks = bunch_paragraphs(paragraphs,target_length=250)
            labels = [label for i in range(len(text_chunks))]
            items = zip(labels,text_chunks)
            for item in items:
                if len(item[1].split(' ')) >= 100:
                    print(i,end='\r')
                    i+=1
                    file.write(json.dumps(item)+'\n')
                    if i == 3000:
                        raise RateLimit

2999

NameError: name 'RateLimit' is not defined